This change uses CMake to determine whether we are compiling against a version of ROCm that supports gcnArchName, and handles architecture checking accordingly. It adds a few helper functions as drop-in replacements for the places where we previously used gcnArch: sometimes to enable flags, and sometimes to set frequencies.
This commit is contained in:
Audrey MP
2023-09-12 15:34:40 -04:00
committed by GitHub
szülő e1dc4d5e42
commit e58ec78d35
16 fájl változott, egészen pontosan 193 új sor hozzáadva és 57 régi sor törölve
+16
Fájl megtekintése
@@ -122,6 +122,17 @@ message(STATUS "hipcc version: ${hipcc_version_string}")
### Check for hipEventDisableSystemFence support
check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
### Check for hipDeviceMallocUncached support
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
message(STATUS "HIP Library version: ${hip_VERSION_MINOR}")
### Check the version of HIP to see if we can use gcnArchName instead of gcnArch (deprecated)
# HIP_NO_GCNARCHNAME is ON when the detected hipcc version is older than 5.7.31921, i.e. the
# string-valued gcnArchName device property is treated as unavailable and the sources must fall
# back to the integer gcnArch property. It is OFF for 5.7.31921 and newer.
if(${hipcc_version_string} VERSION_LESS "5.7.31921")
set(HIP_NO_GCNARCHNAME ON)
else()
set(HIP_NO_GCNARCHNAME OFF)
endif()
### Check for indirect function call support
if(ENABLE_IFC)
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -300,6 +311,7 @@ set(SRC_FILES
src/graph/xml.cc
src/graph/xml.h
src/group.cc
src/include/archinfo.h
src/include/align.h
src/include/alloc.h
src/include/argcheck.h
@@ -380,6 +392,7 @@ set(SRC_FILES
src/include/utils.h
src/init.cc
# src/init_nvtx.cc
src/misc/archinfo.cc
src/misc/argcheck.cc
# src/misc/cudawrap.cc
# src/misc/gdrwrap.cc
@@ -547,6 +560,9 @@ else()
target_compile_options(rccl PRIVATE --hipcc-func-supp)
endif()
endif()
# Expose the HIP version check result to the C++ sources as a preprocessor definition so that
# code guarded by #ifdef HIP_NO_GCNARCHNAME uses the deprecated integer gcnArch property.
if(HIP_NO_GCNARCHNAME)
target_compile_definitions(rccl PRIVATE HIP_NO_GCNARCHNAME)
endif()
if (BUILD_BFD)
if (HAVE_BFD)
target_compile_definitions(rccl PRIVATE HAVE_BFD)
+13 -18
Fájl megtekintése
@@ -61,7 +61,7 @@ CliqueManager::CliqueManager(int const rank,
m_opIndexHead(0),
m_opIndexTail(0),
m_init(false),
m_gcnArch(0),
m_gcnArchName(char[256]),
m_allReduceByteLimit(0),
m_pinnedCliquePtrs(NULL),
m_gpuBarrierGlobalCount(NULL),
@@ -243,13 +243,13 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
CUDACHECK(hipGetDevice(&deviceId));
hipDeviceProp_t devProp;
CUDACHECK(hipGetDeviceProperties(&devProp, deviceId));
m_gcnArch = devProp.gcnArch;
m_gcnArchName = devProp.gcnArchName;
// Establish when to use clique-based kernels based on input size
SetByteLimits();
m_init = true;
INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArch);
INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArchName);
return ncclSuccess;
dropback:
@@ -266,12 +266,12 @@ void CliqueManager::SetByteLimits()
m_allReduceByteLimit = rcclParamAllReduceCliqueByteLimit();
if (m_allReduceByteLimit == 0)
{
switch (m_gcnArch)
{
case 906: m_allReduceByteLimit = 16777216; break;
case 908: m_allReduceByteLimit = 8388608; break;
default: m_allReduceByteLimit = 16777216; break;
}
if (IsArchMatch(m_gcnArchName, "gfx906"))
m_allReduceByteLimit = 16777216;
else if (IsArchMatch(m_gcnArchName, "gfx908"))
m_allReduceByteLimit = 8388608;
else
m_allReduceByteLimit = 16777216;
}
}
@@ -368,23 +368,18 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
{
// NOTE: These are currently based on collected data and not necessarily ideal for all hardware
int numChannels;
switch (m_gcnArch)
{
case 906:
if (IsArchMatch(m_gcnArchName, "gfx906")) {
if (totalBytes <= 16384) numChannels = 1;
else numChannels = 2;
break;
case 908:
} else if (IsArchMatch(m_gcnArchName, "gfx908")) {
if (totalBytes <= 131072) numChannels = 2;
else if (totalBytes <= 524288) numChannels = 6;
else if (totalBytes <= 1048576) numChannels = 13;
else numChannels = 16;
break;
case 910:
} else if (IsArchMatch(m_gcnArchName, "gfx90a")) {
if (totalBytes <= 262144) numChannels = 4;
else numChannels = 8;
break;
default:
} else {
if (totalBytes <= 65536) numChannels = 1;
else if (totalBytes <= 262144) numChannels = 2;
else if (totalBytes <= 524288) numChannels = 4;
+1 -1
Fájl megtekintése
@@ -95,7 +95,7 @@ protected:
int32_t m_opIndexHead; // Track start of outstanding requests
int32_t m_opIndexTail; // Track end of outstanding requests
bool m_init; // Whether CliqueManager has been initialized
int m_gcnArch; // Device GCN arch value
char[256] m_gcnArchName; // Device GCN arch value
size_t m_allReduceByteLimit; // Byte limit for AllReduce
cliqueDevicePtrs_t* m_pinnedCliquePtrs; // Pinned-host-memory (device accessible) containing device pointers
int* m_gpuBarrierGlobalCount; // Part of GPU barrier (count variable shared across ranks)
+8 -1
Fájl megtekintése
@@ -369,7 +369,14 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s
ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
NCCLCHECK(xmlGetAttrInt(xmlGpu, "gcn", &gpu->gpu.gcn));
NCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gpu->gpu.gcn));
if (strcmp(gpu->gpu.gcn, "906") == 0) {
gpu->gpu.gcn = "gfx906";
} else if (strcmp(gpu->gpu.gcn, "908") == 0) {
gpu->gpu.gcn = "gfx908";
} else if (strcmp(gpu->gpu.gcn, "910") == 0) {
gpu->gpu.gcn = "gfx90a";
}
rcclHipDeviceArch_t arch;
NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));
memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t));
+10 -12
Fájl megtekintése
@@ -10,6 +10,8 @@
#include "graph.h"
#include "core.h"
#include "archinfo.h"
#include <string.h>
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
@@ -123,7 +125,7 @@ struct ncclTopoNode {
int rank;
int cudaCompCap;
int gdrSupport;
int gcn;
const char* gcn;
hipDeviceArch_t arch;
}gpu;
struct {
@@ -224,17 +226,13 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
}
// Returns XGMI speed in GB/s
static float ncclTopoXGMISpeed(int gcn) {
switch (gcn) {
case 910:
return MI200_XGMI_WIDTH;
case 940:
case 941:
case 942:
return GFX94X_XGMI_WIDTH;
default:
return VEGA_XGMI_WIDTH;
}
static float ncclTopoXGMISpeed(const char* gcn) {
  // Select the XGMI width constant by architecture-name prefix; anything that is neither
  // gfx90a nor a gfx94x part falls through to the Vega value.
  if (IsArchMatch(gcn, "gfx90a")) {
    return MI200_XGMI_WIDTH;
  }
  if (IsArchMatch(gcn, "gfx94")) {
    return GFX94X_XGMI_WIDTH;
  }
  return VEGA_XGMI_WIDTH;
}
#if ENABLE_COLLTRACE
+1 -1
Fájl megtekintése
@@ -455,7 +455,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
#if defined(ENABLE_LL128)
// Enable LL128 by default only on gfx90a with available tuning table
pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
(comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->topo->ll128Enabled) ? 1 : 0;
(IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled) ? 1 : 0;
#else
pEnable = 0;
#endif
+1
Fájl megtekintése
@@ -14,6 +14,7 @@
#include "nvmlwrap.h"
#include "xml.h"
#include "rocm_smi_wrap.h"
#include "archinfo.h"
/*******************/
/* XML File Parser */
+1
Fájl megtekintése
@@ -12,6 +12,7 @@
#include "debug.h"
#include "checks.h"
#include <stdlib.h>
#include "archinfo.h"
// A few constraints to make the implementation easy
#define MAX_STR_LEN 255
+39
Fájl megtekintése
@@ -0,0 +1,39 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ARCHINFO_H_
#define ARCHINFO_H_
#include <string.h>
/*
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
*/
// Meant to reduce a full device gcnArchName string such as "gfx900:xnack+:blah-:etc-" to its
// leading token (the text before the first ':'), delivered through out.
void GcnArchNameFormat(char *gcnArchName, char* out);
// Translates a legacy integer gcnArch value into the corresponding architecture name written to
// out, e.g. 906 -> "gfx906", 908 -> "gfx908", 910 -> "gfx90a".
void GcnArchConvertToGcnArchName(int gcnArch, char* out);
// Queries the HIP device properties of deviceId and writes the architecture name to out.
// Returns 1 when the name was derived from the integer gcnArch (HIP_NO_GCNARCHNAME builds) and
// 0 when it was taken from the gcnArchName property.
int GetGcnArchName(int deviceId, char* out);
// Returns the wall clock rate in kHz for deviceId: 1.0E5 when the architecture name starts with
// "gfx94", 2.5E4 otherwise.
double GetDeviceWallClockRateInKhz(int deviceId);
// Returns true when arch begins with target (comparison covers strlen(target) characters), so
// "gfx94" matches "gfx940", "gfx941", "gfx942", ...
bool IsArchMatch(char const* arch, char const* target);
#endif // ARCHINFO_H_
+6 -10
Fájl megtekintése
@@ -33,6 +33,7 @@
#include <unistd.h>
#include "graph/topo.h"
#include "graph/xml.h"
#include "archinfo.h"
// [RCCL]
#include "git_version.h"
@@ -174,15 +175,10 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
void *ncclCommThreadMain(void *arg) {
ncclComm_t comm = (ncclComm_t)arg;
int head[MAXCHANNELS];
hipDeviceProp_t devProp;
double vega_gpu_rtc_freq;
memset(head, 0, sizeof(int)*MAXCHANNELS);
hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev);
if (devProp.gcnArch/10 == 94 && status == hipSuccess)
vega_gpu_rtc_freq = 1.0E8;
else
vega_gpu_rtc_freq = 2.5E7;
vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
#define MAX_NAME_LENGTH 64
char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
@@ -1230,17 +1226,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
allGather3Data[rank].nc = 2;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
(comm->topo->type & RCCL_TOPO_CR8G))
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
if (ringGraph.nChannels > MAXCHANNELS/2)
allGather3Data[rank].nc = 1;
+86
Fájl megtekintése
@@ -0,0 +1,86 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "archinfo.h"
#include <stdio.h>
#include <string.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
void GcnArchNameFormat(char* gcnArchName, char* out) {
  // Extracts the base architecture name from the char array found in the device properties.
  // The gcnArchName attribute looks something like "gfx900:xnack+:blah-:etc-"; only the text
  // before the first ':' (here "gfx900") is written to out, NUL-terminated.
  //
  // gcnArchName: full, NUL-terminated architecture string (not modified).
  // out:         destination buffer; must have room for the base name plus the terminator.
  if (out == NULL) return;
  if (gcnArchName == NULL) {
    out[0] = '\0';
    return;
  }
  // strcspn returns the whole length when there is no ':' at all, so a bare "gfx906" is copied
  // unchanged and an empty input yields an empty output. Unlike strtok this does not write into
  // the input or depend on hidden static state.
  size_t const baseLen = strcspn(gcnArchName, ":");
  memcpy(out, gcnArchName, baseLen);
  out[baseLen] = '\0';
}
void GcnArchConvertToGcnArchName(int gcnArch, char* gcnArchName) {
  // gcnArch is deprecated and we should instead use gcnArchName; however, some data files and
  // older HIP versions still provide the integer gcnArch value. This converts that integer into
  // the equivalent architecture name.
  //
  // gcnArch:     legacy integer architecture value (e.g. 906, 908, 910, 942).
  // gcnArchName: destination buffer, always left NUL-terminated. Callers are expected to pass a
  //              256-byte buffer (matching the size of the gcnArchName device property); at most
  //              kMaxNameLen bytes are written.
  if (gcnArchName == NULL) return;
  size_t const kMaxNameLen = 16;  // "gfx" + any decimal int + NUL fits comfortably
  switch (gcnArch) {
    case 910:
      // this is actually gfx90a: the name contains a non-decimal digit, so it cannot be
      // produced by the numeric formatting used for every other value.
      snprintf(gcnArchName, kMaxNameLen, "gfx90a");
      break;
    default:
      // 906 -> "gfx906", 908 -> "gfx908", 942 -> "gfx942", ... Formatting the number keeps
      // prefix checks such as "gfx94" working instead of leaving the buffer uninitialised for
      // values that are not listed explicitly.
      snprintf(gcnArchName, kMaxNameLen, "gfx%d", gcnArch);
      break;
  }
}
int GetGcnArchName(int deviceId, char* out) {
// this is a generic call in to get a consistent gcnArchName regardless of which version of rocm we're using.
// or which version of rocm we're using.
hipDeviceProp_t devProp;
hipError_t status = hipGetDeviceProperties(&devProp, deviceId);
if (status != hipSuccess) {
//std::cerr << "Encountered HIP error getting device properties: "
// << hipGetErrorString(status) << "\n";
exit(-1);
}
#ifdef HIP_NO_GCNARCHNAME
// we're using a HIP version before 3.7.
GcnArchConvertToGcnArchName(devProp.gcnArch, out);
return 1;
#else
GcnArchNameFormat(devProp.gcnArchName, out);
return 0;
#endif
}
double GetDeviceWallClockRateInKhz(int deviceId) {
  // Returns the wall clock rate in kHz for the given device: 1.0E5 for gfx94x parts and 2.5E4
  // for everything else (including the case where no architecture name could be obtained).
  //
  // The name must be written into real storage; 256 bytes matches the size of the gcnArchName
  // device property. Zero-initialising keeps the comparison well defined even if the lookup
  // leaves the buffer untouched.
  char gcn[256] = {0};
  GetGcnArchName(deviceId, gcn);
  if (strncmp("gfx94", gcn, 5) == 0)
    return 1.0E5;
  return 2.5E4;
}
bool IsArchMatch(char const* arch, char const* target) {
  // Helper function to reduce clutter in code elsewhere. Returns true when arch starts with
  // target, so "gfx94" matches "gfx940"/"gfx941"/"gfx942" and "gfx90a" matches
  // "gfx90a:sramecc+:xnack-".
  //
  // A missing name (NULL) never matches; this avoids dereferencing a null pointer when the
  // architecture string was not available.
  if (arch == NULL || target == NULL) return false;
  return strncmp(arch, target, strlen(target)) == 0;
}
+3 -7
Fájl megtekintése
@@ -9,6 +9,7 @@
#include "alloc.h"
#include "npkit/npkit.h"
#include "archinfo.h"
uint64_t NpKit::rank_ = 0;
@@ -120,13 +121,8 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
dump_file_path = dump_dir;
dump_file_path += "/gpu_clock_rate_rank_";
dump_file_path += std::to_string(rank_);
hipDeviceProp_t devProp;
int vega_gpu_rtc_freq_in_khz;
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
if (devProp.gcnArch/10 == 94)
vega_gpu_rtc_freq_in_khz = 100000;
else
vega_gpu_rtc_freq_in_khz = 25000;
// get the rtc frequency directly from HIP itself (via a wrapper)
double vega_gpu_rtc_freq_in_khz = GetDeviceWallClockRateInKhz(0);
std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz);
auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
+1 -1
Fájl megtekintése
@@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
send->conn.curr_hdp_reg = req.curr_hdp_reg;
}
+1 -1
Fájl megtekintése
@@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
return ncclInternalError;
}
if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
}
+2 -1
Fájl megtekintése
@@ -9,7 +9,7 @@ EXE = topo_expl
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc
all: hipify $(EXE)
@@ -21,6 +21,7 @@ hipify:
mkdir -p hipify_rccl
cp -a ../../src/include/ hipify_rccl/
cp -a ../../src/graph/ hipify_rccl/
cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/
hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
+4 -4
Fájl megtekintése
@@ -862,17 +862,17 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
allGather3Data[rank].nc = 2;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
(comm->topo->type & RCCL_TOPO_CR8G))
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
if (ringGraph.nChannels > MAXCHANNELS/2)
allGather3Data[rank].nc = 1;