From e58ec78d359860d1689a49f455df32c8be90a869 Mon Sep 17 00:00:00 2001 From: Audrey MP Date: Tue, 12 Sep 2023 15:34:40 -0400 Subject: [PATCH] Gcn arch name (#886) We use CMake to determine if we're compiling against a version of ROCm that supports gcnArchName and handles architecture checking appropriately. It includes a few helper functions as drop ins for the functionality we used gcnArch for before; sometimes to enable flags, and sometimes to set frequencies. --- CMakeLists.txt | 16 +++++++ src/clique/CliqueManager.cc | 31 ++++++------- src/clique/CliqueManager.h | 2 +- src/graph/topo.cc | 9 +++- src/graph/topo.h | 22 +++++----- src/graph/tuning.cc | 2 +- src/graph/xml.cc | 1 + src/graph/xml.h | 1 + src/include/archinfo.h | 39 +++++++++++++++++ src/init.cc | 16 +++---- src/misc/archinfo.cc | 86 +++++++++++++++++++++++++++++++++++++ src/misc/npkit.cc | 10 ++--- src/transport/net.cc | 2 +- src/transport/p2p.cc | 2 +- tools/topo_expl/Makefile | 3 +- tools/topo_expl/utils.cpp | 8 ++-- 16 files changed, 193 insertions(+), 57 deletions(-) create mode 100644 src/include/archinfo.h create mode 100644 src/misc/archinfo.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index c8bb9ffa16..90ace2446a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,6 +122,17 @@ message(STATUS "hipcc version: ${hipcc_version_string}") ### Check for hipEventDisableSystemFence support check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE) +### Check for hipDeviceMallocUncached support +check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY) + +message(STATUS "HIP Library version: ${hip_VERSION_MINOR}") +### Check the version of HIP to see if we can use gcnArchName instead of gcnArch (deprecated) +if(${hipcc_version_string} VERSION_LESS "5.7.31921") + set(HIP_NO_GCNARCHNAME ON) + else() + set(HIP_NO_GCNARCHNAME OFF) +endif() + ### Check for indirect function call support if(ENABLE_IFC) if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201") @@ -300,6 +311,7 @@ set(SRC_FILES src/graph/xml.cc src/graph/xml.h src/group.cc + src/include/archinfo.h src/include/align.h src/include/alloc.h src/include/argcheck.h @@ -380,6 +392,7 @@ set(SRC_FILES src/include/utils.h src/init.cc # src/init_nvtx.cc + src/misc/archinfo.cc src/misc/argcheck.cc # src/misc/cudawrap.cc # src/misc/gdrwrap.cc @@ -547,6 +560,9 @@ else() target_compile_options(rccl PRIVATE --hipcc-func-supp) endif() endif() +if(HIP_NO_GCNARCHNAME) + target_compile_definitions(rccl PRIVATE HIP_NO_GCNARCHNAME) +endif() if (BUILD_BFD) if (HAVE_BFD) target_compile_definitions(rccl PRIVATE HAVE_BFD) diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc index 722c7ac1e1..7abfc67fda 100644 --- a/src/clique/CliqueManager.cc +++ b/src/clique/CliqueManager.cc @@ -61,7 +61,7 @@ CliqueManager::CliqueManager(int const rank, m_opIndexHead(0), m_opIndexTail(0), m_init(false), - m_gcnArch(0), + m_gcnArchName(char[256]), m_allReduceByteLimit(0), m_pinnedCliquePtrs(NULL), m_gpuBarrierGlobalCount(NULL), @@ -243,13 +243,13 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix) CUDACHECK(hipGetDevice(&deviceId)); hipDeviceProp_t devProp; CUDACHECK(hipGetDeviceProperties(&devProp, deviceId)); - m_gcnArch = devProp.gcnArch; + m_gcnArchName = devProp.gcnArchName; // Establish when to use clique-based kernels based on input size SetByteLimits(); m_init = true; - INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArch); + INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArchName); return ncclSuccess; dropback: @@ -266,12 +266,12 @@ void CliqueManager::SetByteLimits() m_allReduceByteLimit = rcclParamAllReduceCliqueByteLimit(); if (m_allReduceByteLimit == 0) { - switch (m_gcnArch) - { - case 906: m_allReduceByteLimit = 16777216; break; - case 908: m_allReduceByteLimit = 8388608; break; - default: m_allReduceByteLimit = 16777216; break; - } + if (IsArchMatch(m_gcnArchName, "gfx906")) + m_allReduceByteLimit = 16777216; + else if (IsArchMatch(m_gcnArchName, "gfx908")) + m_allReduceByteLimit = 8388608; + else + m_allReduceByteLimit = 16777216; } } @@ -368,23 +368,18 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll, { // NOTE: These are currently based on collected data and not necessarily ideal for all hardware int numChannels; - switch (m_gcnArch) - { - case 906: + if (IsArchMatch(m_gcnArchName, "gfx906")) { if (totalBytes <= 16384) numChannels = 1; else numChannels = 2; - break; - case 908: + } else if (IsArchMatch(m_gcnArchName, "gfx908")) { if (totalBytes <= 131072) numChannels = 2; else if (totalBytes <= 524288) numChannels = 6; else if (totalBytes <= 1048576) numChannels = 13; else numChannels = 16; - break; - case 910: + } else if (IsArchMatch(m_gcnArchName, "gfx90a")) { if (totalBytes <= 262144) numChannels = 4; else numChannels = 8; - break; - default: + } else { if (totalBytes <= 65536) numChannels = 1; else if (totalBytes <= 262144) numChannels = 2; else if (totalBytes <= 524288) numChannels = 4; diff --git a/src/clique/CliqueManager.h b/src/clique/CliqueManager.h index caf2bdc8bd..33264759e7 100644 --- a/src/clique/CliqueManager.h +++ b/src/clique/CliqueManager.h @@ -95,7 +95,7 @@ protected: int32_t m_opIndexHead; // Track start of outstanding requests int32_t m_opIndexTail; // Track end of outstanding requests bool m_init; // Whether CliqueManager has been initialized - int m_gcnArch; // Device GCN arch value + char[256] m_gcnArchName; // Device GCN arch value size_t m_allReduceByteLimit; // Byte limit for AllReduce cliqueDevicePtrs_t* m_pinnedCliquePtrs; // Pinned-host-memory (device accessible) containing device pointers int* m_gpuBarrierGlobalCount; // Part of GPU barrier (count variable shared across ranks) diff --git a/src/graph/topo.cc b/src/graph/topo.cc index f246630fce..345b37b6ab 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -369,7 +369,14 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap)); - NCCLCHECK(xmlGetAttrInt(xmlGpu, "gcn", &gpu->gpu.gcn)); + NCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gpu->gpu.gcn)); + if (strcmp(gpu->gpu.gcn, "906") == 0) { + gpu->gpu.gcn = "gfx906"; + } else if (strcmp(gpu->gpu.gcn, "908") == 0) { + gpu->gpu.gcn = "gfx908"; + } else if (strcmp(gpu->gpu.gcn, "910") == 0) { + gpu->gpu.gcn = "gfx90a"; + } rcclHipDeviceArch_t arch; NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value)); memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t)); diff --git a/src/graph/topo.h b/src/graph/topo.h index 1845b5f3e5..5cd973e9c8 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -10,6 +10,8 @@ #include "graph.h" #include "core.h" +#include "archinfo.h" +#include #define LOC_BW 5000.0 #define SM60_NVLINK_BW 18.0 @@ -123,7 +125,7 @@ struct ncclTopoNode { int rank; int cudaCompCap; int gdrSupport; - int gcn; + const char* gcn; hipDeviceArch_t arch; }gpu; struct { @@ -224,17 +226,13 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in } // Returns XGMI speed in GB/s -static float ncclTopoXGMISpeed(int gcn) { - switch (gcn) { - case 910: - return MI200_XGMI_WIDTH; - case 940: - case 941: - case 942: - return GFX94X_XGMI_WIDTH; - default: - return VEGA_XGMI_WIDTH; - } +static float ncclTopoXGMISpeed(const char* gcn) { + if (IsArchMatch(gcn, "gfx90a")) + return MI200_XGMI_WIDTH; + else if (IsArchMatch(gcn, "gfx94")) + return GFX94X_XGMI_WIDTH; + else + return VEGA_XGMI_WIDTH; } #if ENABLE_COLLTRACE diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8a9f964db1..47f57f7e65 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -455,7 +455,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom #if defined(ENABLE_LL128) // Enable LL128 by default only on gfx90a with available tuning table pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL && - (comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->topo->ll128Enabled) ? 1 : 0; + (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled) ? 1 : 0; #else pEnable = 0; #endif diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 138d48e401..d11f80864a 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -14,6 +14,7 @@ #include "nvmlwrap.h" #include "xml.h" #include "rocm_smi_wrap.h" +#include "archinfo.h" /*******************/ /* XML File Parser */ diff --git a/src/graph/xml.h b/src/graph/xml.h index 5ffa6c90c5..d56fb32768 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -12,6 +12,7 @@ #include "debug.h" #include "checks.h" #include +#include "archinfo.h" // A few constraints to make the implementation easy #define MAX_STR_LEN 255 diff --git a/src/include/archinfo.h b/src/include/archinfo.h new file mode 100644 index 0000000000..a4a163bc3f --- /dev/null +++ b/src/include/archinfo.h @@ -0,0 +1,39 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef ARCHINFO_H_ +#define ARCHINFO_H_ + +#include + +/* +#include +#include +*/ + +void GcnArchNameFormat(char *gcnArchName, char* out); +void GcnArchConvertToGcnArchName(int gcnArch, char* out); +int GetGcnArchName(int deviceId, char* out); +double GetDeviceWallClockRateInKhz(int deviceId); +bool IsArchMatch(char const* arch, char const* target); + +#endif // ARCHINFO_H diff --git a/src/init.cc b/src/init.cc index e228120202..d19edd8b37 100644 --- a/src/init.cc +++ b/src/init.cc @@ -33,6 +33,7 @@ #include #include "graph/topo.h" #include "graph/xml.h" +#include "archinfo.h" // [RCCL] #include "git_version.h" @@ -174,15 +175,10 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0); void *ncclCommThreadMain(void *arg) { ncclComm_t comm = (ncclComm_t)arg; int head[MAXCHANNELS]; - hipDeviceProp_t devProp; double vega_gpu_rtc_freq; memset(head, 0, sizeof(int)*MAXCHANNELS); - hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev); - if (devProp.gcnArch/10 == 94 && status == hipSuccess) - vega_gpu_rtc_freq = 1.0E8; - else - vega_gpu_rtc_freq = 2.5E7; + vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3; #define MAX_NAME_LENGTH 64 char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2)); for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) { @@ -1230,17 +1226,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx)); allGather3Data[rank].nc = 2; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && - comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi) + IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi) allGather3Data[rank].nc = 4; - if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908")) allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2); if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)) allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && - comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) + IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a")) allGather3Data[rank].nc = 4; - if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a")) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; diff --git a/src/misc/archinfo.cc b/src/misc/archinfo.cc new file mode 100644 index 0000000000..565e0a7ddb --- /dev/null +++ b/src/misc/archinfo.cc @@ -0,0 +1,86 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "archinfo.h" +#include +#include + +void GcnArchNameFormat(char* gcnArchName, char* out) { + // this function parses the char array from the device properties into something easier to handle. + // as the gcnArchName attribute looks something like: "gfx900:xnack+:blah-:etc-" + char *gcnArchNameToken = strtok(gcnArchName, ":"); + strcpy(gcnArchNameToken, out); +} + +void GcnArchConvertToGcnArchName(int gcnArch, char* gcnArchName) { + // gcnArch is deprecated and we should instead use gcnArchName; however, some data files still have + // the older gcnArch value. There's only a handful of architectures that were coded prior to deprecation, + // so we handle those cases here. + //char gcnArchName[256] = {0}; // why 256? Because that's what gcnArchName gives us, so we're matching it. + gcnArchName[6] = 0; + switch (gcnArch) { + case 906: + strncpy(gcnArchName, "gfx906", 6); + break; + case 908: + strncpy(gcnArchName, "gfx908", 6); + break; + case 910: + // this is actually 90a + strncpy(gcnArchName, "gfx90a", 6); + break; + } +} + +int GetGcnArchName(int deviceId, char* out) { + // this is a generic call in to get a consistent gcnArchName regardless of which version of rocm we're using. + // or which version of rocm we're using. + hipDeviceProp_t devProp; + hipError_t status = hipGetDeviceProperties(&devProp, deviceId); + if (status != hipSuccess) { + //std::cerr << "Encountered HIP error getting device properties: " + // << hipGetErrorString(status) << "\n"; + exit(-1); + } +#ifdef HIP_NO_GCNARCHNAME + // we're using a HIP version before 3.7. + GcnArchConvertToGcnArchName(devProp.gcnArch, out); + return 1; +#else + GcnArchNameFormat(devProp.gcnArchName, out); + return 0; +#endif +} + +double GetDeviceWallClockRateInKhz(int deviceId) { + char* gcn; + GetGcnArchName(deviceId, gcn); + if (strncmp("gfx94", gcn, 5) == 0) + return 1.0E5; + else + return 2.5E4; +} + +bool IsArchMatch(char const* arch, char const* target) { + // helper function to reduce clutter in code elsewhere. Returns true on match. + return (strncmp(arch, target, strlen(target)) == 0); +} diff --git a/src/misc/npkit.cc b/src/misc/npkit.cc index 0302fbd92b..0c72779645 100644 --- a/src/misc/npkit.cc +++ b/src/misc/npkit.cc @@ -9,6 +9,7 @@ #include "alloc.h" #include "npkit/npkit.h" +#include "archinfo.h" uint64_t NpKit::rank_ = 0; @@ -120,13 +121,8 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) { dump_file_path = dump_dir; dump_file_path += "/gpu_clock_rate_rank_"; dump_file_path += std::to_string(rank_); - hipDeviceProp_t devProp; - int vega_gpu_rtc_freq_in_khz; - CUDACHECK(hipGetDeviceProperties(&devProp, 0)); - if (devProp.gcnArch/10 == 94) - vega_gpu_rtc_freq_in_khz = 100000; - else - vega_gpu_rtc_freq_in_khz = 25000; + // get the rtc frequency directly from HIP itself (via a wrapper) + double vega_gpu_rtc_freq_in_khz = GetDeviceWallClockRateInKhz(0); std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz); auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out); gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length()); diff --git a/src/transport/net.cc b/src/transport/net.cc index 274006d97f..7c454b87d8 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; - if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) { + if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) { CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); send->conn.curr_hdp_reg = req.curr_hdp_reg; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index fd9e67c72d..67ad2c2c31 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank); return ncclInternalError; } - if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) { + if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) { CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev)); TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg); } diff --git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile index 7446d7b52a..23d76e8a16 100644 --- a/tools/topo_expl/Makefile +++ b/tools/topo_expl/Makefile @@ -9,7 +9,7 @@ EXE = topo_expl CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \ - hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc + hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc all: hipify $(EXE) @@ -21,6 +21,7 @@ hipify: mkdir -p hipify_rccl cp -a ../../src/include/ hipify_rccl/ cp -a ../../src/graph/ hipify_rccl/ + cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/ hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h hipify-perl -inplace -quiet-warnings hipify_rccl/graph/* diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 3ce4e7f5f5..8a3d04298f 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -862,17 +862,17 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx)); allGather3Data[rank].nc = 2; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && - comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi) + IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi) allGather3Data[rank].nc = 4; - if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908")) allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2); if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)) allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && - comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) + IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a")) allGather3Data[rank].nc = 4; - if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a")) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1;