Gcn arch name (#886)
We use CMake to determine whether we're compiling against a version of ROCm that supports gcnArchName, and handle architecture checking accordingly. This commit adds a few helper functions as drop-in replacements for the functionality we previously used gcnArch for: sometimes to enable flags, and sometimes to set frequencies.
This commit is contained in:
@@ -122,6 +122,17 @@ message(STATUS "hipcc version: ${hipcc_version_string}")
|
||||
### Check for hipEventDisableSystemFence support
|
||||
check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
|
||||
|
||||
### Check for hipDeviceMallocUncached support
|
||||
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
|
||||
|
||||
message(STATUS "HIP Library version: ${hip_VERSION_MINOR}")
|
||||
### Check the version of HIP to see if we can use gcnArchName instead of gcnArch (deprecated)
# The version string is quoted on purpose: if hipcc_version_string is empty or
# was never parsed, an unquoted expansion leaves if() with a missing operand and
# aborts configuration. Quoted, the comparison is still well-formed and the build
# is expected to fall back to the legacy gcnArch path (HIP_NO_GCNARCHNAME=ON).
if("${hipcc_version_string}" VERSION_LESS "5.7.31921")
  set(HIP_NO_GCNARCHNAME ON)
else()
  set(HIP_NO_GCNARCHNAME OFF)
endif()
|
||||
|
||||
### Check for indirect function call support
|
||||
if(ENABLE_IFC)
|
||||
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
|
||||
@@ -300,6 +311,7 @@ set(SRC_FILES
|
||||
src/graph/xml.cc
|
||||
src/graph/xml.h
|
||||
src/group.cc
|
||||
src/include/archinfo.h
|
||||
src/include/align.h
|
||||
src/include/alloc.h
|
||||
src/include/argcheck.h
|
||||
@@ -380,6 +392,7 @@ set(SRC_FILES
|
||||
src/include/utils.h
|
||||
src/init.cc
|
||||
# src/init_nvtx.cc
|
||||
src/misc/archinfo.cc
|
||||
src/misc/argcheck.cc
|
||||
# src/misc/cudawrap.cc
|
||||
# src/misc/gdrwrap.cc
|
||||
@@ -547,6 +560,9 @@ else()
|
||||
target_compile_options(rccl PRIVATE --hipcc-func-supp)
|
||||
endif()
|
||||
endif()
|
||||
if(HIP_NO_GCNARCHNAME)
|
||||
target_compile_definitions(rccl PRIVATE HIP_NO_GCNARCHNAME)
|
||||
endif()
|
||||
if (BUILD_BFD)
|
||||
if (HAVE_BFD)
|
||||
target_compile_definitions(rccl PRIVATE HAVE_BFD)
|
||||
|
||||
@@ -61,7 +61,7 @@ CliqueManager::CliqueManager(int const rank,
|
||||
m_opIndexHead(0),
|
||||
m_opIndexTail(0),
|
||||
m_init(false),
|
||||
m_gcnArch(0),
|
||||
m_gcnArchName(char[256]),
|
||||
m_allReduceByteLimit(0),
|
||||
m_pinnedCliquePtrs(NULL),
|
||||
m_gpuBarrierGlobalCount(NULL),
|
||||
@@ -243,13 +243,13 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
|
||||
CUDACHECK(hipGetDevice(&deviceId));
|
||||
hipDeviceProp_t devProp;
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, deviceId));
|
||||
m_gcnArch = devProp.gcnArch;
|
||||
m_gcnArchName = devProp.gcnArchName;
|
||||
|
||||
// Establish when to use clique-based kernels based on input size
|
||||
SetByteLimits();
|
||||
|
||||
m_init = true;
|
||||
INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArch);
|
||||
INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArchName);
|
||||
return ncclSuccess;
|
||||
|
||||
dropback:
|
||||
@@ -266,12 +266,12 @@ void CliqueManager::SetByteLimits()
|
||||
m_allReduceByteLimit = rcclParamAllReduceCliqueByteLimit();
|
||||
if (m_allReduceByteLimit == 0)
|
||||
{
|
||||
switch (m_gcnArch)
|
||||
{
|
||||
case 906: m_allReduceByteLimit = 16777216; break;
|
||||
case 908: m_allReduceByteLimit = 8388608; break;
|
||||
default: m_allReduceByteLimit = 16777216; break;
|
||||
}
|
||||
if (IsArchMatch(m_gcnArchName, "gfx906"))
|
||||
m_allReduceByteLimit = 16777216;
|
||||
else if (IsArchMatch(m_gcnArchName, "gfx908"))
|
||||
m_allReduceByteLimit = 8388608;
|
||||
else
|
||||
m_allReduceByteLimit = 16777216;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -368,23 +368,18 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
|
||||
{
|
||||
// NOTE: These are currently based on collected data and not necessarily ideal for all hardware
|
||||
int numChannels;
|
||||
switch (m_gcnArch)
|
||||
{
|
||||
case 906:
|
||||
if (IsArchMatch(m_gcnArchName, "gfx906")) {
|
||||
if (totalBytes <= 16384) numChannels = 1;
|
||||
else numChannels = 2;
|
||||
break;
|
||||
case 908:
|
||||
} else if (IsArchMatch(m_gcnArchName, "gfx908")) {
|
||||
if (totalBytes <= 131072) numChannels = 2;
|
||||
else if (totalBytes <= 524288) numChannels = 6;
|
||||
else if (totalBytes <= 1048576) numChannels = 13;
|
||||
else numChannels = 16;
|
||||
break;
|
||||
case 910:
|
||||
} else if (IsArchMatch(m_gcnArchName, "gfx90a")) {
|
||||
if (totalBytes <= 262144) numChannels = 4;
|
||||
else numChannels = 8;
|
||||
break;
|
||||
default:
|
||||
} else {
|
||||
if (totalBytes <= 65536) numChannels = 1;
|
||||
else if (totalBytes <= 262144) numChannels = 2;
|
||||
else if (totalBytes <= 524288) numChannels = 4;
|
||||
|
||||
@@ -95,7 +95,7 @@ protected:
|
||||
int32_t m_opIndexHead; // Track start of outstanding requests
|
||||
int32_t m_opIndexTail; // Track end of outstanding requests
|
||||
bool m_init; // Whether CliqueManager has been initialized
|
||||
int m_gcnArch; // Device GCN arch value
|
||||
char[256] m_gcnArchName; // Device GCN arch value
|
||||
size_t m_allReduceByteLimit; // Byte limit for AllReduce
|
||||
cliqueDevicePtrs_t* m_pinnedCliquePtrs; // Pinned-host-memory (device accessible) containing device pointers
|
||||
int* m_gpuBarrierGlobalCount; // Part of GPU barrier (count variable shared across ranks)
|
||||
|
||||
@@ -369,7 +369,14 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s
|
||||
|
||||
ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
|
||||
NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlGpu, "gcn", &gpu->gpu.gcn));
|
||||
NCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gpu->gpu.gcn));
|
||||
if (strcmp(gpu->gpu.gcn, "906") == 0) {
|
||||
gpu->gpu.gcn = "gfx906";
|
||||
} else if (strcmp(gpu->gpu.gcn, "908") == 0) {
|
||||
gpu->gpu.gcn = "gfx908";
|
||||
} else if (strcmp(gpu->gpu.gcn, "910") == 0) {
|
||||
gpu->gpu.gcn = "gfx90a";
|
||||
}
|
||||
rcclHipDeviceArch_t arch;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));
|
||||
memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t));
|
||||
|
||||
+10
-12
@@ -10,6 +10,8 @@
|
||||
|
||||
#include "graph.h"
|
||||
#include "core.h"
|
||||
#include "archinfo.h"
|
||||
#include <string.h>
|
||||
|
||||
#define LOC_BW 5000.0
|
||||
#define SM60_NVLINK_BW 18.0
|
||||
@@ -123,7 +125,7 @@ struct ncclTopoNode {
|
||||
int rank;
|
||||
int cudaCompCap;
|
||||
int gdrSupport;
|
||||
int gcn;
|
||||
const char* gcn;
|
||||
hipDeviceArch_t arch;
|
||||
}gpu;
|
||||
struct {
|
||||
@@ -224,17 +226,13 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
|
||||
}
|
||||
|
||||
// Returns XGMI speed in GB/s
|
||||
static float ncclTopoXGMISpeed(int gcn) {
|
||||
switch (gcn) {
|
||||
case 910:
|
||||
return MI200_XGMI_WIDTH;
|
||||
case 940:
|
||||
case 941:
|
||||
case 942:
|
||||
return GFX94X_XGMI_WIDTH;
|
||||
default:
|
||||
return VEGA_XGMI_WIDTH;
|
||||
}
|
||||
static float ncclTopoXGMISpeed(const char* gcn) {
|
||||
if (IsArchMatch(gcn, "gfx90a"))
|
||||
return MI200_XGMI_WIDTH;
|
||||
else if (IsArchMatch(gcn, "gfx94"))
|
||||
return GFX94X_XGMI_WIDTH;
|
||||
else
|
||||
return VEGA_XGMI_WIDTH;
|
||||
}
|
||||
|
||||
#if ENABLE_COLLTRACE
|
||||
|
||||
@@ -455,7 +455,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
#if defined(ENABLE_LL128)
|
||||
// Enable LL128 by default only on gfx90a with available tuning table
|
||||
pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
|
||||
(comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->topo->ll128Enabled) ? 1 : 0;
|
||||
(IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled) ? 1 : 0;
|
||||
#else
|
||||
pEnable = 0;
|
||||
#endif
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "nvmlwrap.h"
|
||||
#include "xml.h"
|
||||
#include "rocm_smi_wrap.h"
|
||||
#include "archinfo.h"
|
||||
|
||||
/*******************/
|
||||
/* XML File Parser */
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include <stdlib.h>
|
||||
#include "archinfo.h"
|
||||
|
||||
// A few constraints to make the implementation easy
|
||||
#define MAX_STR_LEN 255
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef ARCHINFO_H_
|
||||
#define ARCHINFO_H_
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
*/
|
||||
|
||||
// --- Architecture-name helpers ----------------------------------------------

// Takes the architecture string reported in the device properties
// (for example "gfx900:xnack+:...") and writes a simplified form to `out`.
void GcnArchNameFormat(char* gcnArchName, char* out);

// Translates a legacy numeric gcnArch value into an architecture name string
// written to `out`.
void GcnArchConvertToGcnArchName(int gcnArch, char* out);

// Queries the HIP device `deviceId` and writes its architecture name to `out`.
int GetGcnArchName(int deviceId, char* out);

// --- Queries built on top of the architecture name ---------------------------

// Returns true when `arch` begins with `target`.
bool IsArchMatch(const char* arch, const char* target);

// Returns the wall clock rate, in kHz, used for the HIP device `deviceId`.
double GetDeviceWallClockRateInKhz(int deviceId);
|
||||
|
||||
#endif // ARCHINFO_H
|
||||
+6
-10
@@ -33,6 +33,7 @@
|
||||
#include <unistd.h>
|
||||
#include "graph/topo.h"
|
||||
#include "graph/xml.h"
|
||||
#include "archinfo.h"
|
||||
|
||||
// [RCCL]
|
||||
#include "git_version.h"
|
||||
@@ -174,15 +175,10 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
|
||||
void *ncclCommThreadMain(void *arg) {
|
||||
ncclComm_t comm = (ncclComm_t)arg;
|
||||
int head[MAXCHANNELS];
|
||||
hipDeviceProp_t devProp;
|
||||
double vega_gpu_rtc_freq;
|
||||
|
||||
memset(head, 0, sizeof(int)*MAXCHANNELS);
|
||||
hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev);
|
||||
if (devProp.gcnArch/10 == 94 && status == hipSuccess)
|
||||
vega_gpu_rtc_freq = 1.0E8;
|
||||
else
|
||||
vega_gpu_rtc_freq = 2.5E7;
|
||||
vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
|
||||
#define MAX_NAME_LENGTH 64
|
||||
char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
|
||||
for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
|
||||
@@ -1230,17 +1226,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
|
||||
allGather3Data[rank].nc = 2;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
|
||||
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
|
||||
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
|
||||
(comm->topo->type & RCCL_TOPO_CR8G))
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
|
||||
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
|
||||
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
|
||||
if (ringGraph.nChannels > MAXCHANNELS/2)
|
||||
allGather3Data[rank].nc = 1;
|
||||
|
||||
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "archinfo.h"

#include <stdio.h>

#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
|
||||
|
||||
// Extracts the bare architecture name from a device-properties string.
//
// The gcnArchName attribute looks something like "gfx900:xnack+:blah-:etc-";
// only the first ':'-separated token ("gfx900") is copied to `out`.
//
//   gcnArchName  NUL-terminated input string; it is not modified.
//   out          destination buffer; it must have room for the token plus the
//                terminating NUL. A null `out` is ignored.
//
// A null or empty input yields an empty string in `out`.
void GcnArchNameFormat(char* gcnArchName, char* out) {
  if (out == nullptr) {
    return;
  }
  if (gcnArchName == nullptr) {
    out[0] = '\0';
    return;
  }

  // Skip any leading separators, then measure up to the next one. This gives
  // the same token strtok() would, without writing into the input and without
  // strtok()'s hidden global state.
  const char* tokenBegin = gcnArchName + strspn(gcnArchName, ":");
  const size_t tokenLength = strcspn(tokenBegin, ":");

  // memmove rather than memcpy so that formatting a buffer in place
  // (out == gcnArchName) stays well-defined.
  memmove(out, tokenBegin, tokenLength);
  out[tokenLength] = '\0';
}
|
||||
|
||||
// Converts a legacy numeric gcnArch value into a gcnArchName-style string.
//
// gcnArch is deprecated in favour of gcnArchName; however, some data files and
// older HIP versions still provide the numeric value. Only a handful of
// architectures were coded before the deprecation, and those are mapped
// explicitly here.
//
//   gcnArch      legacy numeric architecture value (e.g. 906, 908, 910).
//   gcnArchName  destination buffer of at least 7 bytes; a null pointer is
//                ignored. At most 7 bytes (6 characters plus NUL) are written.
//
// Values without an explicit mapping are rendered as "gfx" followed by the
// decimal value, truncated to fit in 7 bytes, so the buffer always holds a
// valid string. NOTE(review): the decimal rendering equals the real name only
// for architectures whose suffix is purely numeric (e.g. 940 -> "gfx940").
void GcnArchConvertToGcnArchName(int gcnArch, char* gcnArchName) {
  static const size_t kNameBytes = 7;  // six characters plus the terminator

  if (gcnArchName == nullptr) {
    return;
  }

  const char* knownName = nullptr;
  switch (gcnArch) {
    case 906: knownName = "gfx906"; break;
    case 908: knownName = "gfx908"; break;
    case 910: knownName = "gfx90a"; break;  // the legacy value 910 is actually gfx90a
    default: break;
  }

  if (knownName != nullptr) {
    memcpy(gcnArchName, knownName, kNameBytes);
  } else {
    // Previously the buffer was left uninitialised for unmapped values.
    snprintf(gcnArchName, kNameBytes, "gfx%d", gcnArch);
  }
}
|
||||
|
||||
int GetGcnArchName(int deviceId, char* out) {
|
||||
// this is a generic call in to get a consistent gcnArchName regardless of which version of rocm we're using.
|
||||
// or which version of rocm we're using.
|
||||
hipDeviceProp_t devProp;
|
||||
hipError_t status = hipGetDeviceProperties(&devProp, deviceId);
|
||||
if (status != hipSuccess) {
|
||||
//std::cerr << "Encountered HIP error getting device properties: "
|
||||
// << hipGetErrorString(status) << "\n";
|
||||
exit(-1);
|
||||
}
|
||||
#ifdef HIP_NO_GCNARCHNAME
|
||||
// we're using a HIP version before 3.7.
|
||||
GcnArchConvertToGcnArchName(devProp.gcnArch, out);
|
||||
return 1;
|
||||
#else
|
||||
GcnArchNameFormat(devProp.gcnArchName, out);
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns the wall clock rate, in kHz, to use for the given HIP device.
//
//   deviceId  HIP device index to query.
//
// Devices whose architecture name starts with "gfx94" report 1.0E5 kHz; every
// other device reports 2.5E4 kHz.
double GetDeviceWallClockRateInKhz(int deviceId) {
  // GetGcnArchName writes into the buffer it is given, so it needs real
  // storage. 256 bytes matches the size of the gcnArchName field it copies from.
  char gcn[256] = {0};
  GetGcnArchName(deviceId, gcn);

  if (strncmp("gfx94", gcn, 5) == 0) {
    return 1.0E5;
  }
  return 2.5E4;
}
|
||||
|
||||
// Helper to reduce clutter in code elsewhere: reports whether `arch` begins
// with `target`.
//
//   arch    architecture name to test (e.g. "gfx90a" or "gfx90a:sramecc+").
//   target  prefix to look for (e.g. "gfx90a", or "gfx94" for a whole family).
//
// Returns true on a prefix match. An empty `target` matches any non-null
// `arch`. A null `arch` or `target` never matches, so an architecture name
// that was never populated is treated as "unknown" rather than dereferenced.
bool IsArchMatch(char const* arch, char const* target) {
  if (arch == nullptr || target == nullptr) {
    return false;
  }
  return strncmp(arch, target, strlen(target)) == 0;
}
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
#include "alloc.h"
|
||||
#include "npkit/npkit.h"
|
||||
#include "archinfo.h"
|
||||
|
||||
uint64_t NpKit::rank_ = 0;
|
||||
|
||||
@@ -120,13 +121,8 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
|
||||
dump_file_path = dump_dir;
|
||||
dump_file_path += "/gpu_clock_rate_rank_";
|
||||
dump_file_path += std::to_string(rank_);
|
||||
hipDeviceProp_t devProp;
|
||||
int vega_gpu_rtc_freq_in_khz;
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
|
||||
if (devProp.gcnArch/10 == 94)
|
||||
vega_gpu_rtc_freq_in_khz = 100000;
|
||||
else
|
||||
vega_gpu_rtc_freq_in_khz = 25000;
|
||||
// get the rtc frequency directly from HIP itself (via a wrapper)
|
||||
double vega_gpu_rtc_freq_in_khz = GetDeviceWallClockRateInKhz(0);
|
||||
std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz);
|
||||
auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
|
||||
gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
|
||||
|
||||
@@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
|
||||
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
|
||||
if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
|
||||
send->conn.curr_hdp_reg = req.curr_hdp_reg;
|
||||
}
|
||||
|
||||
@@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
|
||||
if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ EXE = topo_expl
|
||||
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
|
||||
|
||||
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
|
||||
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
|
||||
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc
|
||||
|
||||
all: hipify $(EXE)
|
||||
|
||||
@@ -21,6 +21,7 @@ hipify:
|
||||
mkdir -p hipify_rccl
|
||||
cp -a ../../src/include/ hipify_rccl/
|
||||
cp -a ../../src/graph/ hipify_rccl/
|
||||
cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/
|
||||
hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
|
||||
hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
|
||||
|
||||
|
||||
@@ -862,17 +862,17 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
|
||||
allGather3Data[rank].nc = 2;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
|
||||
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
|
||||
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
|
||||
(comm->topo->type & RCCL_TOPO_CR8G))
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
|
||||
comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
|
||||
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
|
||||
if (ringGraph.nChannels > MAXCHANNELS/2)
|
||||
allGather3Data[rank].nc = 1;
|
||||
|
||||
Reference in New Issue
Block a user