enable building rccl for gfx950 (#1571)
This commit is contained in:
+14
-3
@@ -41,6 +41,7 @@ set(DEFAULT_GPUS
|
||||
gfx908
|
||||
gfx90a
|
||||
gfx942
|
||||
gfx950
|
||||
gfx1030
|
||||
gfx1100
|
||||
gfx1101
|
||||
@@ -313,10 +314,20 @@ if (HAVE_PARALLEL_JOBS)
|
||||
endif()
|
||||
|
||||
## Disable building MSCCL++ if the build environment is invalid
|
||||
## Currently MSCCL++ is supported only on gfx942, and only on Ubuntu and CentOS
|
||||
if (ENABLE_MSCCLPP AND NOT ("gfx942" IN_LIST GPU_TARGETS OR "gfx942:xnack-" IN_LIST GPU_TARGETS OR "gfx942:xnack+" IN_LIST GPU_TARGETS))
|
||||
## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
|
||||
set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
|
||||
# Check if any of the supported architectures are in GPU_TARGETS
|
||||
set(ARCH_MATCH_FOUND OFF)
|
||||
foreach(ARCH ${MSCCLPP_SUPPORTED_ARCHS})
|
||||
if(ARCH IN_LIST GPU_TARGETS)
|
||||
set(ARCH_MATCH_FOUND ON)
|
||||
break()
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
if (ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND)
|
||||
set(ENABLE_MSCCLPP OFF)
|
||||
message(WARNING "Can only build MSCCL++ for gfx942; disabling MSCCL++ build")
|
||||
message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS (${MSCCLPP_SUPPORTED_ARCHS}); disabling MSCCL++ build")
|
||||
endif()
|
||||
if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
|
||||
set(ENABLE_MSCCLPP OFF)
|
||||
|
||||
+5
-4
@@ -95,21 +95,22 @@ if(ENABLE_MSCCLPP)
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
message(STATUS "Building mscclpp only for gfx942.")
|
||||
message(STATUS "Building mscclpp only for supported variants:gfx942,gfx950")
|
||||
mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
|
||||
mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
|
||||
mscclpp_cmake_arg(HIP_COMPILER)
|
||||
|
||||
set(GFX942_VARIANT "gfx942")
|
||||
#gfx950 change is added for testing assuming cmake args are space separated values for list
|
||||
set(GFX_VARIANT "gfx942 gfx950")
|
||||
if(BUILD_ADDRESS_SANITIZER)
|
||||
set(GFX942_VARIANT "gfx942:xnack+")
|
||||
set(GFX_VARIANT "gfx942:xnack+ gfx950:xnack+")
|
||||
endif()
|
||||
|
||||
download_project(PROJ mscclpp_nccl
|
||||
#GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
|
||||
#GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
|
||||
INSTALL_DIR ${MSCCLPP_ROOT}
|
||||
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
|
||||
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX_VARIANT} -DGPU_TARGETS=${GFX_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
|
||||
LOG_DOWNLOAD FALSE
|
||||
LOG_CONFIGURE FALSE
|
||||
LOG_BUILD FALSE
|
||||
|
||||
@@ -31,7 +31,7 @@ if(list_result EQUAL 0)
|
||||
|
||||
## Extract file paths for the selected gfx archs
|
||||
foreach(line ${cmd_output})
|
||||
if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942)")
|
||||
if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942|gfx950)")
|
||||
string(REGEX MATCH "\\file://(.*)" file_match ${line})
|
||||
if(file_match)
|
||||
list(APPEND file_paths ${file_match})
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
@@ -209,7 +209,7 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
@@ -357,7 +357,7 @@ namespace {
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
|
||||
@@ -152,7 +152,7 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
int npKitCtxIdx = bid;
|
||||
int xcc_id = 0;
|
||||
if (tid == 0) {
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#include "common.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
#define COLL_UNROLL 2
|
||||
#else
|
||||
#define COLL_UNROLL 4
|
||||
|
||||
+1
-1
@@ -124,7 +124,7 @@ union alignas(16) BytePack<16> {
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
ulong2 ul2, native;
|
||||
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
inline __device__ BytePack<16>& operator=(BytePack<16> other) {
|
||||
u64[0] = other.u64[0];
|
||||
u64[1] = other.u64[1];
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
|
||||
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
#define __THREAD_FENCE __threadfence_block()
|
||||
#else
|
||||
#define __THREAD_FENCE __threadfence()
|
||||
|
||||
+1
-1
@@ -11,7 +11,7 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
|
||||
@@ -132,7 +132,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void run() {
|
||||
#else
|
||||
__device__ __attribute__((noinline)) void run() {
|
||||
@@ -241,7 +241,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
}
|
||||
|
||||
if (isCopy) {
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
reduceCopy<COLL_UNROLL*2, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
|
||||
(subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes);
|
||||
#else
|
||||
@@ -254,7 +254,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
} else {
|
||||
#if defined(__gfx90a__)
|
||||
runSend<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
|
||||
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
runSend<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
|
||||
#else
|
||||
runSend<ProtoSimple<1,1>>(subtid, subtn, group, work);
|
||||
@@ -266,7 +266,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
} else {
|
||||
#if defined(__gfx90a__)
|
||||
runRecv<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
|
||||
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
runRecv<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
|
||||
#else
|
||||
runRecv<ProtoSimple<1,1>>(subtid, subtn, group, work);
|
||||
|
||||
@@ -286,7 +286,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
|
||||
|
||||
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
|
||||
|
||||
const int channelLimit = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
|
||||
const int channelLimit = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
|
||||
const int nChannels = (comm->nChannels > channelLimit) ? comm->nChannels / 2 : comm->nChannels;
|
||||
const int nNodes = comm->nNodes, node = comm->node;
|
||||
|
||||
@@ -803,8 +803,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
}
|
||||
}
|
||||
|
||||
// Only use full MAXCHANNELS for gfx94x
|
||||
int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
|
||||
// Only use full MAXCHANNELS for gfx94x and gfx950
|
||||
int maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ?
|
||||
((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
|
||||
? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
|
||||
|
||||
|
||||
+4
-4
@@ -656,7 +656,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
}
|
||||
}
|
||||
|
||||
// Special handling of gfx94x
|
||||
// Special handling of gfx94x and gfx950
|
||||
|
||||
#if !defined(TOPO_EXPL)
|
||||
char strValue[1024];
|
||||
@@ -666,7 +666,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
|
||||
IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
|
||||
(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) &&
|
||||
((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
|
||||
(system->nodes[GPU].count != system->nRanks))) {
|
||||
if (!rcclPathOverride(system, 0x100000) && !rcclPathOverride(system, 0x1000))
|
||||
@@ -843,7 +843,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
|
||||
path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
|
||||
if (path->type == PATH_NVL) {
|
||||
float nvlBw = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
|
||||
*nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
|
||||
*nChannels = ((IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
|
||||
} else {
|
||||
*nChannels = 2;
|
||||
}
|
||||
@@ -906,7 +906,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
// Round to next pow2 nChannelsPerPeer and nChannels
|
||||
comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? pow2Up(minChannels) : ncclParamNChannelsPerPeer());
|
||||
// Doubling P2P channels per peer on single node
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) comm->p2pnChannelsPerPeer *= 2;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) comm->p2pnChannelsPerPeer *= 2;
|
||||
comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT);
|
||||
}
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#define VEGA_XGMI_WIDTH 24.0
|
||||
#define MI200_XGMI_WIDTH 36.0
|
||||
#define GFX94X_XGMI_WIDTH 48.0
|
||||
#define GFX95X_XGMI_WIDTH 48.0
|
||||
|
||||
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
|
||||
// to GPU traffic consumes more PCI bandwidth.
|
||||
@@ -263,6 +264,8 @@ static float ncclTopoXGMISpeed(const char* gcn) {
|
||||
return MI200_XGMI_WIDTH;
|
||||
else if (IsArchMatch(gcn, "gfx94"))
|
||||
return GFX94X_XGMI_WIDTH;
|
||||
else if (IsArchMatch(gcn, "gfx95"))
|
||||
return GFX95X_XGMI_WIDTH;
|
||||
else
|
||||
return VEGA_XGMI_WIDTH;
|
||||
}
|
||||
|
||||
+2
-2
@@ -360,7 +360,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
|
||||
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
|
||||
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
|
||||
@@ -376,7 +376,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
|
||||
else
|
||||
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
|
||||
#else
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
|
||||
|
||||
@@ -344,7 +344,7 @@ struct rccl_float8
|
||||
// default constructor
|
||||
HIP_HOST_DEVICE rccl_float8() = default;
|
||||
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
// device specific optimized F8 down-conversion code
|
||||
|
||||
template <bool stochastic_rounding = false>
|
||||
@@ -384,7 +384,7 @@ struct rccl_float8
|
||||
#endif // __gfx940__
|
||||
|
||||
// constructor from float
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
|
||||
// NOTE: ON-DEVICE... always optimal bias
|
||||
explicit HIP_DEVICE rccl_float8(float v,
|
||||
@@ -446,7 +446,7 @@ struct rccl_float8
|
||||
}
|
||||
|
||||
// convert to float
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
// upcast using device specific intrinsic
|
||||
explicit inline HIP_DEVICE operator float() const
|
||||
{
|
||||
@@ -511,7 +511,7 @@ struct rccl_bfloat8
|
||||
// default constructor
|
||||
HIP_HOST_DEVICE rccl_bfloat8() = default;
|
||||
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
// device specific optimized F8 down-conversion code
|
||||
|
||||
template <bool stochastic_rounding = false>
|
||||
@@ -551,7 +551,7 @@ struct rccl_bfloat8
|
||||
#endif // __gfx940__
|
||||
|
||||
// constructor from float
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
|
||||
// NOTE: ON-DEVICE... always optimal bias
|
||||
explicit HIP_DEVICE rccl_bfloat8(float v,
|
||||
@@ -613,7 +613,7 @@ struct rccl_bfloat8
|
||||
}
|
||||
|
||||
// convert to float
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
// upcast using device specific intrinsic
|
||||
explicit inline HIP_DEVICE operator float() const
|
||||
{
|
||||
@@ -980,7 +980,7 @@ template <
|
||||
= 0>
|
||||
inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
|
||||
{
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
// NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
|
||||
T val;
|
||||
if(std::is_same<T, rccl_float8>::value)
|
||||
|
||||
+4
-4
@@ -97,7 +97,7 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
|
||||
ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
|
||||
hipDeviceProp_t devProp;
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
|
||||
if(IsArchMatch(devProp.gcnArchName, "gfx908") || (IsArchMatch(devProp.gcnArchName, "gfx94")
|
||||
if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950"))
|
||||
&& devProp.multiProcessorCount > 80))
|
||||
comm->unroll = NCCL_UNROLL_2;
|
||||
else
|
||||
@@ -1337,7 +1337,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
|
||||
if (ringGraph->nChannels > MAXCHANNELS/2)
|
||||
allGather3Data[rank].nc = 1;
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) {
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
|
||||
// Multi-node MI300A
|
||||
int managed = 0;
|
||||
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
@@ -1858,7 +1858,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
|
||||
#ifdef USE_INDIRECT_FUNCTION_CALL
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
|
||||
if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94")) {
|
||||
if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94") && !IsArchMatch(devProp.gcnArchName,"gfx950")) {
|
||||
stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
|
||||
if (stackSize == 0) {
|
||||
if (IsArchMatch(devProp.gcnArchName,"gfx906"))
|
||||
@@ -1924,7 +1924,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
|
||||
hipDeviceProp_t devProp;
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
|
||||
comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94");
|
||||
comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950");
|
||||
if (comm->mscclppCompatible) {
|
||||
bool mapContainsId = (mscclpp_uniqueIdMap.count(job->commId) > 0);
|
||||
auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId];
|
||||
|
||||
@@ -48,6 +48,8 @@ void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName)
|
||||
*gcnArchName = "gfx941";
|
||||
else if (strcmp(gcnArch, "942") == 0)
|
||||
*gcnArchName = "gfx942";
|
||||
else if (strcmp(gcnArch, "950") == 0)
|
||||
*gcnArchName = "gfx950";
|
||||
else
|
||||
*gcnArchName = gcnArch;
|
||||
}
|
||||
@@ -65,6 +67,8 @@ double GetDeviceWallClockRateInKhz(int deviceId) {
|
||||
GetGcnArchName(deviceId, gcn);
|
||||
if (strncmp("gfx94", gcn, 5) == 0)
|
||||
return 1.0E5;
|
||||
else if(strncmp("gfx950", gcn, 6) == 0)
|
||||
return 1.0E5;
|
||||
else
|
||||
return 2.5E4;
|
||||
}
|
||||
|
||||
@@ -207,7 +207,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
|
||||
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
|
||||
if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
|
||||
send->conn.curr_hdp_reg = req.curr_hdp_reg;
|
||||
}
|
||||
|
||||
@@ -368,7 +368,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
|
||||
if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ THE SOFTWARE.
|
||||
} while (0)
|
||||
|
||||
// Macro for collecting HW_REG_XCC_ID
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
#define GetXccId(val) \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
|
||||
#else
|
||||
|
||||
Reference in New Issue
Block a user