enable building rccl for gfx950 (#1571)

This commit is contained in:
Pedram Alizadeh
2025-02-25 16:13:48 -05:00
committed by GitHub
parent daaa6e155f
commit f268553ee4
24 changed files with 66 additions and 47 deletions
+14 -3
View File
@@ -41,6 +41,7 @@ set(DEFAULT_GPUS
gfx908
gfx90a
gfx942
gfx950
gfx1030
gfx1100
gfx1101
@@ -313,10 +314,20 @@ if (HAVE_PARALLEL_JOBS)
endif()
## Disable building MSCCL++ if the build environment is invalid
## Currently MSCCL++ is supported only on gfx942, and only on Ubuntu and CentOS
if (ENABLE_MSCCLPP AND NOT ("gfx942" IN_LIST GPU_TARGETS OR "gfx942:xnack-" IN_LIST GPU_TARGETS OR "gfx942:xnack+" IN_LIST GPU_TARGETS))
## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
# Check if any of the supported architectures are in GPU_TARGETS
set(ARCH_MATCH_FOUND OFF)
foreach(ARCH ${MSCCLPP_SUPPORTED_ARCHS})
if(ARCH IN_LIST GPU_TARGETS)
set(ARCH_MATCH_FOUND ON)
break()
endif()
endforeach()
if (ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND)
set(ENABLE_MSCCLPP OFF)
message(WARNING "Can only build MSCCL++ for gfx942; disabling MSCCL++ build")
message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS (${MSCCLPP_SUPPORTED_ARCHS}); disabling MSCCL++ build")
endif()
if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
set(ENABLE_MSCCLPP OFF)
+5 -4
View File
@@ -95,21 +95,22 @@ if(ENABLE_MSCCLPP)
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
message(STATUS "Building mscclpp only for gfx942.")
message(STATUS "Building mscclpp only for supported variants:gfx942,gfx950")
mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
mscclpp_cmake_arg(HIP_COMPILER)
set(GFX942_VARIANT "gfx942")
#gfx950 change is added for testing assuming cmake args are space separated values for list
set(GFX_VARIANT "gfx942 gfx950")
if(BUILD_ADDRESS_SANITIZER)
set(GFX942_VARIANT "gfx942:xnack+")
set(GFX_VARIANT "gfx942:xnack+ gfx950:xnack+")
endif()
download_project(PROJ mscclpp_nccl
#GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
#GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
INSTALL_DIR ${MSCCLPP_ROOT}
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX_VARIANT} -DGPU_TARGETS=${GFX_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
LOG_DOWNLOAD FALSE
LOG_CONFIGURE FALSE
LOG_BUILD FALSE
+1 -1
View File
@@ -31,7 +31,7 @@ if(list_result EQUAL 0)
## Extract file paths for the selected gfx archs
foreach(line ${cmd_output})
if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942)")
if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942|gfx950)")
string(REGEX MATCH "\\file://(.*)" file_match ${line})
if(file_match)
list(APPEND file_paths ${file_match})
+1 -1
View File
@@ -11,7 +11,7 @@
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
+3 -3
View File
@@ -15,7 +15,7 @@
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -209,7 +209,7 @@ namespace {
}
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -357,7 +357,7 @@ namespace {
}
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
+1 -1
View File
@@ -10,7 +10,7 @@
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
+1 -1
View File
@@ -10,7 +10,7 @@
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
+1 -1
View File
@@ -152,7 +152,7 @@ __device__ __forceinline__ void mscclRunInterpreter(
int npKitCtxIdx = bid;
int xcc_id = 0;
if (tid == 0) {
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
#endif
}
+1 -1
View File
@@ -11,7 +11,7 @@
#include "common.h"
#include <cuda_runtime.h>
#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
#define COLL_UNROLL 2
#else
#define COLL_UNROLL 4
+1 -1
View File
@@ -124,7 +124,7 @@ union alignas(16) BytePack<16> {
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
inline __device__ BytePack<16>& operator=(BytePack<16> other) {
u64[0] = other.u64[0];
u64[1] = other.u64[1];
+1 -1
View File
@@ -15,7 +15,7 @@
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
#define __THREAD_FENCE __threadfence_block()
#else
#define __THREAD_FENCE __threadfence()
+1 -1
View File
@@ -11,7 +11,7 @@
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
+1 -1
View File
@@ -11,7 +11,7 @@
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
+4 -4
View File
@@ -132,7 +132,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
#endif
}
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void run() {
#else
__device__ __attribute__((noinline)) void run() {
@@ -241,7 +241,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
}
if (isCopy) {
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
reduceCopy<COLL_UNROLL*2, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
(subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes);
#else
@@ -254,7 +254,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
} else {
#if defined(__gfx90a__)
runSend<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
runSend<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
#else
runSend<ProtoSimple<1,1>>(subtid, subtn, group, work);
@@ -266,7 +266,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
} else {
#if defined(__gfx90a__)
runRecv<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
runRecv<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
#else
runRecv<ProtoSimple<1,1>>(subtid, subtn, group, work);
+3 -3
View File
@@ -286,7 +286,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
const int channelLimit = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
const int channelLimit = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
const int nChannels = (comm->nChannels > channelLimit) ? comm->nChannels / 2 : comm->nChannels;
const int nNodes = comm->nNodes, node = comm->node;
@@ -803,8 +803,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
}
// Only use full MAXCHANNELS for gfx94x
int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
// Only use full MAXCHANNELS for gfx94x and gfx950
int maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ?
((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
+4 -4
View File
@@ -656,7 +656,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
}
}
// Special handling of gfx94x
// Special handling of gfx94x and gfx950
#if !defined(TOPO_EXPL)
char strValue[1024];
@@ -666,7 +666,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
(IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) &&
((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
(system->nodes[GPU].count != system->nRanks))) {
if (!rcclPathOverride(system, 0x100000) && !rcclPathOverride(system, 0x1000))
@@ -843,7 +843,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
if (path->type == PATH_NVL) {
float nvlBw = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
*nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
*nChannels = ((IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
} else {
*nChannels = 2;
}
@@ -906,7 +906,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
// Round to next pow2 nChannelsPerPeer and nChannels
comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? pow2Up(minChannels) : ncclParamNChannelsPerPeer());
// Doubling P2P channels per peer on single node
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) comm->p2pnChannelsPerPeer *= 2;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) comm->p2pnChannelsPerPeer *= 2;
comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT);
}
+3
View File
@@ -31,6 +31,7 @@
#define VEGA_XGMI_WIDTH 24.0
#define MI200_XGMI_WIDTH 36.0
#define GFX94X_XGMI_WIDTH 48.0
#define GFX95X_XGMI_WIDTH 48.0
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
@@ -263,6 +264,8 @@ static float ncclTopoXGMISpeed(const char* gcn) {
return MI200_XGMI_WIDTH;
else if (IsArchMatch(gcn, "gfx94"))
return GFX94X_XGMI_WIDTH;
else if (IsArchMatch(gcn, "gfx95"))
return GFX95X_XGMI_WIDTH;
else
return VEGA_XGMI_WIDTH;
}
+2 -2
View File
@@ -360,7 +360,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
@@ -376,7 +376,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
else
busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
#else
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
+7 -7
View File
@@ -344,7 +344,7 @@ struct rccl_float8
// default constructor
HIP_HOST_DEVICE rccl_float8() = default;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
// device specific optimized F8 down-conversion code
template <bool stochastic_rounding = false>
@@ -384,7 +384,7 @@ struct rccl_float8
#endif // __gfx940__
// constructor from float
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
// NOTE: ON-DEVICE... always optimal bias
explicit HIP_DEVICE rccl_float8(float v,
@@ -446,7 +446,7 @@ struct rccl_float8
}
// convert to float
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
// upcast using device specific intrinsic
explicit inline HIP_DEVICE operator float() const
{
@@ -511,7 +511,7 @@ struct rccl_bfloat8
// default constructor
HIP_HOST_DEVICE rccl_bfloat8() = default;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
// device specific optimized F8 down-conversion code
template <bool stochastic_rounding = false>
@@ -551,7 +551,7 @@ struct rccl_bfloat8
#endif // __gfx940__
// constructor from float
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
// NOTE: ON-DEVICE... always optimal bias
explicit HIP_DEVICE rccl_bfloat8(float v,
@@ -613,7 +613,7 @@ struct rccl_bfloat8
}
// convert to float
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
// upcast using device specific intrinsic
explicit inline HIP_DEVICE operator float() const
{
@@ -980,7 +980,7 @@ template <
= 0>
inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
// NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
T val;
if(std::is_same<T, rccl_float8>::value)
+4 -4
View File
@@ -97,7 +97,7 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
hipDeviceProp_t devProp;
CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
if(IsArchMatch(devProp.gcnArchName, "gfx908") || (IsArchMatch(devProp.gcnArchName, "gfx94")
if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950"))
&& devProp.multiProcessorCount > 80))
comm->unroll = NCCL_UNROLL_2;
else
@@ -1337,7 +1337,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
if (ringGraph->nChannels > MAXCHANNELS/2)
allGather3Data[rank].nc = 1;
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) {
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
// Multi-node MI300A
int managed = 0;
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
@@ -1858,7 +1858,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
#ifdef USE_INDIRECT_FUNCTION_CALL
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94")) {
if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94") && !IsArchMatch(devProp.gcnArchName,"gfx950")) {
stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
if (stackSize == 0) {
if (IsArchMatch(devProp.gcnArchName,"gfx906"))
@@ -1924,7 +1924,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
hipDeviceProp_t devProp;
CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94");
comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950");
if (comm->mscclppCompatible) {
bool mapContainsId = (mscclpp_uniqueIdMap.count(job->commId) > 0);
auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId];
+4
View File
@@ -48,6 +48,8 @@ void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName)
*gcnArchName = "gfx941";
else if (strcmp(gcnArch, "942") == 0)
*gcnArchName = "gfx942";
else if (strcmp(gcnArch, "950") == 0)
*gcnArchName = "gfx950";
else
*gcnArchName = gcnArch;
}
@@ -65,6 +67,8 @@ double GetDeviceWallClockRateInKhz(int deviceId) {
GetGcnArchName(deviceId, gcn);
if (strncmp("gfx94", gcn, 5) == 0)
return 1.0E5;
else if(strncmp("gfx950", gcn, 6) == 0)
return 1.0E5;
else
return 2.5E4;
}
+1 -1
View File
@@ -207,7 +207,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
send->conn.curr_hdp_reg = req.curr_hdp_reg;
}
+1 -1
View File
@@ -368,7 +368,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
return ncclInternalError;
}
if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
}
+1 -1
View File
@@ -34,7 +34,7 @@ THE SOFTWARE.
} while (0)
// Macro for collecting HW_REG_XCC_ID
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else