diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b78e74663..7e057c4af4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ set(DEFAULT_GPUS gfx908 gfx90a gfx942 + gfx950 gfx1030 gfx1100 gfx1101 @@ -313,10 +314,20 @@ if (HAVE_PARALLEL_JOBS) endif() ## Disable building MSCCL++ if the build environment is invalid -## Currently MSCCL++ is supported only on gfx942, and only on Ubuntu and CentOS -if (ENABLE_MSCCLPP AND NOT ("gfx942" IN_LIST GPU_TARGETS OR "gfx942:xnack-" IN_LIST GPU_TARGETS OR "gfx942:xnack+" IN_LIST GPU_TARGETS)) +## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS +set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+") +# Check if any of the supported architectures are in GPU_TARGETS +set(ARCH_MATCH_FOUND OFF) +foreach(ARCH ${MSCCLPP_SUPPORTED_ARCHS}) + if(ARCH IN_LIST GPU_TARGETS) + set(ARCH_MATCH_FOUND ON) + break() + endif() +endforeach() + +if (ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND) set(ENABLE_MSCCLPP OFF) - message(WARNING "Can only build MSCCL++ for gfx942; disabling MSCCL++ build") + message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS (${MSCCLPP_SUPPORTED_ARCHS}); disabling MSCCL++ build") endif() if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200") set(ENABLE_MSCCLPP OFF) diff --git a/cmake/MSCCLPP.cmake b/cmake/MSCCLPP.cmake index 1f870860e5..491cdadc2f 100644 --- a/cmake/MSCCLPP.cmake +++ b/cmake/MSCCLPP.cmake @@ -95,21 +95,22 @@ if(ENABLE_MSCCLPP) WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) - message(STATUS "Building mscclpp only for gfx942.") + message(STATUS "Building mscclpp only for supported variants:gfx942,gfx950") mscclpp_cmake_arg(CMAKE_PREFIX_PATH) mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH) mscclpp_cmake_arg(HIP_COMPILER) - set(GFX942_VARIANT "gfx942") + #gfx950 change is added for testing assuming cmake args are space separated values for list + set(GFX_VARIANT "gfx942 gfx950") if(BUILD_ADDRESS_SANITIZER) - set(GFX942_VARIANT "gfx942:xnack+") + set(GFX_VARIANT "gfx942:xnack+ gfx950:xnack+") endif() download_project(PROJ mscclpp_nccl #GIT_REPOSITORY https://github.com/microsoft/mscclpp.git #GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc INSTALL_DIR ${MSCCLPP_ROOT} - CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX= "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE} + CMAKE_ARGS -DAMDGPU_TARGETS=${GFX_VARIANT} -DGPU_TARGETS=${GFX_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX= "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE} LOG_DOWNLOAD FALSE LOG_CONFIGURE FALSE LOG_BUILD FALSE diff --git a/cmake/scripts/extract_metadata.cmake b/cmake/scripts/extract_metadata.cmake index ca025ab033..46dba319e1 100644 --- a/cmake/scripts/extract_metadata.cmake +++ b/cmake/scripts/extract_metadata.cmake @@ -31,7 +31,7 @@ if(list_result EQUAL 0) ## Extract file paths for the selected gfx archs foreach(line ${cmd_output}) - if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942)") + if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942|gfx950)") string(REGEX MATCH "\\file://(.*)" file_match ${line}) if(file_match) list(APPEND file_paths ${file_match}) diff --git a/src/device/all_gather.h b/src/device/all_gather.h index c36c9c9100..88700ada79 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -11,7 +11,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 37b35c72ea..cabf12ea74 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -15,7 +15,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { @@ -209,7 +209,7 @@ namespace { } template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { @@ -357,7 +357,7 @@ namespace { } template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { diff --git a/src/device/alltoall_pivot.h b/src/device/alltoall_pivot.h index 2924fb43e4..6b5a6392ff 100644 --- a/src/device/alltoall_pivot.h +++ b/src/device/alltoall_pivot.h @@ -10,7 +10,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { diff --git a/src/device/broadcast.h b/src/device/broadcast.h index 8737bd4eae..c53f1553c8 100644 --- a/src/device/broadcast.h +++ b/src/device/broadcast.h @@ -10,7 +10,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { diff --git a/src/device/msccl_kernel_impl.h b/src/device/msccl_kernel_impl.h index a2aeccf2c0..75ae68890f 100644 --- a/src/device/msccl_kernel_impl.h +++ b/src/device/msccl_kernel_impl.h @@ -152,7 +152,7 @@ __device__ __forceinline__ void mscclRunInterpreter( int npKitCtxIdx = bid; int xcc_id = 0; if (tid == 0) { -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id)); #endif } diff --git a/src/device/onerank.cu b/src/device/onerank.cu index 770ecba1e3..809e6af9ca 100644 --- a/src/device/onerank.cu +++ b/src/device/onerank.cu @@ -11,7 +11,7 @@ #include "common.h" #include -#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) #define COLL_UNROLL 2 #else #define COLL_UNROLL 4 diff --git a/src/device/op128.h b/src/device/op128.h index 72f90d0c14..7fc1bcdc33 100644 --- a/src/device/op128.h +++ b/src/device/op128.h @@ -124,7 +124,7 @@ union alignas(16) BytePack<16> { uint32_t u32[4]; uint64_t u64[2]; ulong2 ul2, native; -#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) inline __device__ BytePack<16>& operator=(BytePack<16> other) { u64[0] = other.u64[0]; u64[1] = other.u64[1]; diff --git a/src/device/primitives.h b/src/device/primitives.h index 10c8710c61..c49c28d582 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -15,7 +15,7 @@ #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) #define __THREAD_FENCE __threadfence_block() #else #define __THREAD_FENCE __threadfence() diff --git a/src/device/reduce.h b/src/device/reduce.h index c9856cd865..689e4261f7 100644 --- a/src/device/reduce.h +++ b/src/device/reduce.h @@ -11,7 +11,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index fc2da869b2..7d51c7ea12 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -11,7 +11,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index b49ac008c8..72ef207a4c 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -132,7 +132,7 @@ struct RunWorkBatch (subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes); #else @@ -254,7 +254,7 @@ struct RunWorkBatch>(subtid, subtn, group, work); -#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) runSend>(subtid, subtn, group, work); #else runSend>(subtid, subtn, group, work); @@ -266,7 +266,7 @@ struct RunWorkBatch>(subtid, subtn, group, work); -#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) runRecv>(subtid, subtn, group, work); #else runRecv>(subtid, subtn, group, work); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index f58c480ccc..6c6a743542 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -286,7 +286,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) { - const int channelLimit = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT; + const int channelLimit = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT; const int nChannels = (comm->nChannels > channelLimit) ? comm->nChannels / 2 : comm->nChannels; const int nNodes = comm->nNodes, node = comm->node; @@ -803,8 +803,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } } - // Only use full MAXCHANNELS for gfx94x - int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? + // Only use full MAXCHANNELS for gfx94x and gfx950 + int maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? ((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38) ? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT; diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 159b66e7b3..e78026b2f2 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -656,7 +656,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } } - // Special handling of gfx94x + // Special handling of gfx94x and gfx950 #if !defined(TOPO_EXPL) char strValue[1024]; @@ -666,7 +666,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL && - IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && + (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && ((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) || (system->nodes[GPU].count != system->nRanks))) { if (!rcclPathOverride(system, 0x100000) && !rcclPathOverride(system, 0x1000)) @@ -843,7 +843,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp path = system->nodes[GPU].nodes[peer].paths[GPU]+g; if (path->type == PATH_NVL) { float nvlBw = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn); - *nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw)); + *nChannels = ((IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw)); } else { *nChannels = 2; } @@ -906,7 +906,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { // Round to next pow2 nChannelsPerPeer and nChannels comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? pow2Up(minChannels) : ncclParamNChannelsPerPeer()); // Doubling P2P channels per peer on single node - if (comm->topo->nodes[GPU].count == comm->topo->nRanks && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) comm->p2pnChannelsPerPeer *= 2; + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) comm->p2pnChannelsPerPeer *= 2; comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT); } diff --git a/src/graph/topo.h b/src/graph/topo.h index d20b9479f6..47713db7ce 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -31,6 +31,7 @@ #define VEGA_XGMI_WIDTH 24.0 #define MI200_XGMI_WIDTH 36.0 #define GFX94X_XGMI_WIDTH 48.0 +#define GFX95X_XGMI_WIDTH 48.0 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth. @@ -263,6 +264,8 @@ static float ncclTopoXGMISpeed(const char* gcn) { return MI200_XGMI_WIDTH; else if (IsArchMatch(gcn, "gfx94")) return GFX94X_XGMI_WIDTH; + else if (IsArchMatch(gcn, "gfx95")) + return GFX95X_XGMI_WIDTH; else return VEGA_XGMI_WIDTH; } diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index ca843b267a..570bc375d2 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -360,7 +360,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue; for (int p=0; ptopo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue; + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue; if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue; int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; @@ -376,7 +376,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p]; else busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p]; - if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; } + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; } #else if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); } if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw); diff --git a/src/include/rccl_float8.h b/src/include/rccl_float8.h index 01cab41f71..944d781a11 100644 --- a/src/include/rccl_float8.h +++ b/src/include/rccl_float8.h @@ -344,7 +344,7 @@ struct rccl_float8 // default constructor HIP_HOST_DEVICE rccl_float8() = default; -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) // device specific optimized F8 down-conversion code template @@ -384,7 +384,7 @@ struct rccl_float8 #endif // __gfx940__ // constructor from float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) // NOTE: ON-DEVICE... always optimal bias explicit HIP_DEVICE rccl_float8(float v, @@ -446,7 +446,7 @@ struct rccl_float8 } // convert to float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) // upcast using device specific intrinsic explicit inline HIP_DEVICE operator float() const { @@ -511,7 +511,7 @@ struct rccl_bfloat8 // default constructor HIP_HOST_DEVICE rccl_bfloat8() = default; -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) // device specific optimized F8 down-conversion code template @@ -551,7 +551,7 @@ struct rccl_bfloat8 #endif // __gfx940__ // constructor from float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) // NOTE: ON-DEVICE... always optimal bias explicit HIP_DEVICE rccl_bfloat8(float v, @@ -613,7 +613,7 @@ struct rccl_bfloat8 } // convert to float -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) // upcast using device specific intrinsic explicit inline HIP_DEVICE operator float() const { @@ -980,7 +980,7 @@ template < = 0> inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng) { -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch T val; if(std::is_same::value) diff --git a/src/init.cc b/src/init.cc index ec9407c4c9..1bcd4e2620 100644 --- a/src/init.cc +++ b/src/init.cc @@ -97,7 +97,7 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) { ncclResult_t commSetUnrollFactor(struct ncclComm* comm) { hipDeviceProp_t devProp; CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev)); - if(IsArchMatch(devProp.gcnArchName, "gfx908") || (IsArchMatch(devProp.gcnArchName, "gfx94") + if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950")) && devProp.multiProcessorCount > 80)) comm->unroll = NCCL_UNROLL_2; else @@ -1337,7 +1337,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels); if (ringGraph->nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; - if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) { + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) { // Multi-node MI300A int managed = 0; CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0)); @@ -1858,7 +1858,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { // a CUDA memory reconfig on load (c.f. NVSHMEM issue) #ifdef USE_INDIRECT_FUNCTION_CALL CUDACHECK(hipGetDeviceProperties(&devProp, 0)); - if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94")) { + if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94") && !IsArchMatch(devProp.gcnArchName,"gfx950")) { stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes; if (stackSize == 0) { if (IsArchMatch(devProp.gcnArchName,"gfx906")) @@ -1924,7 +1924,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) { hipDeviceProp_t devProp; CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev)); - comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94"); + comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950"); if (comm->mscclppCompatible) { bool mapContainsId = (mscclpp_uniqueIdMap.count(job->commId) > 0); auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId]; diff --git a/src/misc/archinfo.cc b/src/misc/archinfo.cc index 3cdb5df1e6..547bcccd92 100644 --- a/src/misc/archinfo.cc +++ b/src/misc/archinfo.cc @@ -48,6 +48,8 @@ void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName) *gcnArchName = "gfx941"; else if (strcmp(gcnArch, "942") == 0) *gcnArchName = "gfx942"; + else if (strcmp(gcnArch, "950") == 0) + *gcnArchName = "gfx950"; else *gcnArchName = gcnArch; } @@ -65,6 +67,8 @@ double GetDeviceWallClockRateInKhz(int deviceId) { GetGcnArchName(deviceId, gcn); if (strncmp("gfx94", gcn, 5) == 0) return 1.0E5; + else if(strncmp("gfx950", gcn, 6) == 0) + return 1.0E5; else return 2.5E4; } diff --git a/src/transport/net.cc b/src/transport/net.cc index 4261dd0fb0..5fd36f6f71 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -207,7 +207,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; - if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) { + if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) { CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); send->conn.curr_hdp_reg = req.curr_hdp_reg; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index f3caa9e70a..5c3871163a 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -368,7 +368,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank); return ncclInternalError; } - if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) { + if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) { CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev)); TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg); } diff --git a/tools/JitterBench/Common.hpp b/tools/JitterBench/Common.hpp index 8fad0ac3c1..16aa82c4ce 100644 --- a/tools/JitterBench/Common.hpp +++ b/tools/JitterBench/Common.hpp @@ -34,7 +34,7 @@ THE SOFTWARE. } while (0) // Macro for collecting HW_REG_XCC_ID -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) #define GetXccId(val) \ asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)); #else