enable building rccl for gfx950 (#1571)

2025-02-25 16:13:48 -05:00
parent daaa6e155f
commit f268553ee4
24 changed files with 66 additions and 47 deletions
@@ -41,6 +41,7 @@ set(DEFAULT_GPUS
      gfx908
      gfx90a
      gfx942
+      gfx950
      gfx1030
      gfx1100
      gfx1101
@@ -313,10 +314,20 @@ if (HAVE_PARALLEL_JOBS)
 endif()

 ## Disable building MSCCL++ if the build environment is invalid
-## Currently MSCCL++ is supported only on gfx942, and only on Ubuntu and CentOS
-if (ENABLE_MSCCLPP AND NOT ("gfx942" IN_LIST GPU_TARGETS OR "gfx942:xnack-" IN_LIST GPU_TARGETS OR "gfx942:xnack+" IN_LIST GPU_TARGETS))
+## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
+set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
+# Check if any of the supported architectures are in GPU_TARGETS
+set(ARCH_MATCH_FOUND OFF)
+foreach(ARCH ${MSCCLPP_SUPPORTED_ARCHS})
+  if(ARCH IN_LIST GPU_TARGETS)
+    set(ARCH_MATCH_FOUND ON)
+    break()
+  endif()
+endforeach()
+
+if (ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND)
  set(ENABLE_MSCCLPP OFF)
-  message(WARNING "Can only build MSCCL++ for gfx942; disabling MSCCL++ build")
+  message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS (${MSCCLPP_SUPPORTED_ARCHS}); disabling MSCCL++ build")
 endif()
 if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
  set(ENABLE_MSCCLPP OFF)
@@ -95,21 +95,22 @@ if(ENABLE_MSCCLPP)
 	    WORKING_DIRECTORY ${MSCCLPP_SOURCE}
 	)

-        message(STATUS "Building mscclpp only for gfx942.")
+        message(STATUS "Building mscclpp only for supported variants:gfx942,gfx950")
        mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
        mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
        mscclpp_cmake_arg(HIP_COMPILER)

-        set(GFX942_VARIANT "gfx942")
+        #gfx950 change is added for testing assuming cmake args are space separated values for list
+        set(GFX_VARIANT "gfx942 gfx950")
        if(BUILD_ADDRESS_SANITIZER)
-            set(GFX942_VARIANT "gfx942:xnack+")
+            set(GFX_VARIANT "gfx942:xnack+ gfx950:xnack+")
        endif()

        download_project(PROJ                mscclpp_nccl
                         #GIT_REPOSITORY      https://github.com/microsoft/mscclpp.git
                         #GIT_TAG             4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
                         INSTALL_DIR         ${MSCCLPP_ROOT}
-                         CMAKE_ARGS          -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
+                         CMAKE_ARGS          -DAMDGPU_TARGETS=${GFX_VARIANT} -DGPU_TARGETS=${GFX_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
                         LOG_DOWNLOAD        FALSE
                         LOG_CONFIGURE       FALSE
                         LOG_BUILD           FALSE
@@ -31,7 +31,7 @@ if(list_result EQUAL 0)

    ## Extract file paths for the selected gfx archs
    foreach(line ${cmd_output})
-        if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942)")
+        if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942|gfx950)")
            string(REGEX MATCH "\\file://(.*)" file_match ${line})
            if(file_match)
                list(APPEND file_paths ${file_match})
@@ -11,7 +11,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -15,7 +15,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -209,7 +209,7 @@ namespace {
  }

  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -357,7 +357,7 @@ namespace {
  }

  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -10,7 +10,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -10,7 +10,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -152,7 +152,7 @@ __device__ __forceinline__ void mscclRunInterpreter(
  int npKitCtxIdx = bid;
  int xcc_id = 0;
  if (tid == 0) {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
    asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
 #endif
  }
@@ -11,7 +11,7 @@
 #include "common.h"
 #include <cuda_runtime.h>

-#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 #define COLL_UNROLL 2
 #else
 #define COLL_UNROLL 4
@@ -124,7 +124,7 @@ union alignas(16) BytePack<16> {
  uint32_t u32[4];
  uint64_t u64[2];
  ulong2 ul2, native;
-#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
  inline __device__ BytePack<16>& operator=(BytePack<16> other) {
    u64[0] = other.u64[0];
    u64[1] = other.u64[1];
@@ -15,7 +15,7 @@

 #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000

-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 #define __THREAD_FENCE __threadfence_block()
 #else
 #define __THREAD_FENCE __threadfence()
@@ -11,7 +11,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -11,7 +11,7 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -132,7 +132,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
 #endif
  }

-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
  __device__  void run() {
 #else
  __device__  __attribute__((noinline)) void run() {
@@ -241,7 +241,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
    }

    if (isCopy) {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
      reduceCopy<COLL_UNROLL*2, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
        (subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes);
 #else
@@ -254,7 +254,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
      } else {
 #if defined(__gfx90a__)
        runSend<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
-#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
        runSend<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
 #else
        runSend<ProtoSimple<1,1>>(subtid, subtn, group, work);
@@ -266,7 +266,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
      } else {
 #if defined(__gfx90a__)
        runRecv<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
-#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
        runRecv<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
 #else
        runRecv<ProtoSimple<1,1>>(subtid, subtn, group, work);
@@ -286,7 +286,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {

 static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {

-  const int channelLimit = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
+  const int channelLimit = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
  const int nChannels = (comm->nChannels > channelLimit) ? comm->nChannels / 2 : comm->nChannels;
  const int nNodes = comm->nNodes, node = comm->node;

@@ -803,8 +803,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
    }
  }

-  // Only use full MAXCHANNELS for gfx94x
-  int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
+  // Only use full MAXCHANNELS for gfx94x and gfx950
+  int maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ?
    ((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
      ? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;

@@ -656,7 +656,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
    }
  }

-  // Special handling of gfx94x
+  // Special handling of gfx94x and gfx950

 #if !defined(TOPO_EXPL)
  char strValue[1024];
@@ -666,7 +666,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
    int arch, vendor, model;
    NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
    if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
-      IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
+      (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) &&
      ((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
      (system->nodes[GPU].count != system->nRanks))) {
      if (!rcclPathOverride(system, 0x100000) && !rcclPathOverride(system, 0x1000))
@@ -843,7 +843,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
    path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
    if (path->type == PATH_NVL) {
      float nvlBw = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
-      *nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
+      *nChannels = ((IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
    } else {
      *nChannels = 2;
    }
@@ -906,7 +906,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
    // Round to next pow2 nChannelsPerPeer and nChannels
    comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? pow2Up(minChannels) : ncclParamNChannelsPerPeer());
    // Doubling P2P channels per peer on single node
-    if (comm->topo->nodes[GPU].count == comm->topo->nRanks && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) comm->p2pnChannelsPerPeer *= 2;
+    if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) comm->p2pnChannelsPerPeer *= 2;
    comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT);
  }

@@ -31,6 +31,7 @@
 #define VEGA_XGMI_WIDTH 24.0
 #define MI200_XGMI_WIDTH 36.0
 #define GFX94X_XGMI_WIDTH 48.0
+#define GFX95X_XGMI_WIDTH 48.0

 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
 // to GPU traffic consumes more PCI bandwidth.
@@ -263,6 +264,8 @@ static float ncclTopoXGMISpeed(const char* gcn) {
    return MI200_XGMI_WIDTH;
  else if (IsArchMatch(gcn, "gfx94"))
    return GFX94X_XGMI_WIDTH;
+  else if (IsArchMatch(gcn, "gfx95"))
+    return GFX95X_XGMI_WIDTH;
  else
    return VEGA_XGMI_WIDTH;
 }
@@ -360,7 +360,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
      if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
        if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
        int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
        float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
@@ -376,7 +376,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
          busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
        else
          busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
-        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
 #else
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
@@ -344,7 +344,7 @@ struct rccl_float8
    // default constructor
    HIP_HOST_DEVICE rccl_float8() = default;

-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
    // device specific optimized F8 down-conversion code

    template <bool stochastic_rounding = false>
@@ -384,7 +384,7 @@ struct rccl_float8
 #endif // __gfx940__

    // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)

    // NOTE: ON-DEVICE... always optimal bias
    explicit HIP_DEVICE rccl_float8(float                        v,
@@ -446,7 +446,7 @@ struct rccl_float8
    }

    // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
    // upcast using device specific intrinsic
    explicit inline HIP_DEVICE operator float() const
    {
@@ -511,7 +511,7 @@ struct rccl_bfloat8
    // default constructor
    HIP_HOST_DEVICE rccl_bfloat8() = default;

-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
    // device specific optimized F8 down-conversion code

    template <bool stochastic_rounding = false>
@@ -551,7 +551,7 @@ struct rccl_bfloat8
 #endif // __gfx940__

    // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)

    // NOTE: ON-DEVICE... always optimal bias
    explicit HIP_DEVICE rccl_bfloat8(float                        v,
@@ -613,7 +613,7 @@ struct rccl_bfloat8
    }

    // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
    // upcast using device specific intrinsic
    explicit inline HIP_DEVICE operator float() const
    {
@@ -980,7 +980,7 @@ template <
    = 0>
 inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
    // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
    T val;
    if(std::is_same<T, rccl_float8>::value)
@@ -97,7 +97,7 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
 ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
  hipDeviceProp_t devProp;
  CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
-  if(IsArchMatch(devProp.gcnArchName, "gfx908") || (IsArchMatch(devProp.gcnArchName, "gfx94")
+  if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950"))
    && devProp.multiProcessorCount > 80))
    comm->unroll = NCCL_UNROLL_2;
  else
@@ -1337,7 +1337,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
    allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
  if (ringGraph->nChannels > MAXCHANNELS/2)
    allGather3Data[rank].nc = 1;
-  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) {
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
    // Multi-node MI300A
    int managed = 0;
    CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
@@ -1858,7 +1858,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
 #ifdef USE_INDIRECT_FUNCTION_CALL
  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
-  if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94")) {
+  if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94") && !IsArchMatch(devProp.gcnArchName,"gfx950")) {
    stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
    if (stackSize == 0) {
      if (IsArchMatch(devProp.gcnArchName,"gfx906"))
@@ -1924,7 +1924,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
    if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
      hipDeviceProp_t devProp;
      CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
-      comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94");
+      comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950");
      if (comm->mscclppCompatible) {
        bool mapContainsId = (mscclpp_uniqueIdMap.count(job->commId) > 0);
        auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId];
@@ -48,6 +48,8 @@ void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName)
    *gcnArchName = "gfx941";
  else if (strcmp(gcnArch, "942") == 0)
    *gcnArchName = "gfx942";
+  else if (strcmp(gcnArch, "950") == 0)
+    *gcnArchName = "gfx950";
  else
    *gcnArchName = gcnArch;
 }
@@ -65,6 +67,8 @@ double GetDeviceWallClockRateInKhz(int deviceId) {
  GetGcnArchName(deviceId, gcn);
  if (strncmp("gfx94", gcn, 5) == 0)
    return 1.0E5;
+  else if(strncmp("gfx950", gcn, 6) == 0)
+    return 1.0E5;
  else
    return 2.5E4;
 }
@@ -207,7 +207,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
  send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
-  if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
+  if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
    CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
    send->conn.curr_hdp_reg = req.curr_hdp_reg;
  }
@@ -368,7 +368,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
    return ncclInternalError;
  }
-  if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
+  if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
    CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
  }
@@ -34,7 +34,7 @@ THE SOFTWARE.
    } while (0)

 // Macro for collecting HW_REG_XCC_ID
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 #define GetXccId(val) \
  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
 #else