diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b78e74663..7e057c4af4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,7 @@ set(DEFAULT_GPUS
       gfx908
       gfx90a
       gfx942
+      gfx950
       gfx1030
       gfx1100
       gfx1101
@@ -313,10 +314,20 @@ if (HAVE_PARALLEL_JOBS)
 endif()
 
 ## Disable building MSCCL++ if the build environment is invalid
-## Currently MSCCL++ is supported only on gfx942, and only on Ubuntu and CentOS
-if (ENABLE_MSCCLPP AND NOT ("gfx942" IN_LIST GPU_TARGETS OR "gfx942:xnack-" IN_LIST GPU_TARGETS OR "gfx942:xnack+" IN_LIST GPU_TARGETS))
+## Currently MSCCL++ is supported only on gfx942 and gfx950, and only on Ubuntu and CentOS
+set(MSCCLPP_SUPPORTED_ARCHS "gfx942" "gfx942:xnack-" "gfx942:xnack+" "gfx950" "gfx950:xnack-" "gfx950:xnack+")
+# Check if any of the supported architectures are in GPU_TARGETS
+set(ARCH_MATCH_FOUND OFF)
+foreach(ARCH ${MSCCLPP_SUPPORTED_ARCHS})
+  if(ARCH IN_LIST GPU_TARGETS)
+    set(ARCH_MATCH_FOUND ON)
+    break()
+  endif()
+endforeach()
+
+if (ENABLE_MSCCLPP AND NOT ARCH_MATCH_FOUND)
   set(ENABLE_MSCCLPP OFF)
-  message(WARNING "Can only build MSCCL++ for gfx942; disabling MSCCL++ build")
+  message(WARNING "Can only build MSCCL++ for supported GPU_TARGETS (${MSCCLPP_SUPPORTED_ARCHS}); disabling MSCCL++ build")
 endif()
 if (ENABLE_MSCCLPP AND ROCM_VERSION VERSION_LESS "60200")
   set(ENABLE_MSCCLPP OFF)
diff --git a/cmake/MSCCLPP.cmake b/cmake/MSCCLPP.cmake
index 1f870860e5..491cdadc2f 100644
--- a/cmake/MSCCLPP.cmake
+++ b/cmake/MSCCLPP.cmake
@@ -95,21 +95,22 @@ if(ENABLE_MSCCLPP)
 	    WORKING_DIRECTORY ${MSCCLPP_SOURCE}
 	)
 
-        message(STATUS "Building mscclpp only for gfx942.")
+        message(STATUS "Building mscclpp only for supported variants:gfx942,gfx950")
         mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
         mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
         mscclpp_cmake_arg(HIP_COMPILER)
 
-        set(GFX942_VARIANT "gfx942")
+        #gfx950 change is added for testing assuming cmake args are space separated values for list
+        set(GFX_VARIANT "gfx942 gfx950")
         if(BUILD_ADDRESS_SANITIZER)
-            set(GFX942_VARIANT "gfx942:xnack+")
+            set(GFX_VARIANT "gfx942:xnack+ gfx950:xnack+")
         endif()
 
         download_project(PROJ                mscclpp_nccl
                          #GIT_REPOSITORY      https://github.com/microsoft/mscclpp.git
                          #GIT_TAG             4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
                          INSTALL_DIR         ${MSCCLPP_ROOT}
-                         CMAKE_ARGS          -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
+                         CMAKE_ARGS          -DAMDGPU_TARGETS=${GFX_VARIANT} -DGPU_TARGETS=${GFX_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${JSON_SOURCE}
                          LOG_DOWNLOAD        FALSE
                          LOG_CONFIGURE       FALSE
                          LOG_BUILD           FALSE
diff --git a/cmake/scripts/extract_metadata.cmake b/cmake/scripts/extract_metadata.cmake
index ca025ab033..46dba319e1 100644
--- a/cmake/scripts/extract_metadata.cmake
+++ b/cmake/scripts/extract_metadata.cmake
@@ -31,7 +31,7 @@ if(list_result EQUAL 0)
 
     ## Extract file paths for the selected gfx archs
     foreach(line ${cmd_output})
-        if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942)")
+        if(line MATCHES "(gfx90a|gfx940|gfx941|gfx942|gfx950)")
             string(REGEX MATCH "\\file://(.*)" file_match ${line})
             if(file_match)
                 list(APPEND file_paths ${file_match})
diff --git a/src/device/all_gather.h b/src/device/all_gather.h
index c36c9c9100..88700ada79 100644
--- a/src/device/all_gather.h
+++ b/src/device/all_gather.h
@@ -11,7 +11,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h
index 37b35c72ea..cabf12ea74 100644
--- a/src/device/all_reduce.h
+++ b/src/device/all_reduce.h
@@ -15,7 +15,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -209,7 +209,7 @@ namespace {
   }
 
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
@@ -357,7 +357,7 @@ namespace {
   }
 
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
diff --git a/src/device/alltoall_pivot.h b/src/device/alltoall_pivot.h
index 2924fb43e4..6b5a6392ff 100644
--- a/src/device/alltoall_pivot.h
+++ b/src/device/alltoall_pivot.h
@@ -10,7 +10,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
diff --git a/src/device/broadcast.h b/src/device/broadcast.h
index 8737bd4eae..c53f1553c8 100644
--- a/src/device/broadcast.h
+++ b/src/device/broadcast.h
@@ -10,7 +10,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
diff --git a/src/device/msccl_kernel_impl.h b/src/device/msccl_kernel_impl.h
index a2aeccf2c0..75ae68890f 100644
--- a/src/device/msccl_kernel_impl.h
+++ b/src/device/msccl_kernel_impl.h
@@ -152,7 +152,7 @@ __device__ __forceinline__ void mscclRunInterpreter(
   int npKitCtxIdx = bid;
   int xcc_id = 0;
   if (tid == 0) {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
     asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
 #endif
   }
diff --git a/src/device/onerank.cu b/src/device/onerank.cu
index 770ecba1e3..809e6af9ca 100644
--- a/src/device/onerank.cu
+++ b/src/device/onerank.cu
@@ -11,7 +11,7 @@
 #include "common.h"
 #include <cuda_runtime.h>
 
-#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 #define COLL_UNROLL 2
 #else
 #define COLL_UNROLL 4
diff --git a/src/device/op128.h b/src/device/op128.h
index 72f90d0c14..7fc1bcdc33 100644
--- a/src/device/op128.h
+++ b/src/device/op128.h
@@ -124,7 +124,7 @@ union alignas(16) BytePack<16> {
   uint32_t u32[4];
   uint64_t u64[2];
   ulong2 ul2, native;
-#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
   inline __device__ BytePack<16>& operator=(BytePack<16> other) {
     u64[0] = other.u64[0];
     u64[1] = other.u64[1];
diff --git a/src/device/primitives.h b/src/device/primitives.h
index 10c8710c61..c49c28d582 100644
--- a/src/device/primitives.h
+++ b/src/device/primitives.h
@@ -15,7 +15,7 @@
 
 #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
 
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 #define __THREAD_FENCE __threadfence_block()
 #else
 #define __THREAD_FENCE __threadfence()
diff --git a/src/device/reduce.h b/src/device/reduce.h
index c9856cd865..689e4261f7 100644
--- a/src/device/reduce.h
+++ b/src/device/reduce.h
@@ -11,7 +11,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h
index fc2da869b2..7d51c7ea12 100644
--- a/src/device/reduce_scatter.h
+++ b/src/device/reduce_scatter.h
@@ -11,7 +11,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h
index b49ac008c8..72ef207a4c 100644
--- a/src/device/sendrecv.h
+++ b/src/device/sendrecv.h
@@ -132,7 +132,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
 #endif
   }
 
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__  void run() {
 #else
   __device__  __attribute__((noinline)) void run() {
@@ -241,7 +241,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
     }
 
     if (isCopy) {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
       reduceCopy<COLL_UNROLL*2, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
         (subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes);
 #else
@@ -254,7 +254,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
       } else {
 #if defined(__gfx90a__)
         runSend<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
-#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
         runSend<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
 #else
         runSend<ProtoSimple<1,1>>(subtid, subtn, group, work);
@@ -266,7 +266,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
       } else {
 #if defined(__gfx90a__)
         runRecv<ProtoSimple<1,1,8>>(subtid, subtn, group, work);
-#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#elif defined(__gfx908__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
         runRecv<ProtoSimple<1,1,4>>(subtid, subtn, group, work);
 #else
         runRecv<ProtoSimple<1,1>>(subtid, subtn, group, work);
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index f58c480ccc..6c6a743542 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -286,7 +286,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
 
 static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
 
-  const int channelLimit = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
+  const int channelLimit = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
   const int nChannels = (comm->nChannels > channelLimit) ? comm->nChannels / 2 : comm->nChannels;
   const int nNodes = comm->nNodes, node = comm->node;
 
@@ -803,8 +803,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
     }
   }
 
-  // Only use full MAXCHANNELS for gfx94x
-  int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
+  // Only use full MAXCHANNELS for gfx94x and gfx950
+  int maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ?
     ((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
       ? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
 
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 159b66e7b3..e78026b2f2 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -656,7 +656,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
     }
   }
 
-  // Special handling of gfx94x
+  // Special handling of gfx94x and gfx950
 
 #if !defined(TOPO_EXPL)
   char strValue[1024];
@@ -666,7 +666,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
     int arch, vendor, model;
     NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
     if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
-      IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") &&
+      (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) &&
       ((system->nodes[GPU].count == 8 && system->nodes[NET].count == 8 && system->nodes[GPU].count == system->nRanks) ||
       (system->nodes[GPU].count != system->nRanks))) {
       if (!rcclPathOverride(system, 0x100000) && !rcclPathOverride(system, 0x1000))
@@ -843,7 +843,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
     path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
     if (path->type == PATH_NVL) {
       float nvlBw = ncclTopoXGMISpeed(system->nodes[GPU].nodes[g].gpu.gcn);
-      *nChannels = (IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
+      *nChannels = ((IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(system->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 4 : 2)*std::max(1, (int)(path->bw / nvlBw));
     } else {
       *nChannels = 2;
     }
@@ -906,7 +906,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
     // Round to next pow2 nChannelsPerPeer and nChannels
     comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? pow2Up(minChannels) : ncclParamNChannelsPerPeer());
     // Doubling P2P channels per peer on single node
-    if (comm->topo->nodes[GPU].count == comm->topo->nRanks && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) comm->p2pnChannelsPerPeer *= 2;
+    if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) comm->p2pnChannelsPerPeer *= 2;
     comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT);
   }
 
diff --git a/src/graph/topo.h b/src/graph/topo.h
index d20b9479f6..47713db7ce 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -31,6 +31,7 @@
 #define VEGA_XGMI_WIDTH 24.0
 #define MI200_XGMI_WIDTH 36.0
 #define GFX94X_XGMI_WIDTH 48.0
+#define GFX95X_XGMI_WIDTH 48.0
 
 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
 // to GPU traffic consumes more PCI bandwidth.
@@ -263,6 +264,8 @@ static float ncclTopoXGMISpeed(const char* gcn) {
     return MI200_XGMI_WIDTH;
   else if (IsArchMatch(gcn, "gfx94"))
     return GFX94X_XGMI_WIDTH;
+  else if (IsArchMatch(gcn, "gfx95"))
+    return GFX95X_XGMI_WIDTH;
   else
     return VEGA_XGMI_WIDTH;
 }
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index ca843b267a..570bc375d2 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -360,7 +360,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
         if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
         int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
         float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
@@ -376,7 +376,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
           busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
         else
           busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
-        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
 #else
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
diff --git a/src/include/rccl_float8.h b/src/include/rccl_float8.h
index 01cab41f71..944d781a11 100644
--- a/src/include/rccl_float8.h
+++ b/src/include/rccl_float8.h
@@ -344,7 +344,7 @@ struct rccl_float8
     // default constructor
     HIP_HOST_DEVICE rccl_float8() = default;
 
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
     // device specific optimized F8 down-conversion code
 
     template <bool stochastic_rounding = false>
@@ -384,7 +384,7 @@ struct rccl_float8
 #endif // __gfx940__
 
     // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 
     // NOTE: ON-DEVICE... always optimal bias
     explicit HIP_DEVICE rccl_float8(float                        v,
@@ -446,7 +446,7 @@ struct rccl_float8
     }
 
     // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
     // upcast using device specific intrinsic
     explicit inline HIP_DEVICE operator float() const
     {
@@ -511,7 +511,7 @@ struct rccl_bfloat8
     // default constructor
     HIP_HOST_DEVICE rccl_bfloat8() = default;
 
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
     // device specific optimized F8 down-conversion code
 
     template <bool stochastic_rounding = false>
@@ -551,7 +551,7 @@ struct rccl_bfloat8
 #endif // __gfx940__
 
     // constructor from float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 
     // NOTE: ON-DEVICE... always optimal bias
     explicit HIP_DEVICE rccl_bfloat8(float                        v,
@@ -613,7 +613,7 @@ struct rccl_bfloat8
     }
 
     // convert to float
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
     // upcast using device specific intrinsic
     explicit inline HIP_DEVICE operator float() const
     {
@@ -980,7 +980,7 @@ template <
     = 0>
 inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
 {
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
     // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
     T val;
     if(std::is_same<T, rccl_float8>::value)
diff --git a/src/init.cc b/src/init.cc
index ec9407c4c9..1bcd4e2620 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -97,7 +97,7 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
 ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
   hipDeviceProp_t devProp;
   CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
-  if(IsArchMatch(devProp.gcnArchName, "gfx908") || (IsArchMatch(devProp.gcnArchName, "gfx94")
+  if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950"))
     && devProp.multiProcessorCount > 80))
     comm->unroll = NCCL_UNROLL_2;
   else
@@ -1337,7 +1337,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
   if (ringGraph->nChannels > MAXCHANNELS/2)
     allGather3Data[rank].nc = 1;
-  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) {
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
     // Multi-node MI300A
     int managed = 0;
     CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
@@ -1858,7 +1858,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
 #ifdef USE_INDIRECT_FUNCTION_CALL
   CUDACHECK(hipGetDeviceProperties(&devProp, 0));
-  if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94")) {
+  if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94") && !IsArchMatch(devProp.gcnArchName,"gfx950")) {
     stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
     if (stackSize == 0) {
       if (IsArchMatch(devProp.gcnArchName,"gfx906"))
@@ -1924,7 +1924,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
       hipDeviceProp_t devProp;
       CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
-      comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94");
+      comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94") || IsArchMatch(devProp.gcnArchName, "gfx950");
       if (comm->mscclppCompatible) {
         bool mapContainsId = (mscclpp_uniqueIdMap.count(job->commId) > 0);
         auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId];
diff --git a/src/misc/archinfo.cc b/src/misc/archinfo.cc
index 3cdb5df1e6..547bcccd92 100644
--- a/src/misc/archinfo.cc
+++ b/src/misc/archinfo.cc
@@ -48,6 +48,8 @@ void convertGcnArchToGcnArchName(const char* gcnArch, const char** gcnArchName)
     *gcnArchName = "gfx941";
   else if (strcmp(gcnArch, "942") == 0)
     *gcnArchName = "gfx942";
+  else if (strcmp(gcnArch, "950") == 0)
+    *gcnArchName = "gfx950";
   else
     *gcnArchName = gcnArch;
 }
@@ -65,6 +67,8 @@ double GetDeviceWallClockRateInKhz(int deviceId) {
   GetGcnArchName(deviceId, gcn);
   if (strncmp("gfx94", gcn, 5) == 0)
     return 1.0E5;
+  else if(strncmp("gfx950", gcn, 6) == 0)
+    return 1.0E5;
   else
     return 2.5E4;
 }
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 4261dd0fb0..5fd36f6f71 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -207,7 +207,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
-  if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
+  if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
     CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
     send->conn.curr_hdp_reg = req.curr_hdp_reg;
   }
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index f3caa9e70a..5c3871163a 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -368,7 +368,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
     return ncclInternalError;
   }
-  if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
+  if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
     CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
     TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
   }
diff --git a/tools/JitterBench/Common.hpp b/tools/JitterBench/Common.hpp
index 8fad0ac3c1..16aa82c4ce 100644
--- a/tools/JitterBench/Common.hpp
+++ b/tools/JitterBench/Common.hpp
@@ -34,7 +34,7 @@ THE SOFTWARE.
     } while (0)
 
 // Macro for collecting HW_REG_XCC_ID
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
 #define GetXccId(val) \
   asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
 #else