From dfda1d6fab2f296bdcb41b7b79280a876585ae20 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Fri, 21 Jul 2023 07:31:27 -0700
Subject: [PATCH] Enable gfx94x (#808) (#816)

(cherry picked from commit 94da229a7788d74685d1591a4e75a8341de64f41)

[ROCm/rccl commit: a7fcd58a9754ddaa1b96a00d68b72e4d3a379e26]
---
 projects/rccl/CMakeLists.txt                  | 11 +++-
 projects/rccl/src/collectives/device/common.h | 17 ++++++
 projects/rccl/src/graph/paths.cc              | 54 +++++++++----------
 projects/rccl/src/graph/rome_models.cc        | 42 ++++++++++++---
 projects/rccl/src/graph/search.cc             |  8 +--
 projects/rccl/src/graph/topo.cc               |  2 +-
 projects/rccl/src/graph/topo.h                | 12 ++++-
 projects/rccl/src/include/alloc.h             | 30 ++++++++---
 projects/rccl/src/init.cc                     | 20 +++++--
 .../rccl/src/misc/msccl/msccl_lifecycle.cc    |  3 +-
 projects/rccl/src/misc/npkit.cc               |  8 ++-
 projects/rccl/src/transport/net.cc            |  2 +-
 projects/rccl/src/transport/p2p.cc            |  2 +-
 projects/rccl/tools/scripts/topo_val.sh       |  2 +-
 projects/rccl/tools/topo_expl/topo_expl.cpp   |  2 +
 15 files changed, 159 insertions(+), 56 deletions(-)

diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt
index 13e1fc77c3..80cc46b41f 100644
--- a/projects/rccl/CMakeLists.txt
+++ b/projects/rccl/CMakeLists.txt
@@ -17,7 +17,7 @@ option(BUILD_LOCAL_GPU_TARGET_ONLY             "Build only for GPUs detected on
 option(BUILD_SHARED_LIBS                       "Build as shared library"                       ON)
 option(BUILD_TESTS                             "Build unit test programs"                      OFF)
 option(COLLTRACE                               "Collective Trace Option"                       ON)
-option(ENABLE_IFC                              "Enable indirect function call"                 ON)
+option(ENABLE_IFC                              "Enable indirect function call"                 OFF)
 option(INSTALL_DEPENDENCIES                    "Force install dependencies"                    OFF)
 option(PROFILE                                 "Enable profiling"                              OFF)
 option(TIMETRACE                               "Enable time-trace during compilation"          OFF)
@@ -32,6 +32,9 @@ set(DEFAULT_GPUS
       gfx908:xnack-
       gfx90a:xnack-
       gfx90a:xnack+
+      gfx940
+      gfx941
+      gfx942
       gfx1030
       gfx1100
       gfx1101
@@ -118,6 +121,9 @@ message(STATUS "hipcc version:    ${hipcc_version_string}")
 ### Check for hipEventDisableSystemFence support
 check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
 
+### Check for hipDeviceMallocUncached support
+check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+
 ### Check for indirect function call support
 if(ENABLE_IFC)
   if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -523,6 +529,9 @@ endif()
 if(${HIP_EVENT_DISABLE_FENCE})
   target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
 endif()
+if(${HIP_UNCACHED_MEMORY})
+  target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
+endif()
 if (BUILD_BFD)
   if (HAVE_BFD)
     target_compile_definitions(rccl PRIVATE HAVE_BFD)
diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h
index d4c89343f2..f645e97ae2 100644
--- a/projects/rccl/src/collectives/device/common.h
+++ b/projects/rccl/src/collectives/device/common.h
@@ -28,10 +28,17 @@
   { __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
 #endif
 
+#ifdef ENABLE_LL128
 #define NCCL_FUNC5(func, algo, devredop, type, nullify) \
   MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
   MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128,  devredop, type)), \
   MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
+#else
+#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
+  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
+  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
+  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
+#endif
 
 #define NCCL_FUNC4(func, devredop, type, nullify) \
   NCCL_FUNC5(func, TREE,    devredop, type, nullify), \
@@ -542,7 +549,11 @@ __forceinline__ __device__ void ncclKernel(
 #ifdef USE_INDIRECT_FUNCTION_CALL
       ncclFuncs[ncclShmem.work.header.funcIndex]();
 #else
+#ifdef ENABLE_LL128
       NCCL_CALL_FUNCTIONS<1>(ncclShmem.work.header.funcIndex);
+#else
+      NCCL_CALL_FUNCTIONS<0>(ncclShmem.work.header.funcIndex);
+#endif
 #endif
     }
 
@@ -607,10 +618,16 @@ __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
 #endif
 
 // Only generate inline kernels for LL
+#ifdef ENABLE_LL128
 #define IMPL_COLL4(func, algo, devredop, type) \
   IMPL_COLL_FUNC(func, algo, LL,     devredop, type) \
   IMPL_COLL_FUNC(func, algo, LL128,  devredop, type) \
   IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
+#else
+#define IMPL_COLL4(func, algo, devredop, type) \
+  IMPL_COLL_FUNC(func, algo, LL,     devredop, type) \
+  IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
+#endif
 
 #define IMPL_COLL3(func, devredop, type) \
   IMPL_COLL4(func, TREE,    devredop, type) \
diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc
index cb2bf81b31..5df512b108 100644
--- a/projects/rccl/src/graph/paths.cc
+++ b/projects/rccl/src/graph/paths.cc
@@ -700,36 +700,32 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
   } while (system->nodes[NET].count);
 
   int remove = 1;
-  int arch, vendor, model;
-  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
-    && model == NCCL_TOPO_CPU_TYPE_ROME) {
-    int gdr = 1;
-    bool allXgmi = true;
-    // detect if all GPUs are connected by XGMI
-    for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
-      int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
-      for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
-        if (i == j) continue;
-        int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
-        bool isXGMI;
-        NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
-        allXgmi &= isXGMI;
-      }
-    }
-    if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
-    for (int g = 0; g < system->nodes[GPU].count; g++) {
-      int net;
-      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
-      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
-      if (!gdr) break;
-    }
-    if (gdr && !allXgmi) {
-      remove = 0;
-      system->type |= RCCL_TOPO_GDR_ALL;
-      INFO(NCCL_GRAPH, "GDR is available on all GPUs");
+  int gdr = 1;
+  bool allXgmi = true;
+  // detect if all GPUs are connected by XGMI
+  for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
+    int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
+    for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
+      if (i == j) continue;
+      int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
+      bool isXGMI;
+      NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
+      allXgmi &= isXGMI;
     }
   }
+  if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
+  for (int g = 0; g < system->nodes[GPU].count; g++) {
+    int net;
+    NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
+    NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
+    if (!gdr) break;
+  }
+  if (gdr && !allXgmi) {
+    remove = 0;
+    system->type |= RCCL_TOPO_GDR_ALL;
+    INFO(NCCL_GRAPH, "GDR is available on all GPUs");
+  }
+
   if (rcclParamEnableIntranet()) {
     remove = 0;
     system->type |= RCCL_TOPO_FORCE_INTRA;
@@ -812,7 +808,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
 
   int arch, vendor, model;
   NCCLCHECK(ncclTopoCpuType(comm->topo, &arch, &vendor, &model));
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
     // Adjust P2P channels on Intel platform
     comm->p2pnChannelsPerPeer = 1;
     comm->p2pnChannels = 2;
diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc
index f5ce15b168..e0e5c41948 100644
--- a/projects/rccl/src/graph/rome_models.cc
+++ b/projects/rccl/src/graph/rome_models.cc
@@ -578,6 +578,32 @@ static struct rcclRomeModel rome_model_76 = {
   .treeBase = "",
 };
 
+static struct rcclRomeModel rome_model_79 = {
+  .nGpus = 8, .nCpus = 2, .nNics = 0, .nLinks = 7,
+  .gpuIds = { 0x1d000, 0x2e000, 0x3f000, 0x61000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
+  .nicIds = { },
+  .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
+  .nicNuma = { },
+  .connMatrix = { 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, },
+  .gdrLevel = { },
+  .pattern = "4040",
+  .ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
+  .options = "noCpuCheck=1",
+};
+
+static struct rcclRomeModel rome_model_80 = {
+  .nGpus = 4, .nCpus = 4, .nNics = 4, .nLinks = 3,
+  .gpuIds = { 0x82000, 0xc2000, 0x2000, 0x42000, },
+  .nicIds = { 0x81000, 0xc1000, 0x1000, 0x41000, },
+  .gpuNuma = { 2, 3, 0, 1, },
+  .nicNuma = { 2, 3, 0, 1, },
+  .connMatrix = { 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, },
+  .gdrLevel = { PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, },
+  .pattern = "11111111",
+  .ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
+  .options = "",
+};
+
 static struct rcclRomeModel romeTopoModels[] = {
   rome_model_22,
   rome_model_25,
@@ -617,6 +643,8 @@ static struct rcclRomeModel romeTopoModels[] = {
   rome_model_73,
   rome_model_74,
   rome_model_76,
+  rome_model_79,
+  rome_model_80,
 };
 
 /* Parse user defined rings. Format is like :
@@ -850,7 +878,7 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) {
   }
 }
 
-static bool disableNumaMatching(const char *options) {
+static bool checkOption(const char *options, const char *name) {
   if (strcmp(options, "")) {
     char *str_temp = (char *)malloc(strlen(options) + 1);
     strcpy(str_temp, options);
@@ -862,7 +890,7 @@ static bool disableNumaMatching(const char *options) {
     while (tokens[numTokens-1] != NULL && numTokens < MAX_OPT_TOKENS)
         tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
     for (int i = 0; i < numTokens/2; i++) {
-      if (strcmp(tokens[i*2], "disableNumaMatching") == 0) {
+      if (strcmp(tokens[i*2], name) == 0) {
         return (bool)atol(tokens[i*2+1]);
       }
     }
@@ -1202,8 +1230,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
   // only valid on Rome
   int arch, vendor, model;
   NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
-    return ncclSuccess;
 
   // number of GPUs and NICs on each numa node is used as first screening pattern
   struct rcclRomeModel romeTopo;
@@ -1229,7 +1255,10 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
   if (i < romeTopo.nGpus) match_nbio = false;
 
   for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
-    bool ignore_numa = disableNumaMatching(romeTopoModels[i].options);
+    bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
+    if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME))
+      continue;
+    bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
     if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue;
     if (romeTopo.nGpus != romeTopoModels[i].nGpus ||
       romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
@@ -1501,7 +1530,8 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
     }
   }
   INFO(NCCL_GRAPH, "%s", line);
-  system->type |= RCCL_TOPO_4P2H_ROME;
+  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME)
+    system->type |= RCCL_TOPO_4P2H_ROME;
   parseOptions(system, rome_model_68.options);
   // create 4P4H based on reference and remapped ids
   NCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc
index 45364fb8ba..ad6ff7c1d9 100644
--- a/projects/rccl/src/graph/search.cc
+++ b/projects/rccl/src/graph/search.cc
@@ -837,8 +837,8 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
 }
 
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-float speedArrayIntra[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
-float speedArrayInter[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArrayIntra[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArrayInter[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 #else
@@ -890,7 +890,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
   if (str) {
     // user supplied topo
     NCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
-    if (graph->nChannels) {
+    int arch, vendor, model;
+    NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+    if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
       system->type |= RCCL_TOPO_4P2H_ROME;
     }
   } else if (!rcclParamModelMatchingDisable() && !graph->collNet) {
diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc
index bd4c75310f..f246630fce 100644
--- a/projects/rccl/src/graph/topo.cc
+++ b/projects/rccl/src/graph/topo.cc
@@ -380,7 +380,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
   return ncclSuccess;
 }
 
-struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
+struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } };
 struct kvDict kvDictPciGen[] = {
   { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
   { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h
index cc995d4ccf..50fd6275df 100644
--- a/projects/rccl/src/graph/topo.h
+++ b/projects/rccl/src/graph/topo.h
@@ -27,6 +27,7 @@
 #define NET_BW 12.0           // 100Gbit
 #define VEGA_XGMI_WIDTH 24.0
 #define MI200_XGMI_WIDTH 36.0
+#define GFX94X_XGMI_WIDTH 48.0
 
 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
 // to GPU traffic consumes more PCI bandwidth.
@@ -223,7 +224,16 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
 
 // Returns XGMI speed in GB/s
 static float ncclTopoXGMISpeed(int gcn) {
-  return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH;
+  switch (gcn) {
+    case 910:
+      return MI200_XGMI_WIDTH;
+    case 940:
+    case 941:
+    case 942:
+      return GFX94X_XGMI_WIDTH;
+    default:
+      return VEGA_XGMI_WIDTH;
+  }
 }
 
 #if ENABLE_COLLTRACE
diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h
index 4f47be44d9..9579044bf3 100644
--- a/projects/rccl/src/include/alloc.h
+++ b/projects/rccl/src/include/alloc.h
@@ -165,9 +165,15 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t
   cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
   *ptr = nullptr;
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (isFineGrain)
+  if (isFineGrain) {
+#if defined(HIP_UNCACHED_MEMORY)
+    hipDeviceProp_t prop;
+    CUDACHECK(hipGetDeviceProperties(&prop, 0));
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
+#else
     CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-  else
+#endif
+  } else
     CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
 finish:
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
@@ -187,9 +193,15 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
   cudaStream_t stream = sideStream;
   if (stream == nullptr)
     CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  if (isFineGrain)
+  if (isFineGrain) {
+#if defined(HIP_UNCACHED_MEMORY)
+    hipDeviceProp_t prop;
+    CUDACHECK(hipGetDeviceProperties(&prop, 0));
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
+#else
     CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-  else
+#endif
+  } else
     CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
   CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
   CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
@@ -215,9 +227,15 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s
   cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
   *ptr = nullptr;
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (isFineGrain)
+  if (isFineGrain) {
+#if defined(HIP_UNCACHED_MEMORY)
+    hipDeviceProp_t prop;
+    CUDACHECK(hipGetDeviceProperties(&prop, 0));
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
+#else
     CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-  else
+#endif
+  } else
     CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
   CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
   int dev;
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 9778cb7bda..baaa64ea6d 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -171,6 +171,13 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
 void *ncclCommThreadMain(void *arg) {
   ncclComm_t comm = (ncclComm_t)arg;
   int head = 0;
+  hipDeviceProp_t devProp;
+  double vega_gpu_rtc_freq;
+  hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev);
+  if (devProp.gcnArch/10 == 94 && status == hipSuccess)
+    vega_gpu_rtc_freq = 1.0E8;
+  else
+    vega_gpu_rtc_freq = 2.5E7;
   #define MAX_NAME_LENGTH 64
   char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
   for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
@@ -213,16 +220,15 @@ void *ncclCommThreadMain(void *arg) {
       char line[1024];
       int offset = 0;
       uint16_t fIdx = td->funcIndex;
-      #define VEGA_GPU_RTC_FREQUENCY 2.5E7
       if (type == ncclCollTraceDataType) {
         sprintf(line, "## [%012.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
-          (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid,
+          (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid,
           fIdx, td->data_0, td->opCount, td->data_1);
       } else {
         if (fIdx == FUNC_INDEX_P2P || type == ncclCollTraceP2pElemType)
-          sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]);
+          sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]);
         else
-          sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount);
+          sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->opCount);
         offset = strlen(line);
         if (type == ncclCollTraceCollElemType) {
           sprintf(line+offset, " CE %s nw %d bi %d nc %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx, td->coll.nWarps, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
@@ -705,7 +711,13 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
 
   // detect if fine grained memory is available on this GPU
   int *ptr;
+#if defined(HIP_UNCACHED_MEMORY)
+  hipDeviceProp_t prop;
+  CUDACHECK(hipGetDeviceProperties(&prop, 0));
+  if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained) == hipSuccess) {
+#else
   if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
+#endif
     CUDACHECK(hipFree(ptr));
     info->hasFineGrain = true;
     NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport));
diff --git a/projects/rccl/src/misc/msccl/msccl_lifecycle.cc b/projects/rccl/src/misc/msccl/msccl_lifecycle.cc
index 8424b21aa1..885e7d0bd9 100644
--- a/projects/rccl/src/misc/msccl/msccl_lifecycle.cc
+++ b/projects/rccl/src/misc/msccl/msccl_lifecycle.cc
@@ -23,6 +23,7 @@
 #include "msccl/msccl_status.h"
 
 RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
+RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
 static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";
 static std::atomic<bool> mscclInitialized;
 static bool mscclSchedulerTriedLoadAlgo = false;
@@ -257,7 +258,7 @@ static ncclResult_t mscclSchedulerSelectAlgo(struct mscclSavedSchedulerParam* pa
   if (status.mscclSchedulerPtr) {
     NCCLCHECK(status.mscclSchedulerPtr->selectAlgo(&(param->p)));
   } else {
-    if (param->comm->topo->mscclEnabled) {
+    if (param->comm->topo->mscclEnabled || rcclParamMscclForceEnabled()) {
       NCCLCHECK(mscclInternalSchedulerSelectAlgo(&(param->p)));
     } else {
       param->p.scheduled = false;
diff --git a/projects/rccl/src/misc/npkit.cc b/projects/rccl/src/misc/npkit.cc
index 6ef83f0a40..0302fbd92b 100644
--- a/projects/rccl/src/misc/npkit.cc
+++ b/projects/rccl/src/misc/npkit.cc
@@ -120,7 +120,13 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
   dump_file_path = dump_dir;
   dump_file_path += "/gpu_clock_rate_rank_";
   dump_file_path += std::to_string(rank_);
-  constexpr int vega_gpu_rtc_freq_in_khz = 25000;
+  hipDeviceProp_t devProp;
+  int vega_gpu_rtc_freq_in_khz;
+  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
+  if (devProp.gcnArch/10 == 94)
+    vega_gpu_rtc_freq_in_khz = 100000;
+  else
+    vega_gpu_rtc_freq_in_khz = 25000;
   std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz);
   auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
   gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc
index 273858cd02..274006d97f 100644
--- a/projects/rccl/src/transport/net.cc
+++ b/projects/rccl/src/transport/net.cc
@@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
-  if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) {
+  if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
     CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
     send->conn.curr_hdp_reg = req.curr_hdp_reg;
   }
diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc
index 460b4bf4e9..fd9e67c72d 100644
--- a/projects/rccl/src/transport/p2p.cc
+++ b/projects/rccl/src/transport/p2p.cc
@@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
     return ncclInternalError;
   }
-  if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) {
+  if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
     CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
     TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
   }
diff --git a/projects/rccl/tools/scripts/topo_val.sh b/projects/rccl/tools/scripts/topo_val.sh
index 1dc06dd074..14c2cf2831 100755
--- a/projects/rccl/tools/scripts/topo_val.sh
+++ b/projects/rccl/tools/scripts/topo_val.sh
@@ -21,7 +21,7 @@
 
 DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-for i in {0..78}
+for i in {0..80}
 do
 	if [[ $i -eq 50 ]] || [[ $i -eq 51 ]]
 	then
diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp
index ea636a5492..1430e76f70 100644
--- a/projects/rccl/tools/topo_expl/topo_expl.cpp
+++ b/projects/rccl/tools/topo_expl/topo_expl.cpp
@@ -151,6 +151,8 @@ NodeModelDesc model_descs[] = {
   {2, "topo_8p1h_4.xml",        "2 nodes 8P1H Alt."},
   {1, "topo_8p1h_5.xml",        "Single node 8P1H Alt."},
   {2, "topo_8p1h_5.xml",        "2 nodes 8P1H Alt."},
+  {1, "topo_8p_940.xml",        "Single node gfx940 8P"},
+  {2, "topo_4p_940.xml",        "2 nodes gfx940 4P"},
 };
 
 NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);