Enable gfx94x (#808) (#816)

(cherry picked from commit 94da229a7788d74685d1591a4e75a8341de64f41) [ROCm/rccl commit: a7fcd58a97]
2023-07-21 07:31:27 -07:00
@@ -17,7 +17,7 @@ option(BUILD_LOCAL_GPU_TARGET_ONLY             "Build only for GPUs detected on
 option(BUILD_SHARED_LIBS                       "Build as shared library"                       ON)
 option(BUILD_TESTS                             "Build unit test programs"                      OFF)
 option(COLLTRACE                               "Collective Trace Option"                       ON)
-option(ENABLE_IFC                              "Enable indirect function call"                 ON)
+option(ENABLE_IFC                              "Enable indirect function call"                 OFF)
 option(INSTALL_DEPENDENCIES                    "Force install dependencies"                    OFF)
 option(PROFILE                                 "Enable profiling"                              OFF)
 option(TIMETRACE                               "Enable time-trace during compilation"          OFF)
@@ -32,6 +32,9 @@ set(DEFAULT_GPUS
      gfx908:xnack-
      gfx90a:xnack-
      gfx90a:xnack+
+      gfx940
+      gfx941
+      gfx942
      gfx1030
      gfx1100
      gfx1101
@@ -118,6 +121,9 @@ message(STATUS "hipcc version:    ${hipcc_version_string}")
 ### Check for hipEventDisableSystemFence support
 check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)

+### Check for hipDeviceMallocUncached support
+check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+
 ### Check for indirect function call support
 if(ENABLE_IFC)
  if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -523,6 +529,9 @@ endif()
 if(${HIP_EVENT_DISABLE_FENCE})
  target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
 endif()
+if(${HIP_UNCACHED_MEMORY})
+  target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
+endif()
 if (BUILD_BFD)
  if (HAVE_BFD)
    target_compile_definitions(rccl PRIVATE HAVE_BFD)
@@ -28,10 +28,17 @@
  { __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
 #endif

+#ifdef ENABLE_LL128
 #define NCCL_FUNC5(func, algo, devredop, type, nullify) \
  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128,  devredop, type)), \
  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
+#else
+#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
+  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
+  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL,     devredop, type)), \
+  MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
+#endif

 #define NCCL_FUNC4(func, devredop, type, nullify) \
  NCCL_FUNC5(func, TREE,    devredop, type, nullify), \
@@ -542,7 +549,11 @@ __forceinline__ __device__ void ncclKernel(
 #ifdef USE_INDIRECT_FUNCTION_CALL
      ncclFuncs[ncclShmem.work.header.funcIndex]();
 #else
+#ifdef ENABLE_LL128
      NCCL_CALL_FUNCTIONS<1>(ncclShmem.work.header.funcIndex);
+#else
+      NCCL_CALL_FUNCTIONS<0>(ncclShmem.work.header.funcIndex);
+#endif
 #endif
    }

@@ -607,10 +618,16 @@ __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
 #endif

 // Only generate inline kernels for LL
+#ifdef ENABLE_LL128
 #define IMPL_COLL4(func, algo, devredop, type) \
  IMPL_COLL_FUNC(func, algo, LL,     devredop, type) \
  IMPL_COLL_FUNC(func, algo, LL128,  devredop, type) \
  IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
+#else
+#define IMPL_COLL4(func, algo, devredop, type) \
+  IMPL_COLL_FUNC(func, algo, LL,     devredop, type) \
+  IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
+#endif

 #define IMPL_COLL3(func, devredop, type) \
  IMPL_COLL4(func, TREE,    devredop, type) \
@@ -700,36 +700,32 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
  } while (system->nodes[NET].count);

  int remove = 1;
-  int arch, vendor, model;
-  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
-    && model == NCCL_TOPO_CPU_TYPE_ROME) {
-    int gdr = 1;
-    bool allXgmi = true;
-    // detect if all GPUs are connected by XGMI
-    for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
-      int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
-      for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
-        if (i == j) continue;
-        int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
-        bool isXGMI;
-        NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
-        allXgmi &= isXGMI;
-      }
-    }
-    if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
-    for (int g = 0; g < system->nodes[GPU].count; g++) {
-      int net;
-      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
-      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
-      if (!gdr) break;
-    }
-    if (gdr && !allXgmi) {
-      remove = 0;
-      system->type |= RCCL_TOPO_GDR_ALL;
-      INFO(NCCL_GRAPH, "GDR is available on all GPUs");
+  int gdr = 1;
+  bool allXgmi = true;
+  // detect if all GPUs are connected by XGMI
+  for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
+    int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
+    for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
+      if (i == j) continue;
+      int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
+      bool isXGMI;
+      NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
+      allXgmi &= isXGMI;
    }
  }
+  if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
+  for (int g = 0; g < system->nodes[GPU].count; g++) {
+    int net;
+    NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
+    NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
+    if (!gdr) break;
+  }
+  if (gdr && !allXgmi) {
+    remove = 0;
+    system->type |= RCCL_TOPO_GDR_ALL;
+    INFO(NCCL_GRAPH, "GDR is available on all GPUs");
+  }
+
  if (rcclParamEnableIntranet()) {
    remove = 0;
    system->type |= RCCL_TOPO_FORCE_INTRA;
@@ -812,7 +808,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {

  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(comm->topo, &arch, &vendor, &model));
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
    // Adjust P2P channels on Intel platform
    comm->p2pnChannelsPerPeer = 1;
    comm->p2pnChannels = 2;
@@ -578,6 +578,32 @@ static struct rcclRomeModel rome_model_76 = {
  .treeBase = "",
 };

+static struct rcclRomeModel rome_model_79 = {
+  .nGpus = 8, .nCpus = 2, .nNics = 0, .nLinks = 7,
+  .gpuIds = { 0x1d000, 0x2e000, 0x3f000, 0x61000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
+  .nicIds = { },
+  .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
+  .nicNuma = { },
+  .connMatrix = { 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, },
+  .gdrLevel = { },
+  .pattern = "4040",
+  .ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
+  .options = "noCpuCheck=1",
+};
+
+static struct rcclRomeModel rome_model_80 = {
+  .nGpus = 4, .nCpus = 4, .nNics = 4, .nLinks = 3,
+  .gpuIds = { 0x82000, 0xc2000, 0x2000, 0x42000, },
+  .nicIds = { 0x81000, 0xc1000, 0x1000, 0x41000, },
+  .gpuNuma = { 2, 3, 0, 1, },
+  .nicNuma = { 2, 3, 0, 1, },
+  .connMatrix = { 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, },
+  .gdrLevel = { PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, },
+  .pattern = "11111111",
+  .ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
+  .options = "",
+};
+
 static struct rcclRomeModel romeTopoModels[] = {
  rome_model_22,
  rome_model_25,
@@ -617,6 +643,8 @@ static struct rcclRomeModel romeTopoModels[] = {
  rome_model_73,
  rome_model_74,
  rome_model_76,
+  rome_model_79,
+  rome_model_80,
 };

 /* Parse user defined rings. Format is like :
@@ -850,7 +878,7 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) {
  }
 }

-static bool disableNumaMatching(const char *options) {
+static bool checkOption(const char *options, const char *name) {
  if (strcmp(options, "")) {
    char *str_temp = (char *)malloc(strlen(options) + 1);
    strcpy(str_temp, options);
@@ -862,7 +890,7 @@ static bool disableNumaMatching(const char *options) {
    while (tokens[numTokens-1] != NULL && numTokens < MAX_OPT_TOKENS)
        tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
    for (int i = 0; i < numTokens/2; i++) {
-      if (strcmp(tokens[i*2], "disableNumaMatching") == 0) {
+      if (strcmp(tokens[i*2], name) == 0) {
        return (bool)atol(tokens[i*2+1]);
      }
    }
@@ -1202,8 +1230,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
  // only valid on Rome
  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
-    return ncclSuccess;

  // number of GPUs and NICs on each numa node is used as first screening pattern
  struct rcclRomeModel romeTopo;
@@ -1229,7 +1255,10 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
  if (i < romeTopo.nGpus) match_nbio = false;

  for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
-    bool ignore_numa = disableNumaMatching(romeTopoModels[i].options);
+    bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
+    if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME))
+      continue;
+    bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
    if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue;
    if (romeTopo.nGpus != romeTopoModels[i].nGpus ||
      romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
@@ -1501,7 +1530,8 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
    }
  }
  INFO(NCCL_GRAPH, "%s", line);
-  system->type |= RCCL_TOPO_4P2H_ROME;
+  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME)
+    system->type |= RCCL_TOPO_4P2H_ROME;
  parseOptions(system, rome_model_68.options);
  // create 4P4H based on reference and remapped ids
  NCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
@@ -837,8 +837,8 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
 }

 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-float speedArrayIntra[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
-float speedArrayInter[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArrayIntra[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArrayInter[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
 #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
 #else
@@ -890,7 +890,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
  if (str) {
    // user supplied topo
    NCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
-    if (graph->nChannels) {
+    int arch, vendor, model;
+    NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+    if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
      system->type |= RCCL_TOPO_4P2H_ROME;
    }
  } else if (!rcclParamModelMatchingDisable() && !graph->collNet) {
@@ -380,7 +380,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
  return ncclSuccess;
 }

-struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
+struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } };
 struct kvDict kvDictPciGen[] = {
  { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
  { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
@@ -27,6 +27,7 @@
 #define NET_BW 12.0           // 100Gbit
 #define VEGA_XGMI_WIDTH 24.0
 #define MI200_XGMI_WIDTH 36.0
+#define GFX94X_XGMI_WIDTH 48.0

 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
 // to GPU traffic consumes more PCI bandwidth.
@@ -223,7 +224,16 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in

 // Returns XGMI speed in GB/s
 static float ncclTopoXGMISpeed(int gcn) {
-  return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH;
+  switch (gcn) {
+    case 910:
+      return MI200_XGMI_WIDTH;
+    case 940:
+    case 941:
+    case 942:
+      return GFX94X_XGMI_WIDTH;
+    default:
+      return VEGA_XGMI_WIDTH;
+  }
 }

 #if ENABLE_COLLTRACE
@@ -165,9 +165,15 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (isFineGrain)
+  if (isFineGrain) {
+#if defined(HIP_UNCACHED_MEMORY)
+    hipDeviceProp_t prop;
+    CUDACHECK(hipGetDeviceProperties(&prop, 0));
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
+#else
    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-  else
+#endif
+  } else
    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
@@ -187,9 +193,15 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
  cudaStream_t stream = sideStream;
  if (stream == nullptr)
    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  if (isFineGrain)
+  if (isFineGrain) {
+#if defined(HIP_UNCACHED_MEMORY)
+    hipDeviceProp_t prop;
+    CUDACHECK(hipGetDeviceProperties(&prop, 0));
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
+#else
    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-  else
+#endif
+  } else
    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
@@ -215,9 +227,15 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (isFineGrain)
+  if (isFineGrain) {
+#if defined(HIP_UNCACHED_MEMORY)
+    hipDeviceProp_t prop;
+    CUDACHECK(hipGetDeviceProperties(&prop, 0));
+    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
+#else
    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-  else
+#endif
+  } else
    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
  int dev;
@@ -171,6 +171,13 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
 void *ncclCommThreadMain(void *arg) {
  ncclComm_t comm = (ncclComm_t)arg;
  int head = 0;
+  hipDeviceProp_t devProp;
+  double vega_gpu_rtc_freq;
+  hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev);
+  if (devProp.gcnArch/10 == 94 && status == hipSuccess)
+    vega_gpu_rtc_freq = 1.0E8;
+  else
+    vega_gpu_rtc_freq = 2.5E7;
  #define MAX_NAME_LENGTH 64
  char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
  for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
@@ -213,16 +220,15 @@ void *ncclCommThreadMain(void *arg) {
      char line[1024];
      int offset = 0;
      uint16_t fIdx = td->funcIndex;
-      #define VEGA_GPU_RTC_FREQUENCY 2.5E7
      if (type == ncclCollTraceDataType) {
        sprintf(line, "## [%012.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
-          (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid,
+          (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid,
          fIdx, td->data_0, td->opCount, td->data_1);
      } else {
        if (fIdx == FUNC_INDEX_P2P || type == ncclCollTraceP2pElemType)
-          sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]);
+          sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]);
        else
-          sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount);
+          sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->opCount);
        offset = strlen(line);
        if (type == ncclCollTraceCollElemType) {
          sprintf(line+offset, " CE %s nw %d bi %d nc %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx, td->coll.nWarps, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
@@ -705,7 +711,13 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u

  // detect if fine grained memory is available on this GPU
  int *ptr;
+#if defined(HIP_UNCACHED_MEMORY)
+  hipDeviceProp_t prop;
+  CUDACHECK(hipGetDeviceProperties(&prop, 0));
+  if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained) == hipSuccess) {
+#else
  if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
+#endif
    CUDACHECK(hipFree(ptr));
    info->hasFineGrain = true;
    NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport));
@@ -23,6 +23,7 @@
 #include "msccl/msccl_status.h"

 RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
+RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
 static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";
 static std::atomic<bool> mscclInitialized;
 static bool mscclSchedulerTriedLoadAlgo = false;
@@ -257,7 +258,7 @@ static ncclResult_t mscclSchedulerSelectAlgo(struct mscclSavedSchedulerParam* pa
  if (status.mscclSchedulerPtr) {
    NCCLCHECK(status.mscclSchedulerPtr->selectAlgo(&(param->p)));
  } else {
-    if (param->comm->topo->mscclEnabled) {
+    if (param->comm->topo->mscclEnabled || rcclParamMscclForceEnabled()) {
      NCCLCHECK(mscclInternalSchedulerSelectAlgo(&(param->p)));
    } else {
      param->p.scheduled = false;
@@ -120,7 +120,13 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
  dump_file_path = dump_dir;
  dump_file_path += "/gpu_clock_rate_rank_";
  dump_file_path += std::to_string(rank_);
-  constexpr int vega_gpu_rtc_freq_in_khz = 25000;
+  hipDeviceProp_t devProp;
+  int vega_gpu_rtc_freq_in_khz;
+  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
+  if (devProp.gcnArch/10 == 94)
+    vega_gpu_rtc_freq_in_khz = 100000;
+  else
+    vega_gpu_rtc_freq_in_khz = 25000;
  std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz);
  auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
  gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
@@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
  send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
-  if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) {
+  if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
    CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
    send->conn.curr_hdp_reg = req.curr_hdp_reg;
  }
@@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
    return ncclInternalError;
  }
-  if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) {
+  if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
    CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
  }
@@ -21,7 +21,7 @@

 DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

-for i in {0..78}
+for i in {0..80}
 do
 	if [[ $i -eq 50 ]] || [[ $i -eq 51 ]]
 	then
@@ -151,6 +151,8 @@ NodeModelDesc model_descs[] = {
  {2, "topo_8p1h_4.xml",        "2 nodes 8P1H Alt."},
  {1, "topo_8p1h_5.xml",        "Single node 8P1H Alt."},
  {2, "topo_8p1h_5.xml",        "2 nodes 8P1H Alt."},
+  {1, "topo_8p_940.xml",        "Single node gfx940 8P"},
+  {2, "topo_4p_940.xml",        "2 nodes gfx940 4P"},
 };

 NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);