From dfda1d6fab2f296bdcb41b7b79280a876585ae20 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Fri, 21 Jul 2023 07:31:27 -0700 Subject: [PATCH] Enable gfx94x (#808) (#816) (cherry picked from commit 94da229a7788d74685d1591a4e75a8341de64f41) [ROCm/rccl commit: a7fcd58a9754ddaa1b96a00d68b72e4d3a379e26] --- projects/rccl/CMakeLists.txt | 11 +++- projects/rccl/src/collectives/device/common.h | 17 ++++++ projects/rccl/src/graph/paths.cc | 54 +++++++++---------- projects/rccl/src/graph/rome_models.cc | 42 ++++++++++++--- projects/rccl/src/graph/search.cc | 8 +-- projects/rccl/src/graph/topo.cc | 2 +- projects/rccl/src/graph/topo.h | 12 ++++- projects/rccl/src/include/alloc.h | 30 ++++++++--- projects/rccl/src/init.cc | 20 +++++-- .../rccl/src/misc/msccl/msccl_lifecycle.cc | 3 +- projects/rccl/src/misc/npkit.cc | 8 ++- projects/rccl/src/transport/net.cc | 2 +- projects/rccl/src/transport/p2p.cc | 2 +- projects/rccl/tools/scripts/topo_val.sh | 2 +- projects/rccl/tools/topo_expl/topo_expl.cpp | 2 + 15 files changed, 159 insertions(+), 56 deletions(-) diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index 13e1fc77c3..80cc46b41f 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -17,7 +17,7 @@ option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on option(BUILD_SHARED_LIBS "Build as shared library" ON) option(BUILD_TESTS "Build unit test programs" OFF) option(COLLTRACE "Collective Trace Option" ON) -option(ENABLE_IFC "Enable indirect function call" ON) +option(ENABLE_IFC "Enable indirect function call" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) option(PROFILE "Enable profiling" OFF) option(TIMETRACE "Enable time-trace during compilation" OFF) @@ -32,6 +32,9 @@ set(DEFAULT_GPUS gfx908:xnack- gfx90a:xnack- gfx90a:xnack+ + gfx940 + gfx941 + gfx942 gfx1030 gfx1100 gfx1101 @@ -118,6 +121,9 @@ message(STATUS "hipcc version: ${hipcc_version_string}") ### Check for hipEventDisableSystemFence support check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE) +### Check for hipDeviceMallocUncached support +check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY) + ### Check for indirect function call support if(ENABLE_IFC) if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201") @@ -523,6 +529,9 @@ endif() if(${HIP_EVENT_DISABLE_FENCE}) target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE) endif() +if(${HIP_UNCACHED_MEMORY}) + target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY) +endif() if (BUILD_BFD) if (HAVE_BFD) target_compile_definitions(rccl PRIVATE HAVE_BFD) diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index d4c89343f2..f645e97ae2 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -28,10 +28,17 @@ { __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); } #endif +#ifdef ENABLE_LL128 #define NCCL_FUNC5(func, algo, devredop, type, nullify) \ MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \ MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) +#else +#define NCCL_FUNC5(func, algo, devredop, type, nullify) \ + MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ + MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \ + MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type)) +#endif #define NCCL_FUNC4(func, devredop, type, nullify) \ NCCL_FUNC5(func, TREE, devredop, type, nullify), \ @@ -542,7 +549,11 @@ __forceinline__ __device__ void ncclKernel( #ifdef USE_INDIRECT_FUNCTION_CALL ncclFuncs[ncclShmem.work.header.funcIndex](); #else +#ifdef ENABLE_LL128 NCCL_CALL_FUNCTIONS<1>(ncclShmem.work.header.funcIndex); +#else + NCCL_CALL_FUNCTIONS<0>(ncclShmem.work.header.funcIndex); +#endif #endif } @@ -607,10 +618,16 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev #endif // Only generate inline kernels for LL +#ifdef ENABLE_LL128 #define IMPL_COLL4(func, algo, devredop, type) \ IMPL_COLL_FUNC(func, algo, LL, devredop, type) \ IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \ IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) +#else +#define IMPL_COLL4(func, algo, devredop, type) \ + IMPL_COLL_FUNC(func, algo, LL, devredop, type) \ + IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type) +#endif #define IMPL_COLL3(func, devredop, type) \ IMPL_COLL4(func, TREE, devredop, type) \ diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc index cb2bf81b31..5df512b108 100644 --- a/projects/rccl/src/graph/paths.cc +++ b/projects/rccl/src/graph/paths.cc @@ -700,36 +700,32 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* } while (system->nodes[NET].count); int remove = 1; - int arch, vendor, model; - NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); - if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD - && model == NCCL_TOPO_CPU_TYPE_ROME) { - int gdr = 1; - bool allXgmi = true; - // detect if all GPUs are connected by XGMI - for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) { - int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev; - for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) { - if (i == j) continue; - int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev; - bool isXGMI; - NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI)); - allXgmi &= isXGMI; - } - } - if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL; - for (int g = 0; g < system->nodes[GPU].count; g++) { - int net; - NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net)); - NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); - if (!gdr) break; - } - if (gdr && !allXgmi) { - remove = 0; - system->type |= RCCL_TOPO_GDR_ALL; - INFO(NCCL_GRAPH, "GDR is available on all GPUs"); + int gdr = 1; + bool allXgmi = true; + // detect if all GPUs are connected by XGMI + for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) { + int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev; + for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) { + if (i == j) continue; + int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev; + bool isXGMI; + NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI)); + allXgmi &= isXGMI; } } + if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL; + for (int g = 0; g < system->nodes[GPU].count; g++) { + int net; + NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net)); + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); + if (!gdr) break; + } + if (gdr && !allXgmi) { + remove = 0; + system->type |= RCCL_TOPO_GDR_ALL; + INFO(NCCL_GRAPH, "GDR is available on all GPUs"); + } + if (rcclParamEnableIntranet()) { remove = 0; system->type |= RCCL_TOPO_FORCE_INTRA; @@ -812,7 +808,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(comm->topo, &arch, &vendor, &model)); - if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { + if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) { // Adjust P2P channels on Intel platform comm->p2pnChannelsPerPeer = 1; comm->p2pnChannels = 2; diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc index f5ce15b168..e0e5c41948 100644 --- a/projects/rccl/src/graph/rome_models.cc +++ b/projects/rccl/src/graph/rome_models.cc @@ -578,6 +578,32 @@ static struct rcclRomeModel rome_model_76 = { .treeBase = "", }; +static struct rcclRomeModel rome_model_79 = { + .nGpus = 8, .nCpus = 2, .nNics = 0, .nLinks = 7, + .gpuIds = { 0x1d000, 0x2e000, 0x3f000, 0x61000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, }, + .nicIds = { }, + .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, }, + .nicNuma = { }, + .connMatrix = { 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, }, + .gdrLevel = { }, + .pattern = "4040", + .ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0", + .options = "noCpuCheck=1", +}; + +static struct rcclRomeModel rome_model_80 = { + .nGpus = 4, .nCpus = 4, .nNics = 4, .nLinks = 3, + .gpuIds = { 0x82000, 0xc2000, 0x2000, 0x42000, }, + .nicIds = { 0x81000, 0xc1000, 0x1000, 0x41000, }, + .gpuNuma = { 2, 3, 0, 1, }, + .nicNuma = { 2, 3, 0, 1, }, + .connMatrix = { 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, }, + .gdrLevel = { PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, }, + .pattern = "11111111", + .ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1", + .options = "", +}; + static struct rcclRomeModel romeTopoModels[] = { rome_model_22, rome_model_25, @@ -617,6 +643,8 @@ static struct rcclRomeModel romeTopoModels[] = { rome_model_73, rome_model_74, rome_model_76, + rome_model_79, + rome_model_80, }; /* Parse user defined rings. Format is like : @@ -850,7 +878,7 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) { } } -static bool disableNumaMatching(const char *options) { +static bool checkOption(const char *options, const char *name) { if (strcmp(options, "")) { char *str_temp = (char *)malloc(strlen(options) + 1); strcpy(str_temp, options); @@ -862,7 +890,7 @@ static bool disableNumaMatching(const char *options) { while (tokens[numTokens-1] != NULL && numTokens < MAX_OPT_TOKENS) tokens[numTokens++] = strtok_r(NULL, "=, ", &state); for (int i = 0; i < numTokens/2; i++) { - if (strcmp(tokens[i*2], "disableNumaMatching") == 0) { + if (strcmp(tokens[i*2], name) == 0) { return (bool)atol(tokens[i*2+1]); } } @@ -1202,8 +1230,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* // only valid on Rome int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); - if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME) - return ncclSuccess; // number of GPUs and NICs on each numa node is used as first screening pattern struct rcclRomeModel romeTopo; @@ -1229,7 +1255,10 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* if (i < romeTopo.nGpus) match_nbio = false; for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) { - bool ignore_numa = disableNumaMatching(romeTopoModels[i].options); + bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck"); + if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)) + continue; + bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching"); if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue; if (romeTopo.nGpus != romeTopoModels[i].nGpus || romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue; @@ -1501,7 +1530,8 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap } } INFO(NCCL_GRAPH, "%s", line); - system->type |= RCCL_TOPO_4P2H_ROME; + if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) + system->type |= RCCL_TOPO_4P2H_ROME; parseOptions(system, rome_model_68.options); // create 4P4H based on reference and remapped ids NCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives)); diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc index 45364fb8ba..ad6ff7c1d9 100644 --- a/projects/rccl/src/graph/search.cc +++ b/projects/rccl/src/graph/search.cc @@ -837,8 +837,8 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs } #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) -float speedArrayIntra[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; -float speedArrayInter[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float speedArrayIntra[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float speedArrayInter[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) #else @@ -890,7 +890,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph if (str) { // user supplied topo NCCLCHECK(parseGraph(str, system, graph, NULL, NULL)); - if (graph->nChannels) { + int arch, vendor, model; + NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); + if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) { system->type |= RCCL_TOPO_4P2H_ROME; } } else if (!rcclParamModelMatchingDisable() && !graph->collNet) { diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc index bd4c75310f..f246630fce 100644 --- a/projects/rccl/src/graph/topo.cc +++ b/projects/rccl/src/graph/topo.cc @@ -380,7 +380,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s return ncclSuccess; } -struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; +struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } }; struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h index cc995d4ccf..50fd6275df 100644 --- a/projects/rccl/src/graph/topo.h +++ b/projects/rccl/src/graph/topo.h @@ -27,6 +27,7 @@ #define NET_BW 12.0 // 100Gbit #define VEGA_XGMI_WIDTH 24.0 #define MI200_XGMI_WIDTH 36.0 +#define GFX94X_XGMI_WIDTH 48.0 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth. @@ -223,7 +224,16 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in // Returns XGMI speed in GB/s static float ncclTopoXGMISpeed(int gcn) { - return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH; + switch (gcn) { + case 910: + return MI200_XGMI_WIDTH; + case 940: + case 941: + case 942: + return GFX94X_XGMI_WIDTH; + default: + return VEGA_XGMI_WIDTH; + } } #if ENABLE_COLLTRACE diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h index 4f47be44d9..9579044bf3 100644 --- a/projects/rccl/src/include/alloc.h +++ b/projects/rccl/src/include/alloc.h @@ -165,9 +165,15 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (isFineGrain) + if (isFineGrain) { +#if defined(HIP_UNCACHED_MEMORY) + hipDeviceProp_t prop; + CUDACHECK(hipGetDeviceProperties(&prop, 0)); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish); +#else CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); - else +#endif + } else CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -187,9 +193,15 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t cudaStream_t stream = sideStream; if (stream == nullptr) CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - if (isFineGrain) + if (isFineGrain) { +#if defined(HIP_UNCACHED_MEMORY) + hipDeviceProp_t prop; + CUDACHECK(hipGetDeviceProperties(&prop, 0)); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish); +#else CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); - else +#endif + } else CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); @@ -215,9 +227,15 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; *ptr = nullptr; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (isFineGrain) + if (isFineGrain) { +#if defined(HIP_UNCACHED_MEMORY) + hipDeviceProp_t prop; + CUDACHECK(hipGetDeviceProperties(&prop, 0)); + CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish); +#else CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish); - else +#endif + } else CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); int dev; diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 9778cb7bda..baaa64ea6d 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -171,6 +171,13 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0); void *ncclCommThreadMain(void *arg) { ncclComm_t comm = (ncclComm_t)arg; int head = 0; + hipDeviceProp_t devProp; + double vega_gpu_rtc_freq; + hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev); + if (devProp.gcnArch/10 == 94 && status == hipSuccess) + vega_gpu_rtc_freq = 1.0E8; + else + vega_gpu_rtc_freq = 2.5E7; #define MAX_NAME_LENGTH 64 char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2)); for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) { @@ -213,16 +220,15 @@ void *ncclCommThreadMain(void *arg) { char line[1024]; int offset = 0; uint16_t fIdx = td->funcIndex; - #define VEGA_GPU_RTC_FREQUENCY 2.5E7 if (type == ncclCollTraceDataType) { sprintf(line, "## [%012.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx", - (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, + (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, fIdx, td->data_0, td->opCount, td->data_1); } else { if (fIdx == FUNC_INDEX_P2P || type == ncclCollTraceP2pElemType) - sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]); + sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]); else - sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount); + sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->opCount); offset = strlen(line); if (type == ncclCollTraceCollElemType) { sprintf(line+offset, " CE %s nw %d bi %d nc %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx, td->coll.nWarps, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks); @@ -705,7 +711,13 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u // detect if fine grained memory is available on this GPU int *ptr; +#if defined(HIP_UNCACHED_MEMORY) + hipDeviceProp_t prop; + CUDACHECK(hipGetDeviceProperties(&prop, 0)); + if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained) == hipSuccess) { +#else if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) { +#endif CUDACHECK(hipFree(ptr)); info->hasFineGrain = true; NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport)); diff --git a/projects/rccl/src/misc/msccl/msccl_lifecycle.cc b/projects/rccl/src/misc/msccl/msccl_lifecycle.cc index 8424b21aa1..885e7d0bd9 100644 --- a/projects/rccl/src/misc/msccl/msccl_lifecycle.cc +++ b/projects/rccl/src/misc/msccl/msccl_lifecycle.cc @@ -23,6 +23,7 @@ #include "msccl/msccl_status.h" RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1); +RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0); static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH"; static std::atomic mscclInitialized; static bool mscclSchedulerTriedLoadAlgo = false; @@ -257,7 +258,7 @@ static ncclResult_t mscclSchedulerSelectAlgo(struct mscclSavedSchedulerParam* pa if (status.mscclSchedulerPtr) { NCCLCHECK(status.mscclSchedulerPtr->selectAlgo(&(param->p))); } else { - if (param->comm->topo->mscclEnabled) { + if (param->comm->topo->mscclEnabled || rcclParamMscclForceEnabled()) { NCCLCHECK(mscclInternalSchedulerSelectAlgo(&(param->p))); } else { param->p.scheduled = false; diff --git a/projects/rccl/src/misc/npkit.cc b/projects/rccl/src/misc/npkit.cc index 6ef83f0a40..0302fbd92b 100644 --- a/projects/rccl/src/misc/npkit.cc +++ b/projects/rccl/src/misc/npkit.cc @@ -120,7 +120,13 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) { dump_file_path = dump_dir; dump_file_path += "/gpu_clock_rate_rank_"; dump_file_path += std::to_string(rank_); - constexpr int vega_gpu_rtc_freq_in_khz = 25000; + hipDeviceProp_t devProp; + int vega_gpu_rtc_freq_in_khz; + CUDACHECK(hipGetDeviceProperties(&devProp, 0)); + if (devProp.gcnArch/10 == 94) + vega_gpu_rtc_freq_in_khz = 100000; + else + vega_gpu_rtc_freq_in_khz = 25000; std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz); auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out); gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length()); diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc index 273858cd02..274006d97f 100644 --- a/projects/rccl/src/transport/net.cc +++ b/projects/rccl/src/transport/net.cc @@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; - if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) { + if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) { CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); send->conn.curr_hdp_reg = req.curr_hdp_reg; } diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc index 460b4bf4e9..fd9e67c72d 100644 --- a/projects/rccl/src/transport/p2p.cc +++ b/projects/rccl/src/transport/p2p.cc @@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank); return ncclInternalError; } - if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) { + if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) { CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev)); TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg); } diff --git a/projects/rccl/tools/scripts/topo_val.sh b/projects/rccl/tools/scripts/topo_val.sh index 1dc06dd074..14c2cf2831 100755 --- a/projects/rccl/tools/scripts/topo_val.sh +++ b/projects/rccl/tools/scripts/topo_val.sh @@ -21,7 +21,7 @@ DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -for i in {0..78} +for i in {0..80} do if [[ $i -eq 50 ]] || [[ $i -eq 51 ]] then diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp index ea636a5492..1430e76f70 100644 --- a/projects/rccl/tools/topo_expl/topo_expl.cpp +++ b/projects/rccl/tools/topo_expl/topo_expl.cpp @@ -151,6 +151,8 @@ NodeModelDesc model_descs[] = { {2, "topo_8p1h_4.xml", "2 nodes 8P1H Alt."}, {1, "topo_8p1h_5.xml", "Single node 8P1H Alt."}, {2, "topo_8p1h_5.xml", "2 nodes 8P1H Alt."}, + {1, "topo_8p_940.xml", "Single node gfx940 8P"}, + {2, "topo_4p_940.xml", "2 nodes gfx940 4P"}, }; NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);