Enable gfx94x (#808) (#816)

(cherry picked from commit 94da229a7788d74685d1591a4e75a8341de64f41)

[ROCm/rccl commit: a7fcd58a97]
This commit is contained in:
Wenkai Du
2023-07-21 07:31:27 -07:00
کامیت شده توسط GitHub
والد f4b106ec94
کامیت dfda1d6fab
15فایلهای تغییر یافته به همراه159 افزوده شده و 56 حذف شده
+10 -1
مشاهده پرونده
@@ -17,7 +17,7 @@ option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on
option(BUILD_SHARED_LIBS "Build as shared library" ON)
option(BUILD_TESTS "Build unit test programs" OFF)
option(COLLTRACE "Collective Trace Option" ON)
option(ENABLE_IFC "Enable indirect function call" ON)
option(ENABLE_IFC "Enable indirect function call" OFF)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
option(PROFILE "Enable profiling" OFF)
option(TIMETRACE "Enable time-trace during compilation" OFF)
@@ -32,6 +32,9 @@ set(DEFAULT_GPUS
gfx908:xnack-
gfx90a:xnack-
gfx90a:xnack+
gfx940
gfx941
gfx942
gfx1030
gfx1100
gfx1101
@@ -118,6 +121,9 @@ message(STATUS "hipcc version: ${hipcc_version_string}")
### Check for hipEventDisableSystemFence support
check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
### Check for hipDeviceMallocUncached support
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
### Check for indirect function call support
if(ENABLE_IFC)
if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -523,6 +529,9 @@ endif()
if(${HIP_EVENT_DISABLE_FENCE})
target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
endif()
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
endif()
if (BUILD_BFD)
if (HAVE_BFD)
target_compile_definitions(rccl PRIVATE HAVE_BFD)
@@ -28,10 +28,17 @@
{ __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
#endif
#ifdef ENABLE_LL128
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#else
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#endif
#define NCCL_FUNC4(func, devredop, type, nullify) \
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
@@ -542,7 +549,11 @@ __forceinline__ __device__ void ncclKernel(
#ifdef USE_INDIRECT_FUNCTION_CALL
ncclFuncs[ncclShmem.work.header.funcIndex]();
#else
#ifdef ENABLE_LL128
NCCL_CALL_FUNCTIONS<1>(ncclShmem.work.header.funcIndex);
#else
NCCL_CALL_FUNCTIONS<0>(ncclShmem.work.header.funcIndex);
#endif
#endif
}
@@ -607,10 +618,16 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
#endif
// Only generate inline kernels for LL
#ifdef ENABLE_LL128
#define IMPL_COLL4(func, algo, devredop, type) \
IMPL_COLL_FUNC(func, algo, LL, devredop, type) \
IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
#else
#define IMPL_COLL4(func, algo, devredop, type) \
IMPL_COLL_FUNC(func, algo, LL, devredop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
#endif
#define IMPL_COLL3(func, devredop, type) \
IMPL_COLL4(func, TREE, devredop, type) \
@@ -700,36 +700,32 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
} while (system->nodes[NET].count);
int remove = 1;
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
&& model == NCCL_TOPO_CPU_TYPE_ROME) {
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if (i == j) continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
for (int g = 0; g < system->nodes[GPU].count; g++) {
int net;
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if (!gdr) break;
}
if (gdr && !allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
if (i == j) continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
allXgmi &= isXGMI;
}
}
if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
for (int g = 0; g < system->nodes[GPU].count; g++) {
int net;
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &net));
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
if (!gdr) break;
}
if (gdr && !allXgmi) {
remove = 0;
system->type |= RCCL_TOPO_GDR_ALL;
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
}
if (rcclParamEnableIntranet()) {
remove = 0;
system->type |= RCCL_TOPO_FORCE_INTRA;
@@ -812,7 +808,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(comm->topo, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
// Adjust P2P channels on Intel platform
comm->p2pnChannelsPerPeer = 1;
comm->p2pnChannels = 2;
@@ -578,6 +578,32 @@ static struct rcclRomeModel rome_model_76 = {
.treeBase = "",
};
static struct rcclRomeModel rome_model_79 = {
.nGpus = 8, .nCpus = 2, .nNics = 0, .nLinks = 7,
.gpuIds = { 0x1d000, 0x2e000, 0x3f000, 0x61000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, },
.nicIds = { },
.gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, },
.nicNuma = { },
.connMatrix = { 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, },
.gdrLevel = { },
.pattern = "4040",
.ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0",
.options = "noCpuCheck=1",
};
static struct rcclRomeModel rome_model_80 = {
.nGpus = 4, .nCpus = 4, .nNics = 4, .nLinks = 3,
.gpuIds = { 0x82000, 0xc2000, 0x2000, 0x42000, },
.nicIds = { 0x81000, 0xc1000, 0x1000, 0x41000, },
.gpuNuma = { 2, 3, 0, 1, },
.nicNuma = { 2, 3, 0, 1, },
.connMatrix = { 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, },
.gdrLevel = { PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, },
.pattern = "11111111",
.ringBase = "N2 2 3 0 1 N1|N0 0 1 3 2 N2|N0 0 2 1 3 N3|N3 3 1 0 2 N2|N3 3 1 2 0 N0|N1 1 0 3 2 N2|N1 1 2 3 0 N0|N2 2 0 1 3 N3|N3 3 0 2 1 N1|N2 2 3 1 0 N0|N1 1 2 0 3 N3|N0 0 3 2 1 N1",
.options = "",
};
static struct rcclRomeModel romeTopoModels[] = {
rome_model_22,
rome_model_25,
@@ -617,6 +643,8 @@ static struct rcclRomeModel romeTopoModels[] = {
rome_model_73,
rome_model_74,
rome_model_76,
rome_model_79,
rome_model_80,
};
/* Parse user defined rings. Format is like :
@@ -850,7 +878,7 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) {
}
}
static bool disableNumaMatching(const char *options) {
static bool checkOption(const char *options, const char *name) {
if (strcmp(options, "")) {
char *str_temp = (char *)malloc(strlen(options) + 1);
strcpy(str_temp, options);
@@ -862,7 +890,7 @@ static bool disableNumaMatching(const char *options) {
while (tokens[numTokens-1] != NULL && numTokens < MAX_OPT_TOKENS)
tokens[numTokens++] = strtok_r(NULL, "=, ", &state);
for (int i = 0; i < numTokens/2; i++) {
if (strcmp(tokens[i*2], "disableNumaMatching") == 0) {
if (strcmp(tokens[i*2], name) == 0) {
return (bool)atol(tokens[i*2+1]);
}
}
@@ -1202,8 +1230,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
// only valid on Rome
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
return ncclSuccess;
// number of GPUs and NICs on each numa node is used as first screening pattern
struct rcclRomeModel romeTopo;
@@ -1229,7 +1255,10 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
if (i < romeTopo.nGpus) match_nbio = false;
for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
bool ignore_numa = disableNumaMatching(romeTopoModels[i].options);
bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME))
continue;
bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue;
if (romeTopo.nGpus != romeTopoModels[i].nGpus ||
romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
@@ -1501,7 +1530,8 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
}
}
INFO(NCCL_GRAPH, "%s", line);
system->type |= RCCL_TOPO_4P2H_ROME;
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME)
system->type |= RCCL_TOPO_4P2H_ROME;
parseOptions(system, rome_model_68.options);
// create 4P4H based on reference and remapped ids
NCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, g_hives, n_hives));
@@ -837,8 +837,8 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
float speedArrayIntra[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float speedArrayInter[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float speedArrayIntra[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
float speedArrayInter[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
#else
@@ -890,7 +890,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (str) {
// user supplied topo
NCCLCHECK(parseGraph(str, system, graph, NULL, NULL));
if (graph->nChannels) {
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
system->type |= RCCL_TOPO_4P2H_ROME;
}
} else if (!rcclParamModelMatchingDisable() && !graph->collNet) {
@@ -380,7 +380,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
return ncclSuccess;
}
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } };
struct kvDict kvDictPciGen[] = {
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
+11 -1
مشاهده پرونده
@@ -27,6 +27,7 @@
#define NET_BW 12.0 // 100Gbit
#define VEGA_XGMI_WIDTH 24.0
#define MI200_XGMI_WIDTH 36.0
#define GFX94X_XGMI_WIDTH 48.0
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
@@ -223,7 +224,16 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
// Returns XGMI speed in GB/s
static float ncclTopoXGMISpeed(int gcn) {
return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH;
switch (gcn) {
case 910:
return MI200_XGMI_WIDTH;
case 940:
case 941:
case 942:
return GFX94X_XGMI_WIDTH;
default:
return VEGA_XGMI_WIDTH;
}
}
#if ENABLE_COLLTRACE
@@ -165,9 +165,15 @@ ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (isFineGrain)
if (isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
hipDeviceProp_t prop;
CUDACHECK(hipGetDeviceProperties(&prop, 0));
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
#else
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
else
#endif
} else
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
finish:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
@@ -187,9 +193,15 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
cudaStream_t stream = sideStream;
if (stream == nullptr)
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
if (isFineGrain)
if (isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
hipDeviceProp_t prop;
CUDACHECK(hipGetDeviceProperties(&prop, 0));
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
#else
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
else
#endif
} else
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
@@ -215,9 +227,15 @@ ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, s
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (isFineGrain)
if (isFineGrain) {
#if defined(HIP_UNCACHED_MEMORY)
hipDeviceProp_t prop;
CUDACHECK(hipGetDeviceProperties(&prop, 0));
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained), result, finish);
#else
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
else
#endif
} else
CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
int dev;
+16 -4
مشاهده پرونده
@@ -171,6 +171,13 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
void *ncclCommThreadMain(void *arg) {
ncclComm_t comm = (ncclComm_t)arg;
int head = 0;
hipDeviceProp_t devProp;
double vega_gpu_rtc_freq;
hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev);
if (devProp.gcnArch/10 == 94 && status == hipSuccess)
vega_gpu_rtc_freq = 1.0E8;
else
vega_gpu_rtc_freq = 2.5E7;
#define MAX_NAME_LENGTH 64
char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
@@ -213,16 +220,15 @@ void *ncclCommThreadMain(void *arg) {
char line[1024];
int offset = 0;
uint16_t fIdx = td->funcIndex;
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
if (type == ncclCollTraceDataType) {
sprintf(line, "## [%012.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
(double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid,
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid,
fIdx, td->data_0, td->opCount, td->data_1);
} else {
if (fIdx == FUNC_INDEX_P2P || type == ncclCollTraceP2pElemType)
sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]);
sprintf(line, "## [%012.6f] [%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->p2pOpCount[0], td->p2pOpCount[1]);
else
sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/VEGA_GPU_RTC_FREQUENCY, comm->rank, td->bid, td->opCount);
sprintf(line, "## [%012.6f] [%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->opCount);
offset = strlen(line);
if (type == ncclCollTraceCollElemType) {
sprintf(line+offset, " CE %s nw %d bi %d nc %d busId %lx nRanks %d", func_names+MAX_NAME_LENGTH*fIdx, td->coll.nWarps, td->coll.bid, td->coll.nChannels, comm->busId, comm->nRanks);
@@ -705,7 +711,13 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
// detect if fine grained memory is available on this GPU
int *ptr;
#if defined(HIP_UNCACHED_MEMORY)
hipDeviceProp_t prop;
CUDACHECK(hipGetDeviceProperties(&prop, 0));
if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), prop.gcnArch/10 == 94 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained) == hipSuccess) {
#else
if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
#endif
CUDACHECK(hipFree(ptr));
info->hasFineGrain = true;
NCCLCHECK(ncclGpuGdrSupport(comm, &info->gdrSupport));
@@ -23,6 +23,7 @@
#include "msccl/msccl_status.h"
RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";
static std::atomic<bool> mscclInitialized;
static bool mscclSchedulerTriedLoadAlgo = false;
@@ -257,7 +258,7 @@ static ncclResult_t mscclSchedulerSelectAlgo(struct mscclSavedSchedulerParam* pa
if (status.mscclSchedulerPtr) {
NCCLCHECK(status.mscclSchedulerPtr->selectAlgo(&(param->p)));
} else {
if (param->comm->topo->mscclEnabled) {
if (param->comm->topo->mscclEnabled || rcclParamMscclForceEnabled()) {
NCCLCHECK(mscclInternalSchedulerSelectAlgo(&(param->p)));
} else {
param->p.scheduled = false;
@@ -120,7 +120,13 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
dump_file_path = dump_dir;
dump_file_path += "/gpu_clock_rate_rank_";
dump_file_path += std::to_string(rank_);
constexpr int vega_gpu_rtc_freq_in_khz = 25000;
hipDeviceProp_t devProp;
int vega_gpu_rtc_freq_in_khz;
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
if (devProp.gcnArch/10 == 94)
vega_gpu_rtc_freq_in_khz = 100000;
else
vega_gpu_rtc_freq_in_khz = 25000;
std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz);
auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
@@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) {
if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
send->conn.curr_hdp_reg = req.curr_hdp_reg;
}
@@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
return ncclInternalError;
}
if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) {
if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
}
@@ -21,7 +21,7 @@
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for i in {0..78}
for i in {0..80}
do
if [[ $i -eq 50 ]] || [[ $i -eq 51 ]]
then
@@ -151,6 +151,8 @@ NodeModelDesc model_descs[] = {
{2, "topo_8p1h_4.xml", "2 nodes 8P1H Alt."},
{1, "topo_8p1h_5.xml", "Single node 8P1H Alt."},
{2, "topo_8p1h_5.xml", "2 nodes 8P1H Alt."},
{1, "topo_8p_940.xml", "Single node gfx940 8P"},
{2, "topo_4p_940.xml", "2 nodes gfx940 4P"},
};
NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);