diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index 4f43e0cd35..f4169c11ba 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -5,6 +5,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ## Unreleased - RCCL 2.27.7 for ROCm 7.1.0 ### Added +* `RCCL_FORCE_ENABLE_DMABUF` added as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks. * Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv. * Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages. * added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold. diff --git a/projects/rccl/src/misc/rocmwrap.cc b/projects/rccl/src/misc/rocmwrap.cc index 5888378a72..98239fe320 100644 --- a/projects/rccl/src/misc/rocmwrap.cc +++ b/projects/rccl/src/misc/rocmwrap.cc @@ -20,6 +20,7 @@ DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0); +RCCL_PARAM(ForceEnableDMABUF, "FORCE_ENABLE_DMABUF", 0); /* ROCr Driver functions loaded with dlsym() */ DECLARE_ROCM_PFN(hsa_init); DECLARE_ROCM_PFN(hsa_system_get_info); @@ -146,15 +147,22 @@ static void initOnceFunc() { // Determine whether we support the cuMem APIs or not ncclCuMemSupported = ncclIsCuMemSupported(); - /* DMA-BUF support */ //ROCm support - if (ncclParamDmaBufEnable() == 0 ) { + if(rcclParamForceEnableDMABUF()) + { + dmaBufSupport = 1; + WARN("DMA_BUF Support is force enabled, so explicitly setting RCCL_FORCE_ENABLE_DMABUF=1"); + } + else if (ncclParamDmaBufEnable() == 0) + { INFO(NCCL_INIT, "Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1"); goto error; } + + // ROCr checks res = pfn_hsa_system_get_info((hsa_system_info_t) 0x204, &dmaBufSupport); - if (res != HSA_STATUS_SUCCESS || !dmaBufSupport) { + if (res != HSA_STATUS_SUCCESS || !dmaBufSupport){ INFO(NCCL_INIT, "Current version of ROCm does not support dmabuf feature."); goto error; } @@ -164,45 +172,64 @@ static void initOnceFunc() { WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf"); goto error; } - else { - //check OS kernel support - struct utsname utsname; - FILE *fp = NULL; - char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y"; - char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y"; - char kernel_conf_file[128]; - char buf[256]; - int found_opt1 = 0; - int found_opt2 = 0; - - //check for kernel name exists - if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name"); - //format and store the kernel conf file location - snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release); - fp = fopen(kernel_conf_file, "r"); - if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file"); - //look for kernel_opt1 and kernel_opt2 in the conf file and check - while (fgets(buf, sizeof(buf), fp) != NULL) { - if (strstr(buf, kernel_opt1) != NULL) { - found_opt1 = 1; - INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release); - } - if (strstr(buf, kernel_opt2) != NULL) { - found_opt2 = 1; - INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release); - } - } - if (!found_opt1 || !found_opt2) { - dmaBufSupport = 0; - INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release); - INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support"); - } - - if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled"); - else goto error; - } } + //check OS kernel support + if(!rcclParamForceEnableDMABUF()) { + struct utsname utsname; + FILE *fp = NULL; + char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y"; + char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y"; + char kernel_conf_file[128]; + char buf[256]; + int found_opt1 = 0; + int found_opt2 = 0; + + //check for kernel name exists + if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name"); + //format and store the kernel conf file location + const char* possiblePaths[] = { + "/proc/config.gz", + "/boot/config-%s", + "/usr/src/linux-%s/.config", + "/usr/src/linux/.config", + "/usr/lib/modules/%s/config", + "/usr/lib/ostree-boot/config-%s", + "/usr/lib/kernel/config-%s", + "/usr/src/linux-headers-%s/.config", + "/lib/modules/%s/build/.config", + }; + for (const auto& path : possiblePaths) { + snprintf(kernel_conf_file, sizeof(kernel_conf_file), path, utsname.release); + fp = fopen(kernel_conf_file, "r"); + if (fp != NULL){ + //look for kernel_opt1 and kernel_opt2 in the conf file and check + while (fgets(buf, sizeof(buf), fp) != NULL) { + if (strstr(buf, kernel_opt1) != NULL) { + found_opt1 = 1; + INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release); + } + if (strstr(buf, kernel_opt2) != NULL) { + found_opt2 = 1; + INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release); + } + } + if (!found_opt1 || !found_opt2) { + dmaBufSupport = 0; + INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release); + INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support"); + } + + if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled"); + else goto error; + break; + } + } + if(fp == NULL) { + dmaBufSupport = 0; + INFO(NCCL_INIT,"Could not open kernel conf file"); + } + } /* * Required to initialize the ROCr Driver. * Multiple calls of hsa_init() will return immediately