Add support for additional paths in RCCL DMABUF kernel configuration loading (#1825)

* Add more candidate paths to search when loading the kernel configuration, and add an environment variable to force-enable DMABUF

---------

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>

[ROCm/rccl commit: b58f234539]
Этот коммит содержится в:
mberenjk
2025-10-20 13:35:22 -07:00
коммит произвёл GitHub
родитель ef1ed44e93
Коммит 96c62b091d
2 изменённых файлов: 68 добавлений и 40 удалений
+1
Просмотреть файл
@@ -5,6 +5,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
## Unreleased - RCCL 2.27.7 for ROCm 7.1.0
### Added
* Added `RCCL_FORCE_ENABLE_DMABUF` as a debugging feature for users who want to explicitly enable DMABUF and forgo the system/kernel checks.
* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold.
+67 -40
Просмотреть файл
@@ -20,6 +20,7 @@
DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
// Opt-in switch, read through ncclParamDmaBufEnable(). When it is 0 (and the
// force override below is not set) initOnceFunc() logs that NCCL_DMABUF_ENABLE=1
// is required and bails out of DMA-BUF setup.
// NOTE(review): the trailing 0 is presumably the default value — confirm
// against the NCCL_PARAM macro definition.
NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);
// Debug override, read through rcclParamForceEnableDMABUF()
// (RCCL_FORCE_ENABLE_DMABUF=1). When set, initOnceFunc() does not require
// DMABUF_ENABLE and skips the OS kernel-config check; the ROCr
// hsa_system_get_info query is still performed.
RCCL_PARAM(ForceEnableDMABUF, "FORCE_ENABLE_DMABUF", 0);
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN(hsa_init);
DECLARE_ROCM_PFN(hsa_system_get_info);
@@ -146,15 +147,22 @@ static void initOnceFunc() {
// Determine whether we support the cuMem APIs or not
ncclCuMemSupported = ncclIsCuMemSupported();
/* DMA-BUF support */
//ROCm support
if (ncclParamDmaBufEnable() == 0 ) {
if(rcclParamForceEnableDMABUF())
{
dmaBufSupport = 1;
WARN("DMA_BUF Support is force enabled, so explicitly setting RCCL_FORCE_ENABLE_DMABUF=1");
}
else if (ncclParamDmaBufEnable() == 0)
{
INFO(NCCL_INIT, "Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1");
goto error;
}
// ROCr checks
res = pfn_hsa_system_get_info((hsa_system_info_t) 0x204, &dmaBufSupport);
if (res != HSA_STATUS_SUCCESS || !dmaBufSupport) {
if (res != HSA_STATUS_SUCCESS || !dmaBufSupport){
INFO(NCCL_INIT, "Current version of ROCm does not support dmabuf feature.");
goto error;
}
@@ -164,45 +172,64 @@ static void initOnceFunc() {
WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
goto error;
}
else {
//check OS kernel support
struct utsname utsname;
FILE *fp = NULL;
char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
char kernel_conf_file[128];
char buf[256];
int found_opt1 = 0;
int found_opt2 = 0;
//check for kernel name exists
if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name");
//format and store the kernel conf file location
snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
fp = fopen(kernel_conf_file, "r");
if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file");
//look for kernel_opt1 and kernel_opt2 in the conf file and check
while (fgets(buf, sizeof(buf), fp) != NULL) {
if (strstr(buf, kernel_opt1) != NULL) {
found_opt1 = 1;
INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
}
if (strstr(buf, kernel_opt2) != NULL) {
found_opt2 = 1;
INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
}
}
if (!found_opt1 || !found_opt2) {
dmaBufSupport = 0;
INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
}
if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
else goto error;
}
}
//check OS kernel support
if(!rcclParamForceEnableDMABUF()) {
struct utsname utsname;
FILE *fp = NULL;
char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
char kernel_conf_file[128];
char buf[256];
int found_opt1 = 0;
int found_opt2 = 0;
//check for kernel name exists
if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name");
//format and store the kernel conf file location
const char* possiblePaths[] = {
"/proc/config.gz",
"/boot/config-%s",
"/usr/src/linux-%s/.config",
"/usr/src/linux/.config",
"/usr/lib/modules/%s/config",
"/usr/lib/ostree-boot/config-%s",
"/usr/lib/kernel/config-%s",
"/usr/src/linux-headers-%s/.config",
"/lib/modules/%s/build/.config",
};
for (const auto& path : possiblePaths) {
snprintf(kernel_conf_file, sizeof(kernel_conf_file), path, utsname.release);
fp = fopen(kernel_conf_file, "r");
if (fp != NULL){
//look for kernel_opt1 and kernel_opt2 in the conf file and check
while (fgets(buf, sizeof(buf), fp) != NULL) {
if (strstr(buf, kernel_opt1) != NULL) {
found_opt1 = 1;
INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
}
if (strstr(buf, kernel_opt2) != NULL) {
found_opt2 = 1;
INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
}
}
if (!found_opt1 || !found_opt2) {
dmaBufSupport = 0;
INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
}
if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
else goto error;
break;
}
}
if(fp == NULL) {
dmaBufSupport = 0;
INFO(NCCL_INIT,"Could not open kernel conf file");
}
}
/*
* Required to initialize the ROCr Driver.
* Multiple calls of hsa_init() will return immediately