Add support for additional paths in RCCL DMABUF kernel configuration loading (#1825)
* Adding more path to the kernel load and an environment variable to force enable DMABUF
---------
Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
[ROCm/rccl commit: b58f234539]
Этот коммит содержится в:
@@ -5,6 +5,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
|
||||
## Unreleased - RCCL 2.27.7 for ROCm 7.1.0
|
||||
|
||||
### Added
|
||||
* `RCCL_FORCE_ENABLE_DMABUF` added as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks.
|
||||
* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
|
||||
* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
|
||||
* added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold.
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
|
||||
NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);
|
||||
RCCL_PARAM(ForceEnableDMABUF, "FORCE_ENABLE_DMABUF", 0);
|
||||
/* ROCr Driver functions loaded with dlsym() */
|
||||
DECLARE_ROCM_PFN(hsa_init);
|
||||
DECLARE_ROCM_PFN(hsa_system_get_info);
|
||||
@@ -146,15 +147,22 @@ static void initOnceFunc() {
|
||||
|
||||
// Determine whether we support the cuMem APIs or not
|
||||
ncclCuMemSupported = ncclIsCuMemSupported();
|
||||
|
||||
/* DMA-BUF support */
|
||||
//ROCm support
|
||||
if (ncclParamDmaBufEnable() == 0 ) {
|
||||
if(rcclParamForceEnableDMABUF())
|
||||
{
|
||||
dmaBufSupport = 1;
|
||||
WARN("DMA_BUF Support is force enabled, so explicitly setting RCCL_FORCE_ENABLE_DMABUF=1");
|
||||
}
|
||||
else if (ncclParamDmaBufEnable() == 0)
|
||||
{
|
||||
INFO(NCCL_INIT, "Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1");
|
||||
goto error;
|
||||
}
|
||||
|
||||
// ROCr checks
|
||||
res = pfn_hsa_system_get_info((hsa_system_info_t) 0x204, &dmaBufSupport);
|
||||
if (res != HSA_STATUS_SUCCESS || !dmaBufSupport) {
|
||||
if (res != HSA_STATUS_SUCCESS || !dmaBufSupport){
|
||||
INFO(NCCL_INIT, "Current version of ROCm does not support dmabuf feature.");
|
||||
goto error;
|
||||
}
|
||||
@@ -164,45 +172,64 @@ static void initOnceFunc() {
|
||||
WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
|
||||
goto error;
|
||||
}
|
||||
else {
|
||||
//check OS kernel support
|
||||
struct utsname utsname;
|
||||
FILE *fp = NULL;
|
||||
char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
|
||||
char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
|
||||
char kernel_conf_file[128];
|
||||
char buf[256];
|
||||
int found_opt1 = 0;
|
||||
int found_opt2 = 0;
|
||||
|
||||
//check for kernel name exists
|
||||
if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name");
|
||||
//format and store the kernel conf file location
|
||||
snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
|
||||
fp = fopen(kernel_conf_file, "r");
|
||||
if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file");
|
||||
//look for kernel_opt1 and kernel_opt2 in the conf file and check
|
||||
while (fgets(buf, sizeof(buf), fp) != NULL) {
|
||||
if (strstr(buf, kernel_opt1) != NULL) {
|
||||
found_opt1 = 1;
|
||||
INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
|
||||
}
|
||||
if (strstr(buf, kernel_opt2) != NULL) {
|
||||
found_opt2 = 1;
|
||||
INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
|
||||
}
|
||||
}
|
||||
if (!found_opt1 || !found_opt2) {
|
||||
dmaBufSupport = 0;
|
||||
INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
|
||||
INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
|
||||
}
|
||||
|
||||
if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
|
||||
else goto error;
|
||||
}
|
||||
}
|
||||
|
||||
//check OS kernel support
|
||||
if(!rcclParamForceEnableDMABUF()) {
|
||||
struct utsname utsname;
|
||||
FILE *fp = NULL;
|
||||
char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
|
||||
char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
|
||||
char kernel_conf_file[128];
|
||||
char buf[256];
|
||||
int found_opt1 = 0;
|
||||
int found_opt2 = 0;
|
||||
|
||||
//check for kernel name exists
|
||||
if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name");
|
||||
//format and store the kernel conf file location
|
||||
const char* possiblePaths[] = {
|
||||
"/proc/config.gz",
|
||||
"/boot/config-%s",
|
||||
"/usr/src/linux-%s/.config",
|
||||
"/usr/src/linux/.config",
|
||||
"/usr/lib/modules/%s/config",
|
||||
"/usr/lib/ostree-boot/config-%s",
|
||||
"/usr/lib/kernel/config-%s",
|
||||
"/usr/src/linux-headers-%s/.config",
|
||||
"/lib/modules/%s/build/.config",
|
||||
};
|
||||
for (const auto& path : possiblePaths) {
|
||||
snprintf(kernel_conf_file, sizeof(kernel_conf_file), path, utsname.release);
|
||||
fp = fopen(kernel_conf_file, "r");
|
||||
if (fp != NULL){
|
||||
//look for kernel_opt1 and kernel_opt2 in the conf file and check
|
||||
while (fgets(buf, sizeof(buf), fp) != NULL) {
|
||||
if (strstr(buf, kernel_opt1) != NULL) {
|
||||
found_opt1 = 1;
|
||||
INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
|
||||
}
|
||||
if (strstr(buf, kernel_opt2) != NULL) {
|
||||
found_opt2 = 1;
|
||||
INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
|
||||
}
|
||||
}
|
||||
if (!found_opt1 || !found_opt2) {
|
||||
dmaBufSupport = 0;
|
||||
INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
|
||||
INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
|
||||
}
|
||||
|
||||
if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
|
||||
else goto error;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(fp == NULL) {
|
||||
dmaBufSupport = 0;
|
||||
INFO(NCCL_INIT,"Could not open kernel conf file");
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Required to initialize the ROCr Driver.
|
||||
* Multiple calls of hsa_init() will return immediately
|
||||
|
||||
Ссылка в новой задаче
Block a user