add DMA_BUF support (#763)

* add DMA_BUF support

* remove unused libraries in src/init.cc

* change NCCL_ALL to NCCL_INIT

* remove extra pointer functions in transport/net.cc
This commit is contained in:
Bertan Dogancay
2023-06-01 12:46:42 -06:00
committed by GitHub
parent c62aebe882
commit d52b6c0d24
2 changed files with 47 additions and 10 deletions
-5
View File
@@ -133,8 +133,6 @@ endif()
find_package(hsa-runtime64 REQUIRED)
get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
message(STATUS "HSA runtime: ${HSA_INCLUDE_PATH}")
check_symbol_exists("HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED" "hsa/hsa.h" HAS_HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED)
message(STATUS "HSA DMABUF support: ${HAS_HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED}")
## Check for ROCM-smi
find_package(rocm_smi PATHS ${ROCM_PATH}/lib/cmake/rocm_smi)
@@ -512,9 +510,6 @@ endif()
if(TRACE)
target_compile_definitions(rccl PRIVATE ENABLE_TRACE)
endif()
if(${HAS_HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED})
target_compile_definitions(rccl PRIVATE QUERY_HSA_DMABUF)
endif()
if(${HIP_EVENT_DISABLE_FENCE})
target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
endif()
+47 -5
View File
@@ -8,8 +8,11 @@
#include "nccl.h"
#include "debug.h"
#include "rocmwrap.h"
#include "hsa/hsa.h"
#include <dlfcn.h>
#include <sys/utsname.h>
#include <fstream>
#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
@@ -25,6 +28,7 @@ static enum { hsaUninitialized, hsaInitializing, hsaInitialized, hsaError } hsaS
static void *hsaLib;
static uint16_t version_major, version_minor;
bool ncclCudaLaunchBlocking = false;
bool dmaBufSupport = false;
ncclResult_t rocmLibraryInit(void) {
do {
@@ -102,17 +106,55 @@ ncclResult_t rocmLibraryInit(void) {
//goto error;
//}
#if defined(QUERY_HSA_DMABUF)
bool supported;
res = pfn_hsa_system_get_info(HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED, &supported);
if (res == 0 && supported) {
/* DMA-BUF support */
//ROCm support
res = pfn_hsa_system_get_info((hsa_system_info_t) 0x204, &dmaBufSupport);
if (res != HSA_STATUS_SUCCESS || !dmaBufSupport) INFO(NCCL_INIT, "Current version of ROCm does not support dmabuf feature.");
else {
pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf) dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
if (pfn_hsa_amd_portable_export_dmabuf == NULL) {
WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
goto error;
}
else {
//check OS kernel support
struct utsname utsname;
FILE *fp = NULL;
char kernel_opt1[28] = "CONFIG_DMABUF_MOVE_NOTIFY=y";
char kernel_opt2[20] = "CONFIG_PCI_P2PDMA=y";
char kernel_conf_file[128];
char buf[256];
int found_opt1 = 0;
int found_opt2 = 0;
//check for kernel name exists
if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name");
//format and store the kernel conf file location
snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
fp = fopen(kernel_conf_file, "r");
if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file");
//look for kernel_opt1 and kernel_opt2 in the conf file and check
while (fgets(buf, sizeof(buf), fp) != NULL) {
if (strstr(buf, kernel_opt1) != NULL) {
found_opt1 = 1;
INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
}
if (strstr(buf, kernel_opt2) != NULL) {
found_opt2 = 1;
INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
}
}
if (!found_opt1 || !found_opt2) {
dmaBufSupport = 0;
INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
}
if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
else goto error;
}
}
#endif
/*
* Required to initialize the ROCr Driver.
* Multiple calls of hsa_init() will return immediately