Fix(Refactor): Switch SMI logic to whitelist (RCCL_SMI_ENABLED) and remove redundant fallback code

Bu işleme şunda yer alıyor:
Donato Capitella
2026-02-01 11:31:39 +00:00
ebeveyn 54de8024d3
işleme f227312867
3 değiştirilmiş dosya ile 6 ekleme ve 39 silme
+4 -37
Dosyayı Görüntüle
@@ -46,12 +46,11 @@ option(QUIET_WARNINGS "Supress compiler warnings"
# Build-time feature switch: rocSHMEM support in RCCL (off unless requested).
option(
  ENABLE_ROCSHMEM
  "Enable rocSHMEM support in RCCL"
  OFF
)
# Build-time feature switch: AMD/ROCm SMI support (on by default).
option(
  ENABLE_AMDSMI
  "Enable AMD/ROCm SMI support"
  ON
)
if(NOT ENABLE_AMDSMI)
message(STATUS "SMI Support: DISABLED via ENABLE_AMDSMI=OFF")
add_compile_definitions(SMI_DISABLED)
unset(USE_AMDSMI CACHE)
else()
if(ENABLE_AMDSMI)
message(STATUS "SMI Support: ENABLED")
add_compile_definitions(RCCL_SMI_ENABLED)
else()
message(STATUS "SMI Support: DISABLED via ENABLE_AMDSMI=OFF")
endif()
# Default GPU architectures to build
@@ -291,39 +290,7 @@ if(ROCM_VERSION VERSION_GREATER_EQUAL "71100" AND ENABLE_AMDSMI)
endif()
endif()
# rocm_smi fallback.
#
# Runs when SMI support is requested (ENABLE_AMDSMI) but USE_AMDSMI is not set.
# Locates rocm_smi, first through its CMake package and then through the
# legacy ${ROCM_PATH}/rocm_smi layout, and aborts configuration if either the
# include or the library directory cannot be found.
#
# On success it sets SMI_INCLUDE_DIR, SMI_LIB_DIR, SMI_LIB_NAME and
# SMI_LIBRARIES, adds the USE_ROCMSMI compile definition at directory scope,
# and probes the rocm_smi headers (HAVE_ROCM_SMI64CONFIG,
# HAVE_ROCM_SMI_THREAD_ONLY_MUTEX).
if(ENABLE_AMDSMI AND NOT USE_AMDSMI)
  ## Fallback to rocm-smi if amd-smi not found or ROCm < 7.11.0
  message(WARNING "Could not find amd_smi. Falling back to rocm_smi.")

  # Not REQUIRED on purpose: the not-found case is handled just below.
  find_package(rocm_smi PATHS "${ROCM_PATH}/lib/cmake/rocm_smi")
  if(rocm_smi_FOUND)
    set(SMI_INCLUDE_DIR "${rocm_smi_INCLUDE_DIR}" CACHE INTERNAL "rocm-smi include directory")
    set(SMI_LIB_DIR "${rocm_smi_LIB_DIR}" CACHE INTERNAL "rocm-smi library directory")
  else()
    message(WARNING "CMake could not find rocm-smi. Checking old include directory structure for rocm_smi")
    set(SMI_INCLUDE_DIR "${ROCM_PATH}/rocm_smi/include")
    set(SMI_LIB_DIR "${ROCM_PATH}/rocm_smi/lib")
  endif()

  # Both directories are needed; report both so the message identifies
  # whichever one is actually missing.
  if(NOT EXISTS "${SMI_INCLUDE_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
    message(FATAL_ERROR
      "rocm_smi not found (include directory: '${SMI_INCLUDE_DIR}', library directory: '${SMI_LIB_DIR}')")
  endif()
  message(STATUS "Found rocm_smi at ${SMI_INCLUDE_DIR}")

  set(SMI_LIB_NAME "rocm-smi-lib" CACHE INTERNAL "rocm-smi-lib for packaging")
  set(SMI_LIBRARIES rocm_smi64)
  # Same effect as add_definitions("-DUSE_ROCMSMI"), written in the form the
  # rest of this file uses for compile definitions.
  add_compile_definitions(USE_ROCMSMI)

  # Records in HAVE_ROCM_SMI64CONFIG whether the config header can be included.
  check_include_file_cxx("${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi64Config.h" HAVE_ROCM_SMI64CONFIG)

  ### Check for RSMI_INIT_FLAG_THRAD_ONLY_MUTEX support
  # Plain text search of the header for the flag name.
  # NOTE(review): the "THRAD" spelling is the exact string searched for and
  # presumably matches the upstream header; confirm before "correcting" it.
  file(READ "${SMI_INCLUDE_DIR}/rocm_smi/rocm_smi.h" rocm_smi_incl)
  string(FIND "${rocm_smi_incl}" "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX" matchres)
  if(matchres EQUAL -1)
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX not supported")
  else()
    message(STATUS "RSMI_INIT_FLAG_THRAD_ONLY_MUTEX supported")
    set(HAVE_ROCM_SMI_THREAD_ONLY_MUTEX True)
  endif()
endif()
## Check for BFD library if custom backtrace is requested
if(BUILD_BFD)
+1 -1
Dosyayı Görüntüle
@@ -7,7 +7,7 @@
#include "amd_smi/amdsmi.h"
#include "nccl.h"
#if defined(USE_AMDSMI) && !defined(SMI_DISABLED)
#if defined(USE_AMDSMI) && defined(RCCL_SMI_ENABLED)
ncclResult_t amd_smi_init();
ncclResult_t amd_smi_shutdown();
ncclResult_t amd_smi_getNumDevice(uint32_t* num_devs);
+1 -1
Dosyayı Görüntüle
@@ -29,7 +29,7 @@ THE SOFTWARE.
#endif
#include "nccl.h"
#if defined(USE_ROCMSMI) && !defined(SMI_DISABLED)
#if defined(USE_ROCMSMI) && defined(RCCL_SMI_ENABLED)
ncclResult_t rocm_smi_init();
ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs);
ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len);