fix: disable AMD SMI for gfx1151 targets in CMake and remove a debug error from the SMI wrapper header.

Αυτή η υποβολή περιλαμβάνεται σε:
Donato Capitella
2026-02-01 11:49:27 +00:00
γονέας 3f31d17ae7
υποβολή 0586700b06
2 αρχεία άλλαξαν με 15 προσθήκες και 7 διαγραφές
@@ -46,12 +46,7 @@ option(QUIET_WARNINGS "Supress compiler warnings"
# Optional rocSHMEM integration in RCCL; off unless explicitly requested.
option(ENABLE_ROCSHMEM "Enable rocSHMEM support in RCCL" OFF)
# AMD/ROCm SMI integration; on by default.
option(ENABLE_AMDSMI "Enable AMD/ROCm SMI support" ON)

# Report the SMI setting and, when it is on, define RCCL_SMI_ENABLED for the
# sources in this directory and below.
# NOTE(review): this runs before the gfx1151 check further down the file, so
# RCCL_SMI_ENABLED may already be defined by the time SMI is forced off there.
# Confirm whether this early block is still wanted.
if(NOT ENABLE_AMDSMI)
  message(STATUS "SMI Support: DISABLED via ENABLE_AMDSMI=OFF")
else()
  message(STATUS "SMI Support: ENABLED")
  add_compile_definitions(RCCL_SMI_ENABLED)
endif()
# Default GPU architectures to build
#==================================================================================================
@@ -125,9 +120,23 @@ else()
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
# Build for the architecture list resolved into SUPPORTED_GPUS and report it.
# NOTE(review): the assignment below appears twice in a row; the second one is
# redundant as written. Confirm and drop one.
set(GPU_TARGETS "${SUPPORTED_GPUS}")
set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${GPU_TARGETS}")
# Auto-disable SMI for gfx1151, as it is not supported on that architecture.
# SMI is switched off whenever gfx1151 appears anywhere in GPU_TARGETS.
#
# The override uses a normal (non-cache) variable, which shadows the
# ENABLE_AMDSMI cache entry for this configure run only. Writing the cache with
# FORCE would overwrite the user's choice permanently: after one gfx1151
# configure, SMI would stay OFF in the same build tree even once GPU_TARGETS no
# longer contains gfx1151.
if("${GPU_TARGETS}" MATCHES "gfx1151")
  message(STATUS "Detected gfx1151 target: Forcing ENABLE_AMDSMI=OFF")
  set(ENABLE_AMDSMI OFF)
endif()

# Report the effective SMI setting and, when it is on, define RCCL_SMI_ENABLED
# for the sources in this directory and below.
if(ENABLE_AMDSMI)
  message(STATUS "SMI Support: ENABLED")
  add_compile_definitions(RCCL_SMI_ENABLED)
else()
  message(STATUS "SMI Support: DISABLED")
endif()
# NOTE: rocm-cmake is reloaded here, after GPU_TARGETS has been set, so that it
# uses the desired GPU_TARGETS instead of its defaults.
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
@@ -36,7 +36,6 @@ ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBus
ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count);
#else
#error "DEBUG_TRACE: SMI is indeed disabled, using inline functions."
// Inline stand-ins used in the #else branch, i.e. when the SMI-backed
// declarations above are not compiled in. Each one succeeds without doing any
// device work.

// Nothing to initialise; always reports success.
inline ncclResult_t rocm_smi_init() {
  return ncclSuccess;
}

// Reports zero devices through num_devs and succeeds.
inline ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs) {
  *num_devs = 0;
  return ncclSuccess;
}

// Writes an empty string into pciBusId when the buffer has room for the
// terminator (len is non-zero); deviceIndex is not consulted.
inline ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len) {
  if (len != 0) {
    pciBusId[0] = '\0';
  }
  return ncclSuccess;
}