fix: disable AMD SMI for gfx1151 targets in CMake and remove a debug error from the SMI wrapper header.
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -46,12 +46,7 @@ option(QUIET_WARNINGS "Supress compiler warnings"
|
||||
option(ENABLE_ROCSHMEM "Enable rocSHMEM support in RCCL" OFF)
|
||||
option(ENABLE_AMDSMI "Enable AMD/ROCm SMI support" ON)
|
||||
|
||||
if(ENABLE_AMDSMI)
|
||||
message(STATUS "SMI Support: ENABLED")
|
||||
add_compile_definitions(RCCL_SMI_ENABLED)
|
||||
else()
|
||||
message(STATUS "SMI Support: DISABLED via ENABLE_AMDSMI=OFF")
|
||||
endif()
|
||||
|
||||
|
||||
# Default GPU architectures to build
|
||||
#==================================================================================================
|
||||
@@ -125,9 +120,23 @@ else()
|
||||
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
|
||||
endif()
|
||||
|
||||
set(GPU_TARGETS "${SUPPORTED_GPUS}")
|
||||
set(GPU_TARGETS "${SUPPORTED_GPUS}")
|
||||
message(STATUS "Compiling for ${GPU_TARGETS}")
|
||||
|
||||
# Auto-disable SMI for gfx1151 as it is not supported.
# A plain (non-cache) variable is used on purpose: it shadows the cached
# ENABLE_AMDSMI option for the rest of this configure run, but leaves the
# user's cached value untouched. With CACHE ... FORCE the option would stay
# OFF in CMakeCache.txt even after GPU_TARGETS is changed to a list that no
# longer contains gfx1151.
if("${GPU_TARGETS}" MATCHES "gfx1151")
  message(STATUS "Detected gfx1151 target: Forcing ENABLE_AMDSMI=OFF")
  set(ENABLE_AMDSMI OFF)
endif()
|
||||
|
||||
# Report the final SMI setting; when SMI is on, also add the RCCL_SMI_ENABLED
# compile definition.
if(NOT ENABLE_AMDSMI)
  message(STATUS "SMI Support: DISABLED")
else()
  message(STATUS "SMI Support: ENABLED")
  add_compile_definitions(RCCL_SMI_ENABLED)
endif()
|
||||
|
||||
## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
|
||||
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
|
||||
|
||||
|
||||
@@ -36,7 +36,6 @@ ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBus
|
||||
ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* deviceIndex);
|
||||
ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *count);
|
||||
#else
|
||||
#error "DEBUG_TRACE: SMI is indeed disabled, using inline functions."
|
||||
inline ncclResult_t rocm_smi_init() { return ncclSuccess; }
|
||||
inline ncclResult_t rocm_smi_getNumDevice(uint32_t* num_devs) { *num_devs = 0; return ncclSuccess; }
|
||||
inline ncclResult_t rocm_smi_getDevicePciBusIdString(uint32_t deviceIndex, char* pciBusId, size_t len) { if (len > 0) pciBusId[0] = '\0'; return ncclSuccess; }
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user