diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ff760b83c..d56e4d177b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ find_program (GIT NAMES git) ## Setup the package version based on git tags. set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver") -get_package_version_number("6.0.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("6.1.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${VERSION_MAJOR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${VERSION_MINOR}") @@ -72,7 +72,7 @@ endif() ## Compiler flags set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17") + "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti") if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2") diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index ea945c91a0..3b5d41d72c 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -363,16 +363,16 @@ typedef rsmi_clk_type_t rsmi_clk_type; */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX = 1, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory - RSMI_COMPUTE_PARTITION_SPX = 2, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_DPX = 3, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_TPX = 4, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - RSMI_COMPUTE_PARTITION_QPX = 5, //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory + RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared 
memory + RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in docs. typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; @@ -680,8 +680,8 @@ typedef enum { */ typedef enum _RSMI_IO_LINK_TYPE { RSMI_IOLINK_TYPE_UNDEFINED = 0, //!< unknown type. - RSMI_IOLINK_TYPE_PCIEXPRESS = 1, //!< PCI Express - RSMI_IOLINK_TYPE_XGMI = 2, //!< XGMI + RSMI_IOLINK_TYPE_PCIEXPRESS, //!< PCI Express + RSMI_IOLINK_TYPE_XGMI, //!< XGMI RSMI_IOLINK_TYPE_NUMIOLINKTYPES, //!< Number of IO Link types RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF //!< Max of IO Link types } RSMI_IO_LINK_TYPE; @@ -1503,6 +1503,23 @@ rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id); */ rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id); +/** + * @brief Get the XGMI physical id associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t @p id, + * this function writes the XGMI physical id to the uint16_t pointed to by @p id. + * + * @param[in] dv_ind a device index + * + * @param[inout] id a pointer to uint16_t to which the XGMI physical id + * will be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * + */ +rsmi_status_t rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id); + + /** @} */ // end of IDQuer /*****************************************************************************/ diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index 21b8101407..5712affa87 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -103,6 +103,7 @@ enum DevInfoTypes { kDevOverDriveLevel, kDevMemOverDriveLevel, kDevDevID, + kDevXGMIPhysicalID, kDevDevRevID, kDevDevProdName, kDevDevProdNum, diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 8749b13213..325d86a3eb 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -836,6 +836,21 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { return ret; } +rsmi_status_t +rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) { + std::ostringstream ss; + rsmi_status_t ret; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); + CHK_SUPPORT_NAME_ONLY(id) + + ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id); + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret); + LOG_TRACE(ss); + return ret; +} + rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) { std::ostringstream outss; @@ -2776,9 +2791,6 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); DEVICE_MUTEX - if (odv == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } CHK_SUPPORT_NAME_ONLY(odv) rsmi_status_t ret = get_od_clk_volt_info(dv_ind, odv); diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index bd357c35d1..149cf22718 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -82,6 +82,7 @@ static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; static const char *kDevDevProdNameFName = "product_name"; static const char *kDevDevProdNumFName = 
"product_number"; static const char *kDevDevIDFName = "device"; +static const char* kDevXGMIPhysicalIDFName = "xgmi_physical_id"; static const char *kDevDevRevIDFName = "revision"; static const char *kDevVendorIDFName = "vendor"; static const char *kDevSubSysDevIDFName = "subsystem_device"; @@ -238,6 +239,7 @@ static const std::map kDevAttribNameMap = { {kDevDevProdName, kDevDevProdNameFName}, {kDevDevProdNum, kDevDevProdNumFName}, {kDevDevID, kDevDevIDFName}, + {kDevXGMIPhysicalID, kDevXGMIPhysicalIDFName}, {kDevDevRevID, kDevDevRevIDFName}, {kDevVendorID, kDevVendorIDFName}, {kDevSubSysDevID, kDevSubSysDevIDFName}, @@ -379,6 +381,7 @@ static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, {"rsmi_dev_id_get", {{kDevDevIDFName}, {}}}, + {"rsmi_dev_xgmi_physical_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, {"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}}, {"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}}, {"rsmi_dev_name_get", {{kDevVendorIDFName, @@ -956,6 +959,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevSubSysVendorID: case kDevVendorID: case kDevErrCntFeatures: + case kDevXGMIPhysicalID: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); @@ -1102,6 +1106,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevComputePartition: case kDevMemoryPartition: case kDevNumaNode: + case kDevXGMIPhysicalID: return readDevInfoStr(type, val); break; diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 44b40646d3..a948f148af 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +86,7 @@ amd::smi::RocmSMI::devInfoTypesStrings = { {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, {amd::smi::kDevDevID, amdSMI + "kDevDevID"}, + 
{amd::smi::kDevXGMIPhysicalID, amdSMI + "kDevXGMIPhysicalID"}, {amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"}, {amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"}, {amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"}, @@ -383,9 +385,28 @@ RocmSMI::Initialize(uint64_t flags) { << "\n | final update: device->bdfid() holds correct device bdf"; LOG_TRACE(ss); } - if (ret != 0) { - throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, - "Failed to initialize rocm_smi library (amdgpu node discovery)."); + + std::shared_ptr dev; + // Sort index based on the BDF, collect BDF id firstly. + std::vector>> dv_to_id; + dv_to_id.reserve(devices_.size()); + for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) { + dev = devices_[dv_ind]; + uint64_t bdfid = dev->bdfid(); + dv_to_id.push_back({bdfid, dev}); + } + ss << __PRETTY_FUNCTION__ << " Sort index based on BDF."; + LOG_DEBUG(ss); + + // Stable sort to keep the order if bdf is equal. + std::stable_sort(dv_to_id.begin(), dv_to_id.end(), [] + (const std::pair>& p1, + const std::pair>& p2) { + return p1.first < p2.first; + }); + devices_.clear(); + for (uint32_t dv_ind = 0; dv_ind < dv_to_id.size(); ++dv_ind) { + devices_.push_back(dv_to_id[dv_ind].second); } std::map> tmp_map; @@ -406,7 +427,6 @@ RocmSMI::Initialize(uint64_t flags) { for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++) io_link_map_[it->first] = it->second; - std::shared_ptr dev; // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. @@ -451,6 +471,7 @@ RocmSMI::Initialize(uint64_t flags) { if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { logSystemDetails(); } + // Leaving below to help debug temp file issues // displayAppTmpFilesContent(); std::string amdGPUDeviceList = displayAllDevicePaths(devices_);