From 7d629c5959429e967977209c640118bcd8c132dc Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 2 Nov 2023 18:26:00 -0500 Subject: [PATCH 1/3] CMake - Bump version Change-Id: Ibe62c0059262bcb9937ae856b796392b1fe362a0 Signed-off-by: Galantsev, Dmitrii --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ff760b83c..07562c4a8a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ find_program (GIT NAMES git) ## Setup the package version based on git tags. set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver") -get_package_version_number("6.0.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("6.1.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${VERSION_MAJOR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${VERSION_MINOR}") From a099f0682a023803b4a44499171e4c26cc8ec9aa Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Fri, 3 Nov 2023 17:47:54 -0500 Subject: [PATCH 2/3] Fix issues introduced in 57b6135e54494a4a8afb631b914bdeb0608a0543 - std=c++.. is not required because CMAKE_CXX_STANDARD is set - nullptr check breaks the test because we rely on nullptr as an api for checking feature availability. 
- enum number setting is unnecessary Change-Id: I393e6dd3f292b7fa4198302f140c0443ba5e50f5 Signed-off-by: Galantsev, Dmitrii --- CMakeLists.txt | 2 +- include/rocm_smi/rocm_smi.h | 24 ++++++++++++------------ src/rocm_smi.cc | 3 --- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 07562c4a8a..d56e4d177b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,7 +72,7 @@ endif() ## Compiler flags set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17") + "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti") if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2") diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index ea945c91a0..3dc2fbf5b3 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -363,16 +363,16 @@ typedef rsmi_clk_type_t rsmi_clk_type; */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX = 1, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory - RSMI_COMPUTE_PARTITION_SPX = 2, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_DPX = 3, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_TPX = 4, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - RSMI_COMPUTE_PARTITION_QPX = 5, //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory + RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- 
Quarter XCCs + //!< work together with shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in docs. typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; @@ -680,8 +680,8 @@ typedef enum { */ typedef enum _RSMI_IO_LINK_TYPE { RSMI_IOLINK_TYPE_UNDEFINED = 0, //!< unknown type. - RSMI_IOLINK_TYPE_PCIEXPRESS = 1, //!< PCI Express - RSMI_IOLINK_TYPE_XGMI = 2, //!< XGMI + RSMI_IOLINK_TYPE_PCIEXPRESS, //!< PCI Express + RSMI_IOLINK_TYPE_XGMI, //!< XGMI RSMI_IOLINK_TYPE_NUMIOLINKTYPES, //!< Number of IO Link types RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF //!< Max of IO Link types } RSMI_IO_LINK_TYPE; diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 8749b13213..566b087da3 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -2776,9 +2776,6 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); DEVICE_MUTEX - if (odv == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } CHK_SUPPORT_NAME_ONLY(odv) rsmi_status_t ret = get_od_clk_volt_info(dv_ind, odv); From e5627d2bf13f971564d14d4c286ee8974e696fd1 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 18 Oct 2023 10:11:57 -0500 Subject: [PATCH 3/3] Sort GPU index using BDF Sort GPU index based on BDF. Also add an API to get the XGMI physical id. 
Change-Id: I998876e435165c59d450ecd0b979315278b488a5 --- include/rocm_smi/rocm_smi.h | 17 +++++++++++++++++ include/rocm_smi/rocm_smi_device.h | 1 + src/rocm_smi.cc | 15 +++++++++++++++ src/rocm_smi_device.cc | 5 +++++ src/rocm_smi_main.cc | 29 +++++++++++++++++++++++++---- 5 files changed, 63 insertions(+), 4 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 3dc2fbf5b3..3b5d41d72c 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -1503,6 +1503,23 @@ rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id); */ rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id); +/** + * @brief Get the XGMI physical id associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t @p id, + * this function writes the XGMI physical id of the device to @p id. + * + * @param[in] dv_ind a device index + * + * @param[inout] id a pointer to uint16_t to which the XGMI physical id + * will be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * + */ +rsmi_status_t rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id); + + /** @} */ // end of IDQuer /*****************************************************************************/ diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index 21b8101407..5712affa87 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -103,6 +103,7 @@ enum DevInfoTypes { kDevOverDriveLevel, kDevMemOverDriveLevel, kDevDevID, + kDevXGMIPhysicalID, kDevDevRevID, kDevDevProdName, kDevDevProdNum, diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 566b087da3..325d86a3eb 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -836,6 +836,21 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { return ret; } +rsmi_status_t +rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) { + std::ostringstream ss; + rsmi_status_t ret; + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); + CHK_SUPPORT_NAME_ONLY(id) + + ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id); + ss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret); + LOG_TRACE(ss); + return ret; +} + rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) { std::ostringstream outss; diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index bd357c35d1..149cf22718 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -82,6 +82,7 @@ static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; static const char *kDevDevProdNameFName = "product_name"; static const char *kDevDevProdNumFName = "product_number"; static const char *kDevDevIDFName = "device"; +static const char* kDevXGMIPhysicalIDFName = "xgmi_physical_id"; static const char *kDevDevRevIDFName = "revision"; static const char *kDevVendorIDFName = "vendor"; static const char *kDevSubSysDevIDFName = "subsystem_device"; @@ -238,6 +239,7 @@ static const std::map 
kDevAttribNameMap = { {kDevDevProdName, kDevDevProdNameFName}, {kDevDevProdNum, kDevDevProdNumFName}, {kDevDevID, kDevDevIDFName}, + {kDevXGMIPhysicalID, kDevXGMIPhysicalIDFName}, {kDevDevRevID, kDevDevRevIDFName}, {kDevVendorID, kDevVendorIDFName}, {kDevSubSysDevID, kDevSubSysDevIDFName}, @@ -379,6 +381,7 @@ static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, {"rsmi_dev_id_get", {{kDevDevIDFName}, {}}}, + {"rsmi_dev_xgmi_physical_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, {"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}}, {"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}}, {"rsmi_dev_name_get", {{kDevVendorIDFName, @@ -956,6 +959,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevSubSysVendorID: case kDevVendorID: case kDevErrCntFeatures: + case kDevXGMIPhysicalID: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); @@ -1102,6 +1106,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevComputePartition: case kDevMemoryPartition: case kDevNumaNode: + case kDevXGMIPhysicalID: return readDevInfoStr(type, val); break; diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 44b40646d3..a948f148af 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +86,7 @@ amd::smi::RocmSMI::devInfoTypesStrings = { {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, {amd::smi::kDevDevID, amdSMI + "kDevDevID"}, + {amd::smi::kDevXGMIPhysicalID, amdSMI + "kDevXGMIPhysicalID"}, {amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"}, {amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"}, {amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"}, @@ -383,9 +385,28 @@ RocmSMI::Initialize(uint64_t flags) { << "\n | final update: device->bdfid() holds correct 
device bdf"; LOG_TRACE(ss); } - if (ret != 0) { - throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, - "Failed to initialize rocm_smi library (amdgpu node discovery)."); + + std::shared_ptr dev; + // Sort index based on the BDF, collect BDF id firstly. + std::vector>> dv_to_id; + dv_to_id.reserve(devices_.size()); + for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) { + dev = devices_[dv_ind]; + uint64_t bdfid = dev->bdfid(); + dv_to_id.push_back({bdfid, dev}); + } + ss << __PRETTY_FUNCTION__ << " Sort index based on BDF."; + LOG_DEBUG(ss); + + // Stable sort to keep the order if bdf is equal. + std::stable_sort(dv_to_id.begin(), dv_to_id.end(), [] + (const std::pair>& p1, + const std::pair>& p2) { + return p1.first < p2.first; + }); + devices_.clear(); + for (uint32_t dv_ind = 0; dv_ind < dv_to_id.size(); ++dv_ind) { + devices_.push_back(dv_to_id[dv_ind].second); } std::map> tmp_map; @@ -406,7 +427,6 @@ RocmSMI::Initialize(uint64_t flags) { for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++) io_link_map_[it->first] = it->second; - std::shared_ptr dev; // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. @@ -451,6 +471,7 @@ RocmSMI::Initialize(uint64_t flags) { if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { logSystemDetails(); } + // Leaving below to help debug temp file issues // displayAppTmpFilesContent(); std::string amdGPUDeviceList = displayAllDevicePaths(devices_);