From 33eb3fa42953ab254a184c432859a76b542a5044 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 20 Jun 2024 19:36:23 -0500 Subject: [PATCH] [SWDEV-463213] Add partition ID fallback + new API Changes: - Added rsmi_dev_partition_id_get() -> uses fallback described below for devices which support partition updates. - Updated/added to tests for partitions to reflect these changes. Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. bits [63:32] = domain bits [31:28] = partition id bits [27:16] = reserved bits [15:8] = Bus bits [7:3] = Device bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes Change-Id: Ia5641cfb8dbe2d1bff52f8eb81d5a159954528d3 Signed-off-by: Charis Poag [ROCm/rocm_smi_lib commit: 323ab1105dcee578454e1feff0b01bb3a98fa01f] --- projects/rocm-smi-lib/CHANGELOG.md | 35 +++ .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 28 +- .../rocm-smi-lib/python_smi_tools/rocm_smi.py | 24 +- .../python_smi_tools/rsmiBindings.py | 22 +- projects/rocm-smi-lib/src/rocm_smi.cc | 157 ++++++++-- projects/rocm-smi-lib/src/rocm_smi_logger.cc | 2 +- projects/rocm-smi-lib/src/rocm_smi_main.cc | 44 ++- .../functional/computepartition_read_write.cc | 292 ++++++++++++++---- .../tests/rocm_smi_test/test_common.cc | 2 +- 9 files changed, 491 insertions(+), 115 deletions(-) diff --git a/projects/rocm-smi-lib/CHANGELOG.md b/projects/rocm-smi-lib/CHANGELOG.md index be5fc7fc2e..41383b1875 100644 --- a/projects/rocm-smi-lib/CHANGELOG.md +++ b/projects/rocm-smi-lib/CHANGELOG.md @@ -4,6 +4,41 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/] ***All information listed below is for reference and subject to change.*** +## rocm_smi_lib for ROCm 6.2 + +### Added + +- **Added Partition ID API (`rsmi_dev_partition_id_get(..)`)** +Previously the partition ID could only be retrieved by querying `rsmi_dev_pci_id_get()` +and parsing optional bits in our python CLI/API. 
We are now making this available directly through API. +We also added testing to our compute partitioning tests, verifying that partition IDs update accordingly. + +### Changed + +- N/A + +### Optimized + +- N/A + +### Fixed + +- **Partition ID CLI output** +Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. With the newly added `rsmi_dev_partition_id_get(..)`, we provide this fallback to properly retrieve the partition ID. We +plan to eventually remove partition ID from the function portion of the BDF (Bus Device Function). See below for PCI ID description. + + - bits [63:32] = domain + - bits [31:28] or bits [2:0] = partition id + - bits [27:16] = reserved + - bits [15:8] = Bus + - bits [7:3] = Device + - bits [2:0] = Function (partition id may be in bits [2:0]) <-- Fallback for non SPX modes + +### Known Issues + +- N/A + + ## rocm_smi_lib for ROCm 6.1.2 ### Added diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 3d71530bab..7ed96aaf09 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -367,8 +367,6 @@ typedef rsmi_clk_type_t rsmi_clk_type; */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work //!< together with shared memory RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work @@ -377,6 +375,8 @@ typedef enum { //!< work together with shared memory RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs //!< work together with shared memory + RSMI_COMPUTE_PARTITION_CPX //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in docs. 
typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; @@ -4053,6 +4053,30 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); +/** + * @brief Retrieves the partition_id for a desired device + * + * @details + * Given a device index @p dv_ind and a uint32_t pointer @p partition_id , + * this function will attempt to obtain the device's partition ID. + * Upon successful retrieval, the device's partition ID will be stored + * in the passed @p partition_id uint32_t variable. If the device does + * not support partitions or is in SPX, a @p partition_id of 0 shall + * be returned. + * + * @param[in] dv_ind a device index + * + * @param[inout] partition_id a uint32_t variable, + * which the device's partition_id will be written to. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * + */ +rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id); + /** @} */ // end of ComputePartition /*****************************************************************************/ diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index 63773797a0..11f0bc3dc4 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -196,9 +196,11 @@ def getBus(device, silent=False): # BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | ((BUS & 0xFF) << 8) | # ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7) # bits [63:32] = domain - # bits [31:28] = partition id + # bits [31:28] or bits [2:0] = partition id # bits [27:16] = reserved - # bits [15: 0] = pci bus/device/function + # bits [15:8] = Bus + # bits [7:3] = Device + # bits [2:0] = Function (partition id maybe in 
bits [2:0]) <-- Fallback for non SPX modes domain = (bdfid.value >> 32) & 0xffffffff bus = (bdfid.value >> 8) & 0xff device = (bdfid.value >> 3) & 0x1f @@ -215,19 +217,19 @@ def getPartitionId(device, silent=False): :param silent: Turn on to silence error output (you plan to handle manually). Default is off. """ - bdfid = c_uint64(0) - ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid)) + partition_id = c_uint32(0) + ret = rocmsmi.rsmi_dev_partition_id_get(device, byref(partition_id)) # BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | ((BUS & 0xFF) << 8) | # ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7) # bits [63:32] = domain - # bits [31:28] = partition id - # bits [27:16] = reserved - # bits [15: 0] = pci bus/device/function - partition_num = (bdfid.value >> 28) & 0xf - pci_id = bdfid.value - partition_id = '{:d}'.format(partition_num) - if rsmi_ret_ok(ret, device, 'get_pci_id', silent): + # bits [31:28] or bits [2:0] = partition id + # bits [27:16] = reserved + # bits [15:8] = Bus + # bits [7:3] = Device + # bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + partition_id = '{:d}'.format(partition_id.value) + if rsmi_ret_ok(ret, device, 'rsmi_dev_partition_id_get', silent): return partition_id diff --git a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py index 873051404c..45011fa42b 100644 --- a/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py +++ b/projects/rocm-smi-lib/python_smi_tools/rsmiBindings.py @@ -583,19 +583,19 @@ class rsmi_func_id_value_t(Union): class rsmi_compute_partition_type_t(c_int): RSMI_COMPUTE_PARTITION_INVALID = 0 - RSMI_COMPUTE_PARTITION_CPX = 1 - RSMI_COMPUTE_PARTITION_SPX = 2 - RSMI_COMPUTE_PARTITION_DPX = 3 - RSMI_COMPUTE_PARTITION_TPX = 4 - RSMI_COMPUTE_PARTITION_QPX = 5 + RSMI_COMPUTE_PARTITION_SPX = 1 + RSMI_COMPUTE_PARTITION_DPX = 2 + RSMI_COMPUTE_PARTITION_TPX = 3 + RSMI_COMPUTE_PARTITION_QPX = 4 + 
RSMI_COMPUTE_PARTITION_CPX = 5 rsmi_compute_partition_type_dict = { #'RSMI_COMPUTE_PARTITION_INVALID': 0, - 'CPX': 1, - 'SPX': 2, - 'DPX': 3, - 'TPX': 4, - 'QPX': 5 + 'SPX': 1, + 'DPX': 2, + 'TPX': 3, + 'QPX': 4, + 'CPX': 5, } rsmi_compute_partition_type = rsmi_compute_partition_type_t @@ -604,7 +604,7 @@ rsmi_compute_partition_type = rsmi_compute_partition_type_t # Usage example to get corresponding names: # compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX] # will return string 'CPX' -compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX'] +compute_partition_type_l = ['SPX', 'DPX', 'TPX', 'QPX', 'CPX'] class rsmi_memory_partition_type_t(c_int): RSMI_MEMORY_PARTITION_UNKNOWN = 0 diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index f7f9d53ee7..c728ee9b07 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -754,13 +754,18 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { kfd_node->get_property_value("domain", &domain); - // Add domain to full pci_id: - // BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | - // ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7) - // bits [63:32] = domain - // bits [31:28] = partition id in multi partition system - // bits [27:16] = reserved - // bits [15: 0] = pci bus/device/function + /** + * Add domain to full pci_id: + * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | + * ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7) + * + * bits [63:32] = domain + * bits [31:28] or bits [2:0] = partition id + * bits [27:16] = reserved + * bits [15:8] = Bus + * bits [7:3] = Device + * bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + */ assert((domain & 0xFFFFFFFF00000000) == 0); (*bdfid) &= 0xFFFFFFFF; // keep bottom 32 bits of pci_id *bdfid |= (domain & 0xFFFFFFFF) << 32; // Add domain to top of 
pci_id @@ -4575,9 +4580,12 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, CATCH } -static rsmi_status_t -get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { +static rsmi_status_t get_compute_partition(uint32_t dv_ind, + std::string &compute_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(compute_partition.c_str()) std::string compute_partition_str; @@ -4601,6 +4609,8 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { return RSMI_STATUS_UNEXPECTED_DATA; } compute_partition = compute_partition_str; + ss << __PRETTY_FUNCTION__ << " | ======= END =======, " << dv_ind; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -4610,7 +4620,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, uint32_t len) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start =======, dv_ind = " + ss << __PRETTY_FUNCTION__ << " | ======= start =======, dv_ind = " << dv_ind; LOG_TRACE(ss); if ((len == 0) || (compute_partition == nullptr)) { @@ -4646,7 +4656,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, return ret; } - std::size_t length = returning_compute_partition.copy(compute_partition, len); + std::size_t length = returning_compute_partition.copy(compute_partition, len-1); compute_partition[length]='\0'; if (len < (returning_compute_partition.size() + 1)) { @@ -4680,20 +4690,47 @@ static rsmi_status_t is_available_compute_partition(uint32_t dv_ind, std::string new_compute_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); DEVICE_MUTEX std::string availableComputePartitions; rsmi_status_t ret = get_dev_value_line(amd::smi::kDevAvailableComputePartition, dv_ind, &availableComputePartitions); if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | 
======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << devInfoTypesStrings.at(amd::smi::kDevAvailableComputePartition) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); return ret; } bool isComputePartitionAvailable = amd::smi::containsString(availableComputePartitions, new_compute_partition); - return (isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS : - RSMI_STATUS_SETTING_UNAVAILABLE; + + ret = ((isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS : + RSMI_STATUS_SETTING_UNAVAILABLE); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << devInfoTypesStrings.at(amd::smi::kDevAvailableComputePartition) + << " | Data: available_partitions = " << availableComputePartitions + << " | Data: isComputePartitionAvailable = " + << (isComputePartitionAvailable ? "True" : "False") + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_INFO(ss); + return ret; CATCH } @@ -4702,16 +4739,14 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, rsmi_compute_partition_type_t compute_partition) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS if (!amd::smi::is_sudo_user()) { return RSMI_STATUS_PERMISSION; } - DEVICE_MUTEX - std::string newComputePartitionStr - = mapRSMIToStringComputePartitionTypes.at(compute_partition); - std::string currentComputePartition; + std::string currentComputePartition = ""; + std::string newComputePartitionStr = ""; switch (compute_partition) { case RSMI_COMPUTE_PARTITION_CPX: @@ -4719,9 +4754,13 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, case RSMI_COMPUTE_PARTITION_DPX: case RSMI_COMPUTE_PARTITION_TPX: case RSMI_COMPUTE_PARTITION_QPX: + newComputePartitionStr = + 
mapRSMIToStringComputePartitionTypes.at(compute_partition); break; case RSMI_COMPUTE_PARTITION_INVALID: default: + newComputePartitionStr = + mapRSMIToStringComputePartitionTypes.at(RSMI_COMPUTE_PARTITION_INVALID); ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Fail " @@ -4798,8 +4837,8 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << "| sizeof string = " << std::dec << sizeof(newComputePartitionStr); LOG_DEBUG(ss); - GET_DEV_FROM_INDX + DEVICE_MUTEX int ret = dev->writeDevInfo(amd::smi::kDevComputePartition, newComputePartitionStr); rsmi_status_t returnResponse = amd::smi::ErrnoToRsmiStatus(ret); @@ -4814,7 +4853,6 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << getRSMIStatusString(returnResponse) << " |"; LOG_TRACE(ss); - // TODO(charpoag): investigate providing GPU busy state occured with return returnResponse; CATCH } @@ -4822,6 +4860,9 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, static rsmi_status_t get_memory_partition(uint32_t dv_ind, std::string &memory_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(memory_partition.c_str()) std::string val_str; @@ -4845,6 +4886,8 @@ static rsmi_status_t get_memory_partition(uint32_t dv_ind, return RSMI_STATUS_UNEXPECTED_DATA; } memory_partition = val_str; + ss << __PRETTY_FUNCTION__ << " | ======= END =======, " << dv_ind; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -4854,7 +4897,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, rsmi_memory_partition_type_t memory_partition) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -4989,7 +5032,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, uint32_t len) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss 
<< __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); if ((len == 0) || (memory_partition == nullptr)) { ss << __PRETTY_FUNCTION__ @@ -5059,7 +5102,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5098,7 +5141,7 @@ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5134,6 +5177,72 @@ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { CATCH } +rsmi_status_t +rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind; + LOG_TRACE(ss); + if (partition_id == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL" + << " | Device #: " << dv_ind + << " | Type: partition_id" + << " | Data: nullptr" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; + } + DEVICE_MUTEX + std::string strCompPartition = "UNKNOWN"; + const uint32_t PARTITION_LEN = 10; + char compute_partition[PARTITION_LEN]; + rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN); + if (ret == RSMI_STATUS_SUCCESS) { + strCompPartition = compute_partition; + } + uint64_t pci_id = UINT64_MAX; + *partition_id = UINT32_MAX; + ret = rsmi_dev_pci_id_get(dv_ind, &pci_id); + if (ret == RSMI_STATUS_SUCCESS) { + *partition_id = 
static_cast((pci_id >> 28) & 0xf); + } + + /** + * Fall back is required due to driver changes within KFD. + * Some devices may report bits [31:28] or [2:0]. + * With the newly added rsmi_dev_partition_id_get(..), + * we provided this fallback to properly retrieve the partition ID. We + * plan to eventually remove partition ID from the function portion of the + * BDF (Bus Device Function). See below for PCI ID description. + * + * bits [63:32] = domain + * bits [31:28] or bits [2:0] = partition id + * bits [27:16] = reserved + * bits [15:8] = Bus + * bits [7:3] = Device + * bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + */ + if (*partition_id != UINT32_MAX && *partition_id == 0 && + (strCompPartition == "DPX" || strCompPartition == "TPX" + || strCompPartition == "CPX" || strCompPartition == "QPX")) { + *partition_id = static_cast(pci_id & 0x7); + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success" + << " | Device #: " << dv_ind + << " | Type: partition_id" + << " | Data: " << *partition_id + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |"; + LOG_INFO(ss); + return ret; + CATCH +} + rsmi_status_t rsmi_dev_target_graphics_version_get(uint32_t dv_ind, uint64_t *gfx_version) { TRY diff --git a/projects/rocm-smi-lib/src/rocm_smi_logger.cc b/projects/rocm-smi-lib/src/rocm_smi_logger.cc index c7fcce537f..cba21bbbdd 100644 --- a/projects/rocm-smi-lib/src/rocm_smi_logger.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_logger.cc @@ -265,7 +265,7 @@ void ROCmLogging::Logger::buffer(const char* text) throw() { // and timestamp in the buffer message. Just log the raw bytes. if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_BUFFER)) { lock(); - if(!m_File.is_open()) { + if (!m_File.is_open()) { initialize_resources(); if (!m_File.is_open()) { std::cout << "WARNING: re-initializing resources was unsuccessful." 
diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 7d6edea648..8e3d24644e 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -235,15 +235,7 @@ RocmSMI::Initialize(uint64_t flags) { int i_ret; std::ostringstream ss; - LOG_ALWAYS("=============== ROCM SMI initialize ================"); - ROCmLogging::Logger::getInstance()->enableAllLogLevels(); - // Leaving below to allow developers to check current log settings - // std::string logSettings = Logger::getInstance()->getLogSettings(); - // std::cout << "Current log settings:\n" << logSettings << std::endl; - if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { - logSystemDetails(); - } assert(ref_count_ == 1); if (ref_count_ != 1) { @@ -259,6 +251,15 @@ RocmSMI::Initialize(uint64_t flags) { // To help debug env variable issues // debugRSMIEnvVarInfo(); + if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { + ROCmLogging::Logger::getInstance()->enableAllLogLevels(); + LOG_ALWAYS("=============== ROCM SMI initialize ================"); + logSystemDetails(); + } + // Leaving below to allow developers to check current log settings + // std::string logSettings = ROCmLogging::Logger::getInstance()->getLogSettings(); + // std::cout << "Current log settings:\n" << logSettings << std::endl; + while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); ++i; @@ -863,6 +864,15 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << " BDF = " << std::to_string(primaryBdfId) << " (" << print_int_as_hex(primaryBdfId) << ")"; LOG_DEBUG(ss); + if (doesDeviceSupportPartitions && strCompPartition != "SPX" + && i->second.s_partition_id == 0) { + i->second.s_partition_id = i->second.s_function; + ss << __PRETTY_FUNCTION__ << " | (secondary node add) fall back - " + << "detected !SPX && partition_id == 0" + << "; function = " << std::to_string(i->second.s_function) + << "; 
partition_id = " << std::to_string(i->second.s_partition_id); + LOG_DEBUG(ss); + } ss << __PRETTY_FUNCTION__ << " | (secondary node add) B4 AddToDeviceList() -->" << "\n[node_id = " << std::to_string(i->second.s_node_id) @@ -881,6 +891,15 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { } else { ss << __PRETTY_FUNCTION__ << " | primary node add ; " << " BDF = " << std::to_string(UINT64_MAX); + if (doesDeviceSupportPartitions && strCompPartition != "SPX" + && i->second.s_partition_id == 0) { + i->second.s_partition_id = i->second.s_function; + ss << __PRETTY_FUNCTION__ << " | (primary node add) fall back - " + << "detected !SPX && partition_id == 0" + << "; function = " << std::to_string(i->second.s_function) + << "; partition_id = " << std::to_string(i->second.s_partition_id); + LOG_DEBUG(ss); + } LOG_DEBUG(ss); ss << __PRETTY_FUNCTION__ << " | (primary node add) After AddToDeviceList() -->" @@ -1010,6 +1029,15 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << " BDF = " << std::to_string(myBdfId) << " (" << print_int_as_hex(myBdfId) << ")"; LOG_DEBUG(ss); + if (doesDeviceSupportPartitions && strCompPartition != "SPX" + && it->second.s_partition_id == 0) { + it->second.s_partition_id = it->second.s_function; + ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) fall back - " + << "detected !SPX && partition_id == 0" + << "; function = " << std::to_string(it->second.s_function) + << "; partition_id = " << std::to_string(it->second.s_partition_id); + LOG_DEBUG(ss); + } ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) B4 AddToDeviceList() -->" << "\n[node_id = " << std::to_string(it->second.s_node_id) diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc index 4fabb84f6f..254dc6acaa 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc +++ 
b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc @@ -88,6 +88,13 @@ void TestComputePartitionReadWrite::Close() { TestBase::Close(); } +const uint32_t MAX_UNSUPPORTED_PARTITIONS = 0; +const uint32_t MAX_SPX_PARTITIONS = 1; +const uint32_t MAX_DPX_PARTITIONS = 2; +const uint32_t MAX_TPX_PARTITIONS = 3; +const uint32_t MAX_QPX_PARTITIONS = 4; +const uint32_t MAX_CPX_PARTITIONS = 8; + static const std::string computePartitionString(rsmi_compute_partition_type computeParitionType) { /** @@ -139,50 +146,186 @@ static void system_wait(int seconds) { static const std::map mapStringToRSMIComputePartitionTypes { - {"CPX", RSMI_COMPUTE_PARTITION_CPX}, - {"SPX", RSMI_COMPUTE_PARTITION_SPX}, {"DPX", RSMI_COMPUTE_PARTITION_DPX}, {"TPX", RSMI_COMPUTE_PARTITION_TPX}, - {"QPX", RSMI_COMPUTE_PARTITION_QPX} + {"QPX", RSMI_COMPUTE_PARTITION_QPX}, + {"CPX", RSMI_COMPUTE_PARTITION_CPX}, + {"SPX", RSMI_COMPUTE_PARTITION_SPX} }; -void TestComputePartitionReadWrite::Run(void) { - rsmi_status_t ret, err; - char orig_char_computePartition[255]; - char current_char_computePartition[255]; +static void checkPartitionIdChanges( + uint32_t dev, const std::string current_partition, bool isVerbose, + bool reinitialize) { + uint32_t max_loop = MAX_SPX_PARTITIONS; - TestBase::Run(); - if (setup_failed_) { - std::cout << "** SetUp Failed for this test. 
Skipping.**" << std::endl; - return; + // re-initialize to ensure new device ordering is followed + if (reinitialize) { + if (isVerbose) { + std::cout << "\t**Reinitializing device list due to parition changes.\n"; + } + rsmi_shut_down(); + rsmi_init(0); } - // Confirm system supports compute partition, before executing wait - ret = rsmi_dev_compute_partition_get(0, orig_char_computePartition, 255); - if (ret == RSMI_STATUS_SUCCESS) { - system_wait(25); - } - - for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { - if (dv_ind != 0) { - IF_VERB(STANDARD) { - std::cout << std::endl; + if (current_partition == "DPX") { + max_loop = MAX_DPX_PARTITIONS; + } else if (current_partition == "TPX") { + max_loop = MAX_TPX_PARTITIONS; + } else if (current_partition == "QPX") { + max_loop = MAX_QPX_PARTITIONS; + } else if (current_partition == "CPX") { + max_loop = MAX_CPX_PARTITIONS; + uint16_t num_xcd; + rsmi_status_t ret = rsmi_dev_metrics_xcd_counter_get(dev, &num_xcd); + if (ret == RSMI_STATUS_SUCCESS) { + max_loop = num_xcd; + if (isVerbose) { + std::cout << "\t**Expecting num_xcd = " << num_xcd << " to equal " + "total CPX nodes\n"; } } - PrintDeviceHeader(dv_ind); - bool devicePartitionUpdated = false; + } - // Standard checks to see if API is supported, before running full tests - ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, - 255); - if (ret == RSMI_STATUS_NOT_SUPPORTED) { - IF_VERB(STANDARD) { - std::cout << "\t**" << ": " - << "Not supported on this device" << std::endl; + for (uint32_t i = dev; i < dev + max_loop; i++) { + uint32_t partition_id; + rsmi_status_t ret = rsmi_dev_partition_id_get(i, &partition_id); + std::cout << "\t** Checking Partition ID | Device: " << std::to_string(i) + << "; Current Partition: " << current_partition + << " ; Max partition IDs to check: " << max_loop << "\n"; + ASSERT_EQ(ret, RSMI_STATUS_SUCCESS); + if (ret == RSMI_STATUS_SUCCESS && current_partition == "SPX") { + ASSERT_LT(partition_id, 
max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " << max_loop + << " for SPX" + << "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) + + ", &partition_id); partition_id = " + << static_cast(partition_id) << std::endl; + } + } else if (ret == RSMI_STATUS_SUCCESS && current_partition == "DPX") { + ASSERT_LT(partition_id, max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " << max_loop + << " for DPX" + << "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) + + ", &partition_id); partition_id = " + << static_cast(partition_id) << std::endl; + } + } else if (ret == RSMI_STATUS_SUCCESS && current_partition == "TPX") { + ASSERT_LT(partition_id, max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " + << max_loop << " for TPX" + << "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) + + ", &partition_id); partition_id = " + << static_cast(partition_id) << std::endl; + } + } else if (ret == RSMI_STATUS_SUCCESS && current_partition == "QPX") { + ASSERT_LT(partition_id, max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " + << max_loop << " for QPX" + << "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) + + ", &partition_id); partition_id = " + << static_cast(partition_id) << std::endl; + } + } else if (ret == RSMI_STATUS_SUCCESS && current_partition == "CPX") { + ASSERT_LT(partition_id, max_loop); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id < " + << max_loop << " for CPX" + << "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) + + ", &partition_id); partition_id = " + << static_cast(partition_id) << std::endl; + } + } else if (ret == RSMI_STATUS_SUCCESS && current_partition == "UNKNOWN") { + ASSERT_EQ(partition_id, max_loop - 1); + if (isVerbose) { + std::cout << "\n\t**Confirmed partition_id = " + << (max_loop - 1) + << " for current_partition = UNKNOWN" + << "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) + + ", 
&partition_id); partition_id = " + << static_cast(partition_id) << std::endl; + } + } + } +} + +void TestComputePartitionReadWrite::Run(void) { + rsmi_status_t ret, err; + char orig_char_computePartition[255]; + orig_char_computePartition[0] = '\0'; + char current_char_computePartition[255]; + current_char_computePartition[0] = '\0'; + + TestBase::Run(); + if (setup_failed_) { + std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl; + return; + } + bool isVerbose = (this->verbosity() && + this->verbosity() >= (this->TestBase::VERBOSE_STANDARD)) ? true: false; + + // Confirm system supports compute partition, before executing wait + ret = rsmi_dev_compute_partition_get(0, orig_char_computePartition, 255); + if (ret == RSMI_STATUS_SUCCESS) { + system_wait(15); + } + + // initial_num_devices - keep this value static, due to parition changes + // fluctuating # of devices. We should end up with same # of devices at + // end of test. + uint32_t initial_num_devices = num_monitor_devs(); + for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) { + if (dv_ind >= 0) { + IF_VERB(STANDARD) { + std::cout << std::endl; + std::cout << "\t**" + << "========= LOOP THROUGH DEVICES - DEVICE #" + << std::to_string(dv_ind) << " ==============" + << std::endl; } + } + PrintDeviceHeader(dv_ind); + bool devicePartitionUpdated = false; + + ret = rsmi_dev_partition_id_get(dv_ind, nullptr); + ASSERT_EQ(ret, RSMI_STATUS_INVALID_ARGS); + IF_VERB(STANDARD) { + if (ret == RSMI_STATUS_INVALID_ARGS) { + std::cout << "\t**" << "Confirmed rsmi_dev_partition_id_get(..,nullptr): " + << "RSMI_STATUS_INVALID_ARGS" << std::endl; + } + } + + std::string partitionStr = ""; + ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, 255); + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_compute_partition_get(): " + << "Not supported on this device" + << std::endl; + } + partitionStr = orig_char_computePartition; + if 
(partitionStr.empty()) { + partitionStr = computePartitionString( + rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_INVALID); + } + // Regardless of partition support - no changes made, so no device + // refresh needed (ie. rsmi_dev_compute_partition_set(..)) + checkPartitionIdChanges(dv_ind, partitionStr, isVerbose, false); continue; } else { CHK_ERR_ASRT(ret) + std::string partitionStr = orig_char_computePartition; + if (partitionStr.empty()) { + partitionStr = computePartitionString( + rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_INVALID); + } + // Regardless of partition support - no changes made, so no device + // refresh needed (ie. rsmi_dev_compute_partition_set(..)) + checkPartitionIdChanges(dv_ind, partitionStr, isVerbose, false); } IF_VERB(STANDARD) { std::cout << std::endl << "\t**" @@ -236,32 +379,12 @@ void TestComputePartitionReadWrite::Run(void) { } } - // Verify api support checking functionality is working - rsmi_compute_partition_type_t breakMe; - err = rsmi_dev_compute_partition_set(dv_ind, breakMe); - std::cout << "\t**rsmi_dev_compute_partition_set(null ptr): " - << amd::smi::getRSMIStatusString(err, false) << "\n"; - ASSERT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || - (err == RSMI_STATUS_NOT_SUPPORTED) || - (err == RSMI_STATUS_PERMISSION)); - IF_VERB(STANDARD) { - if (err == RSMI_STATUS_INVALID_ARGS) { - std::cout << "\t**" - << "Confirmed RSMI_STATUS_INVALID_ARGS was returned." 
- << std::endl; - } else if (err == RSMI_STATUS_PERMISSION) { - DISPLAY_RSMI_ERR(err) - // tests should not continue if err is a permission issue - ASSERT_FALSE(err == RSMI_STATUS_PERMISSION); - } else { - DISPLAY_RSMI_ERR(err) - } - } - // Re-run original get, so we can reset to later ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, 255); ASSERT_EQ(RSMI_STATUS_SUCCESS, ret); + std::cout << "\t**rsmi_dev_compute_partition_get(" << dv_ind + << ", " << orig_char_computePartition << ")\n"; /** * RSMI_COMPUTE_PARTITION_INVALID = 0, @@ -277,8 +400,27 @@ void TestComputePartitionReadWrite::Run(void) { * //!< work together with shared memory */ - for (int partition = static_cast(RSMI_COMPUTE_PARTITION_CPX); - partition <= static_cast(RSMI_COMPUTE_PARTITION_QPX); + /** + * General Loop Logic: + * [0:SPX, 1:SPX, 2:SPX, 3:SPX] + * [0:DPX, 1:DPX, 2: SPX, 3:SPX, 4:SPX] <- set 0 to DPX + * [0:TPX, 1:TPX, 2:TPX, 3:SPX, 4:SPX, 5:SPX] <- set 0 to TPX + * [0:QPX, 1:QPX, 2:QPX, 3:QPX, 4:SPX, 5:SPX, 6:SPX] <- set 0 to QPX + * [0:CPX, 1:CPX, 2:CPX, 3:CPX, 4:CPX, 5:SPX, 6:SPX, 7:SPX] <- set 0 to CPX + * [0:SPX, 1:SPX, 2:SPX, 3:SPX] <- reset(0) + * +1 index + * [0:SPX, 1:SPX, 2:SPX, 3:SPX] + * [0:SPX, 1:DPX, 2: DPX, 3:SPX, 4:SPX] <- set 1 to DPX + * [0:SPX, 1:TPX, 2:TPX, 3:TPX, 4:SPX, 5:SPX] <- set 1 to TPX + * [0:SPX, 1:QPX, 2:QPX, 3:QPX, 4:QPX, 5:SPX, 6:SPX] <- set 1 to QPX + * [0:SPX, 1:CPX, 2:CPX, 3:CPX, 4:CPX, 5:CPX, 6:SPX, 7:SPX] <- set 1 to CPX + * [0:SPX, 1:SPX, 2:SPX, 3:SPX] <- reset(1) + * ... 
+ * + */ + std::string final_partition_state = "UNKNOWN"; + for (int partition = static_cast(RSMI_COMPUTE_PARTITION_SPX); + partition <= static_cast(RSMI_COMPUTE_PARTITION_CPX); partition++) { rsmi_compute_partition_type_t updatePartition = static_cast(partition); @@ -292,7 +434,8 @@ void TestComputePartitionReadWrite::Run(void) { ret = rsmi_dev_compute_partition_set(dv_ind, updatePartition); IF_VERB(STANDARD) { std::cout << "\t**" - << "rsmi_dev_compute_partition_set(dv_ind, updatePartition): " + << "rsmi_dev_compute_partition_set(" << dv_ind + << ", updatePartition): " << amd::smi::getRSMIStatusString(ret, false) << "\n" << "\t**New Partition (set): " << computePartitionString(updatePartition) << "\n"; @@ -341,6 +484,7 @@ void TestComputePartitionReadWrite::Run(void) { if (strcmp(orig_char_computePartition, current_char_computePartition) != 0) { devicePartitionUpdated = true; + final_partition_state = current_char_computePartition; } else { devicePartitionUpdated = false; } @@ -356,8 +500,13 @@ void TestComputePartitionReadWrite::Run(void) { << computePartitionString(updatePartition) << ")" << std::endl; } + + checkPartitionIdChanges(dv_ind, computePartitionString(updatePartition), + isVerbose, true); } } // END looping through partition changes + std::cout << "\t**=========== END PARTITION LOOP (dev = " + << std::to_string(dv_ind) << ") ===========\n"; /* TEST RETURN TO BOOT COMPUTE PARTITION SETTING */ IF_VERB(STANDARD) { @@ -371,7 +520,7 @@ void TestComputePartitionReadWrite::Run(void) { ret = rsmi_dev_compute_partition_reset(dv_ind); IF_VERB(STANDARD) { std::cout << "\t**" - << "rsmi_dev_compute_partition_reset(dv_ind): " + << "rsmi_dev_compute_partition_reset(" << dv_ind << "): " << amd::smi::getRSMIStatusString(ret, false) << "\n"; } ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || @@ -393,6 +542,14 @@ void TestComputePartitionReadWrite::Run(void) { << "\t**" << "Partitions Updated: " << (devicePartitionUpdated ? 
"TRUE" : "FALSE") << "\n"; } + + if (final_partition_state != std::string(current_char_computePartition)) { + checkPartitionIdChanges(dv_ind, std::string(current_char_computePartition), + isVerbose, true); + } else { + checkPartitionIdChanges(dv_ind, std::string(current_char_computePartition), + isVerbose, false); + } if (wasResetSuccess && devicePartitionUpdated) { ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition); IF_VERB(STANDARD) { @@ -401,6 +558,7 @@ void TestComputePartitionReadWrite::Run(void) { << "equal to current\n\t partition (" << current_char_computePartition << ")" << std::endl; } + final_partition_state = std::string(current_char_computePartition); } else { ASSERT_STREQ(oldPartition.c_str(), current_char_computePartition); IF_VERB(STANDARD) { @@ -424,7 +582,12 @@ void TestComputePartitionReadWrite::Run(void) { ret = rsmi_dev_compute_partition_set(dv_ind, newPartition); CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "\t**" << "Returning compute partition to: " + std::cout << "\t**" + << "rsmi_dev_compute_partition_set(" + << std::to_string(dv_ind) << ", " + << std::string(orig_char_computePartition) << ")" << std::endl; + std::cout << "\t**" + << "Returning compute partition to: " << computePartitionString(newPartition) << std::endl; } ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition, @@ -440,5 +603,20 @@ void TestComputePartitionReadWrite::Run(void) { ASSERT_EQ(RSMI_STATUS_SUCCESS, ret); ASSERT_STREQ(computePartitionString(newPartition).c_str(), current_char_computePartition); + + // only refresh (rsmi_shut_down() -> rsmi_init(0)) device list + // if there was a partition change + if (final_partition_state != std::string(current_char_computePartition)) { + checkPartitionIdChanges(dv_ind, computePartitionString(newPartition), + isVerbose, true); + } else { + checkPartitionIdChanges(dv_ind, computePartitionString(newPartition), + isVerbose, false); + } + std::cout << "\t**" + << "========= END LOOP 
THROUGH DEVICES - DEVICE #" + << std::to_string(dv_ind) << " ==============" + << std::endl; } // END looping through devices + std::cout << "\t**=========== END TEST ===========\n"; } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.cc index 5e8e3f8e47..f1e7888980 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.cc @@ -229,7 +229,7 @@ const char *FreqEnumToStr(rsmi_clk_type rsmi_clk) { } void printRSMIError(rsmi_status_t err) { - std::cout << "err = " << amd::smi::getRSMIStatusString(err); + std::cout << "err = " << amd::smi::getRSMIStatusString(err) << "\n"; } #if ENABLE_SMI