[SWDEV-463213] Add partition ID fallback + new API

Changes:
- Added rsmi_dev_partition_id_get() -> uses fallback described
  below for devices which support partition updates.
- Updated/added to tests for partitions to reflect these changes.

Due to driver changes in KFD, some devices may report bits [31:28] or [2:0].
bits [63:32] = domain
bits [31:28] = partition id
bits [27:16] = reserved
bits [15:8]  = Bus
bits [7:3] = Device
bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes

Change-Id: Ia5641cfb8dbe2d1bff52f8eb81d5a159954528d3
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/rocm_smi_lib commit: 323ab1105d]
此提交包含在:
Charis Poag
2024-06-20 19:36:23 -05:00
父節點 0a11e23c08
當前提交 33eb3fa429
共有 9 個檔案被更改,包括 491 行新增115 行删除
+35
查看文件
@@ -4,6 +4,41 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/]
***All information listed below is for reference and subject to change.***
## rocm_smi_lib for ROCm 6.2
### Added
- **Added Partition ID API (`rsmi_dev_partition_id_get(..)`)**
Previously the partition ID could only be retrieved by querying `rsmi_dev_pci_id_get()`
and parsing optional bits in our python CLI/API. We are now making this available directly through API.
We also added testing in our compute partitioning tests, verifying that partition IDs update accordingly.
### Changed
- N/A
### Optimized
- N/A
### Fixed
- **Partition ID CLI output**
Due to driver changes in KFD, some devices may report the partition ID in bits [31:28] or in bits [2:0]. With the newly added `rsmi_dev_partition_id_get(..)`, we provided this fallback to properly retrieve the partition ID. We
plan to eventually remove partition ID from the function portion of the BDF (Bus Device Function). See below for PCI ID description.
- bits [63:32] = domain
- bits [31:28] or bits [2:0] = partition id
- bits [27:16] = reserved
- bits [15:8] = Bus
- bits [7:3] = Device
- bits [2:0] = Function (partition id may be in bits [2:0]) <-- Fallback for non SPX modes
### Known Issues
- N/A
## rocm_smi_lib for ROCm 6.1.2
### Added
+26 -2
查看文件
@@ -367,8 +367,6 @@ typedef rsmi_clk_type_t rsmi_clk_type;
*/
typedef enum {
RSMI_COMPUTE_PARTITION_INVALID = 0,
RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with
//!< shared memory
RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work
//!< together with shared memory
RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work
@@ -377,6 +375,8 @@ typedef enum {
//!< work together with shared memory
RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs
//!< work together with shared memory
RSMI_COMPUTE_PARTITION_CPX //!< Core mode (CPX)- Per-chip XCC with
//!< shared memory
} rsmi_compute_partition_type_t;
/// \cond Ignore in docs.
typedef rsmi_compute_partition_type_t rsmi_compute_partition_type;
@@ -4053,6 +4053,30 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
*/
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind);
/**
* @brief Retrieves the partition_id for a desired device
*
* @details
* Given a device index @p dv_ind and a uint32_t pointer @p partition_id ,
* this function will attempt to obtain the device's partition ID.
* Upon successful retrieval, the obtained device's partition will be stored
* in the passed @p partition_id uint32_t variable. If device does
* not support partitions or is in SPX, a @p partition_id ID of 0 shall
* be returned.
*
* @param[in] dv_ind a device index
*
* @param[inout] partition_id a uint32_t variable,
* which the device's partition_id will be written to.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function
*
*/
rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id);
/** @} */ // end of ComputePartition
/*****************************************************************************/
+13 -11
查看文件
@@ -196,9 +196,11 @@ def getBus(device, silent=False):
# BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | ((BUS & 0xFF) << 8) |
# ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
# bits [63:32] = domain
# bits [31:28] = partition id
# bits [31:28] or bits [2:0] = partition id
# bits [27:16] = reserved
# bits [15: 0] = pci bus/device/function
# bits [15:8] = Bus
# bits [7:3] = Device
# bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes
domain = (bdfid.value >> 32) & 0xffffffff
bus = (bdfid.value >> 8) & 0xff
device = (bdfid.value >> 3) & 0x1f
@@ -215,19 +217,19 @@ def getPartitionId(device, silent=False):
:param silent: Turn on to silence error output
(you plan to handle manually). Default is off.
"""
bdfid = c_uint64(0)
ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid))
partition_id = c_uint32(0)
ret = rocmsmi.rsmi_dev_partition_id_get(device, byref(partition_id))
# BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | ((BUS & 0xFF) << 8) |
# ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
# bits [63:32] = domain
# bits [31:28] = partition id
# bits [27:16] = reserved
# bits [15: 0] = pci bus/device/function
partition_num = (bdfid.value >> 28) & 0xf
pci_id = bdfid.value
partition_id = '{:d}'.format(partition_num)
if rsmi_ret_ok(ret, device, 'get_pci_id', silent):
# bits [31:28] or bits [2:0] = partition id
# bits [27:16] = reserved
# bits [15:8] = Bus
# bits [7:3] = Device
# bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes
partition_id = '{:d}'.format(partition_id.value)
if rsmi_ret_ok(ret, device, 'rsmi_dev_partition_id_get', silent):
return partition_id
+11 -11
查看文件
@@ -583,19 +583,19 @@ class rsmi_func_id_value_t(Union):
class rsmi_compute_partition_type_t(c_int):
RSMI_COMPUTE_PARTITION_INVALID = 0
RSMI_COMPUTE_PARTITION_CPX = 1
RSMI_COMPUTE_PARTITION_SPX = 2
RSMI_COMPUTE_PARTITION_DPX = 3
RSMI_COMPUTE_PARTITION_TPX = 4
RSMI_COMPUTE_PARTITION_QPX = 5
RSMI_COMPUTE_PARTITION_SPX = 1
RSMI_COMPUTE_PARTITION_DPX = 2
RSMI_COMPUTE_PARTITION_TPX = 3
RSMI_COMPUTE_PARTITION_QPX = 4
RSMI_COMPUTE_PARTITION_CPX = 5
rsmi_compute_partition_type_dict = {
#'RSMI_COMPUTE_PARTITION_INVALID': 0,
'CPX': 1,
'SPX': 2,
'DPX': 3,
'TPX': 4,
'QPX': 5
'SPX': 1,
'DPX': 2,
'TPX': 3,
'QPX': 4,
'CPX': 5,
}
rsmi_compute_partition_type = rsmi_compute_partition_type_t
@@ -604,7 +604,7 @@ rsmi_compute_partition_type = rsmi_compute_partition_type_t
# Usage example to get corresponding names:
# compute_partition_type_l[rsmi_compute_partition_type_t.RSMI_COMPUTE_PARTITION_CPX]
# will return string 'CPX'
compute_partition_type_l = ['CPX', 'SPX', 'DPX', 'TPX', 'QPX']
compute_partition_type_l = ['SPX', 'DPX', 'TPX', 'QPX', 'CPX']
class rsmi_memory_partition_type_t(c_int):
RSMI_MEMORY_PARTITION_UNKNOWN = 0
+133 -24
查看文件
@@ -754,13 +754,18 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
kfd_node->get_property_value("domain", &domain);
// Add domain to full pci_id:
// BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) |
// ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
// bits [63:32] = domain
// bits [31:28] = partition id in multi partition system
// bits [27:16] = reserved
// bits [15: 0] = pci bus/device/function
/**
* Add domain to full pci_id:
* BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) |
* ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7)
*
* bits [63:32] = domain
* bits [31:28] or bits [2:0] = partition id
* bits [27:16] = reserved
* bits [15:8] = Bus
* bits [7:3] = Device
* bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes
*/
assert((domain & 0xFFFFFFFF00000000) == 0);
(*bdfid) &= 0xFFFFFFFF; // keep bottom 32 bits of pci_id
*bdfid |= (domain & 0xFFFFFFFF) << 32; // Add domain to top of pci_id
@@ -4575,9 +4580,12 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst,
CATCH
}
static rsmi_status_t
get_compute_partition(uint32_t dv_ind, std::string &compute_partition) {
static rsmi_status_t get_compute_partition(uint32_t dv_ind,
std::string &compute_partition) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
CHK_SUPPORT_NAME_ONLY(compute_partition.c_str())
std::string compute_partition_str;
@@ -4601,6 +4609,8 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) {
return RSMI_STATUS_UNEXPECTED_DATA;
}
compute_partition = compute_partition_str;
ss << __PRETTY_FUNCTION__ << " | ======= END =======, " << dv_ind;
LOG_TRACE(ss);
return RSMI_STATUS_SUCCESS;
CATCH
}
@@ -4610,7 +4620,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
uint32_t len) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======, dv_ind = "
ss << __PRETTY_FUNCTION__ << " | ======= start =======, dv_ind = "
<< dv_ind;
LOG_TRACE(ss);
if ((len == 0) || (compute_partition == nullptr)) {
@@ -4646,7 +4656,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
return ret;
}
std::size_t length = returning_compute_partition.copy(compute_partition, len);
std::size_t length = returning_compute_partition.copy(compute_partition, len-1);
compute_partition[length]='\0';
if (len < (returning_compute_partition.size() + 1)) {
@@ -4680,20 +4690,47 @@ static rsmi_status_t
is_available_compute_partition(uint32_t dv_ind,
std::string new_compute_partition) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
DEVICE_MUTEX
std::string availableComputePartitions;
rsmi_status_t ret =
get_dev_value_line(amd::smi::kDevAvailableComputePartition,
dv_ind, &availableComputePartitions);
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | FAIL "
<< " | Device #: " << dv_ind
<< " | Type: "
<< devInfoTypesStrings.at(amd::smi::kDevAvailableComputePartition)
<< " | Data: could not retrieve requested data"
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
LOG_ERROR(ss);
return ret;
}
bool isComputePartitionAvailable =
amd::smi::containsString(availableComputePartitions,
new_compute_partition);
return (isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS :
RSMI_STATUS_SETTING_UNAVAILABLE;
ret = ((isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS :
RSMI_STATUS_SETTING_UNAVAILABLE);
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Type: "
<< devInfoTypesStrings.at(amd::smi::kDevAvailableComputePartition)
<< " | Data: available_partitions = " << availableComputePartitions
<< " | Data: isComputePartitionAvailable = "
<< (isComputePartitionAvailable ? "True" : "False")
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
LOG_INFO(ss);
return ret;
CATCH
}
@@ -4702,16 +4739,14 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
rsmi_compute_partition_type_t compute_partition) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
if (!amd::smi::is_sudo_user()) {
return RSMI_STATUS_PERMISSION;
}
DEVICE_MUTEX
std::string newComputePartitionStr
= mapRSMIToStringComputePartitionTypes.at(compute_partition);
std::string currentComputePartition;
std::string currentComputePartition = "";
std::string newComputePartitionStr = "";
switch (compute_partition) {
case RSMI_COMPUTE_PARTITION_CPX:
@@ -4719,9 +4754,13 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
case RSMI_COMPUTE_PARTITION_DPX:
case RSMI_COMPUTE_PARTITION_TPX:
case RSMI_COMPUTE_PARTITION_QPX:
newComputePartitionStr =
mapRSMIToStringComputePartitionTypes.at(compute_partition);
break;
case RSMI_COMPUTE_PARTITION_INVALID:
default:
newComputePartitionStr =
mapRSMIToStringComputePartitionTypes.at(RSMI_COMPUTE_PARTITION_INVALID);
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
@@ -4798,8 +4837,8 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
<< "| sizeof string = " << std::dec
<< sizeof(newComputePartitionStr);
LOG_DEBUG(ss);
GET_DEV_FROM_INDX
DEVICE_MUTEX
int ret = dev->writeDevInfo(amd::smi::kDevComputePartition,
newComputePartitionStr);
rsmi_status_t returnResponse = amd::smi::ErrnoToRsmiStatus(ret);
@@ -4814,7 +4853,6 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
<< getRSMIStatusString(returnResponse) << " |";
LOG_TRACE(ss);
// TODO(charpoag): investigate providing GPU busy state occured with
return returnResponse;
CATCH
}
@@ -4822,6 +4860,9 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
static rsmi_status_t get_memory_partition(uint32_t dv_ind,
std::string &memory_partition) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
CHK_SUPPORT_NAME_ONLY(memory_partition.c_str())
std::string val_str;
@@ -4845,6 +4886,8 @@ static rsmi_status_t get_memory_partition(uint32_t dv_ind,
return RSMI_STATUS_UNEXPECTED_DATA;
}
memory_partition = val_str;
ss << __PRETTY_FUNCTION__ << " | ======= END =======, " << dv_ind;
LOG_TRACE(ss);
return RSMI_STATUS_SUCCESS;
CATCH
}
@@ -4854,7 +4897,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
rsmi_memory_partition_type_t memory_partition) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
@@ -4989,7 +5032,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
uint32_t len) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
if ((len == 0) || (memory_partition == nullptr)) {
ss << __PRETTY_FUNCTION__
@@ -5059,7 +5102,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind;
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
@@ -5098,7 +5141,7 @@ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) {
rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) {
TRY
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind;
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
@@ -5134,6 +5177,72 @@ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) {
CATCH
}
/**
 * Retrieves the partition ID of device @p dv_ind into @p partition_id.
 *
 * The ID is first read from bits [31:28] of the PCI ID (BDFID). If that field
 * is 0 while the device reports a multi-partition compute mode (DPX, TPX, QPX
 * or CPX), the PCI function bits [2:0] are used instead.
 *
 * @param[in]  dv_ind        device index
 * @param[out] partition_id  receives the partition ID; set to UINT32_MAX when
 *                           the PCI ID could not be read
 *
 * @retval RSMI_STATUS_INVALID_ARGS  @p partition_id is nullptr
 * @retval otherwise                 the status returned by rsmi_dev_pci_id_get()
 */
rsmi_status_t
rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
  TRY
  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind;
  LOG_TRACE(ss);
  if (partition_id == nullptr) {
    ss << __PRETTY_FUNCTION__
       << " | ======= end ======= "
       << " | FAIL"
       << " | Device #: " << dv_ind
       << " | Type: partition_id"
       << " | Data: nullptr"
       << " | Returning = "
       << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
    LOG_ERROR(ss);
    return RSMI_STATUS_INVALID_ARGS;
  }
  // NOTE(review): the device mutex is held while calling other public rsmi_*
  // getters below - confirm the mutex may be re-acquired by the same thread.
  DEVICE_MUTEX

  // The compute partition mode is only a hint for the fallback below; a
  // failure here is deliberately not fatal and leaves the mode as "UNKNOWN".
  std::string strCompPartition = "UNKNOWN";
  const uint32_t PARTITION_LEN = 10;
  char compute_partition[PARTITION_LEN];
  rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition,
                                                     PARTITION_LEN);
  if (ret == RSMI_STATUS_SUCCESS) {
    strCompPartition = compute_partition;
  }

  uint64_t pci_id = UINT64_MAX;
  *partition_id = UINT32_MAX;
  ret = rsmi_dev_pci_id_get(dv_ind, &pci_id);
  if (ret != RSMI_STATUS_SUCCESS) {
    // Report the real failure instead of logging an unconditional "Success".
    ss << __PRETTY_FUNCTION__
       << " | ======= end ======= "
       << " | FAIL"
       << " | Device #: " << dv_ind
       << " | Type: partition_id"
       << " | Data: could not retrieve requested data"
       << " | Returning = "
       << getRSMIStatusString(ret) << " |";
    LOG_ERROR(ss);
    return ret;
  }
  *partition_id = static_cast<uint32_t>((pci_id >> 28) & 0xf);

  /**
   * Fall back is required due to driver changes within KFD.
   * Some devices may report the partition ID in bits [31:28] or in bits [2:0].
   * With the newly added rsmi_dev_partition_id_get(..),
   * we provided this fallback to properly retrieve the partition ID. We
   * plan to eventually remove partition ID from the function portion of the
   * BDF (Bus Device Function). See below for PCI ID description.
   *
   * bits [63:32] = domain
   * bits [31:28] or bits [2:0] = partition id
   * bits [27:16] = reserved
   * bits [15:8]  = Bus
   * bits [7:3] = Device
   * bits [2:0] = Function (partition id may be in bits [2:0]) <-- Fallback for non SPX modes
   */
  const bool isMultiPartitionMode =
      (strCompPartition == "DPX" || strCompPartition == "TPX"
       || strCompPartition == "CPX" || strCompPartition == "QPX");
  if (*partition_id == 0 && isMultiPartitionMode) {
    *partition_id = static_cast<uint32_t>(pci_id & 0x7);
  }

  ss << __PRETTY_FUNCTION__
     << " | ======= end ======= "
     << " | Success"
     << " | Device #: " << dv_ind
     << " | Type: partition_id"
     << " | Data: " << *partition_id
     << " | Returning = "
     << getRSMIStatusString(ret) << " |";
  LOG_INFO(ss);
  return ret;
  CATCH
}
rsmi_status_t rsmi_dev_target_graphics_version_get(uint32_t dv_ind,
uint64_t *gfx_version) {
TRY
+1 -1
查看文件
@@ -265,7 +265,7 @@ void ROCmLogging::Logger::buffer(const char* text) throw() {
// and timestamp in the buffer message. Just log the raw bytes.
if ((m_LogType == FILE_LOG) && (m_LogLevel >= LOG_LEVEL_BUFFER)) {
lock();
if(!m_File.is_open()) {
if (!m_File.is_open()) {
initialize_resources();
if (!m_File.is_open()) {
std::cout << "WARNING: re-initializing resources was unsuccessful."
+36 -8
查看文件
@@ -235,15 +235,7 @@ RocmSMI::Initialize(uint64_t flags) {
int i_ret;
std::ostringstream ss;
LOG_ALWAYS("=============== ROCM SMI initialize ================");
ROCmLogging::Logger::getInstance()->enableAllLogLevels();
// Leaving below to allow developers to check current log settings
// std::string logSettings = Logger::getInstance()->getLogSettings();
// std::cout << "Current log settings:\n" << logSettings << std::endl;
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
logSystemDetails();
}
assert(ref_count_ == 1);
if (ref_count_ != 1) {
@@ -259,6 +251,15 @@ RocmSMI::Initialize(uint64_t flags) {
// To help debug env variable issues
// debugRSMIEnvVarInfo();
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
ROCmLogging::Logger::getInstance()->enableAllLogLevels();
LOG_ALWAYS("=============== ROCM SMI initialize ================");
logSystemDetails();
}
// Leaving below to allow developers to check current log settings
// std::string logSettings = ROCmLogging::Logger::getInstance()->getLogSettings();
// std::cout << "Current log settings:\n" << logSettings << std::endl;
while (!std::string(kAMDMonitorTypes[i]).empty()) {
amd_monitor_types_.insert(kAMDMonitorTypes[i]);
++i;
@@ -863,6 +864,15 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
<< " BDF = " << std::to_string(primaryBdfId)
<< " (" << print_int_as_hex(primaryBdfId) << ")";
LOG_DEBUG(ss);
if (doesDeviceSupportPartitions && strCompPartition != "SPX"
&& i->second.s_partition_id == 0) {
i->second.s_partition_id = i->second.s_function;
ss << __PRETTY_FUNCTION__ << " | (secondary node add) fall back - "
<< "detected !SPX && partition_id == 0"
<< "; function = " << std::to_string(i->second.s_function)
<< "; partition_id = " << std::to_string(i->second.s_partition_id);
LOG_DEBUG(ss);
}
ss << __PRETTY_FUNCTION__
<< " | (secondary node add) B4 AddToDeviceList() -->"
<< "\n[node_id = " << std::to_string(i->second.s_node_id)
@@ -881,6 +891,15 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
} else {
ss << __PRETTY_FUNCTION__ << " | primary node add ; "
<< " BDF = " << std::to_string(UINT64_MAX);
if (doesDeviceSupportPartitions && strCompPartition != "SPX"
&& i->second.s_partition_id == 0) {
i->second.s_partition_id = i->second.s_function;
ss << __PRETTY_FUNCTION__ << " | (primary node add) fall back - "
<< "detected !SPX && partition_id == 0"
<< "; function = " << std::to_string(i->second.s_function)
<< "; partition_id = " << std::to_string(i->second.s_partition_id);
LOG_DEBUG(ss);
}
LOG_DEBUG(ss);
ss << __PRETTY_FUNCTION__
<< " | (primary node add) After AddToDeviceList() -->"
@@ -1010,6 +1029,15 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
<< " BDF = " << std::to_string(myBdfId)
<< " (" << print_int_as_hex(myBdfId) << ")";
LOG_DEBUG(ss);
if (doesDeviceSupportPartitions && strCompPartition != "SPX"
&& it->second.s_partition_id == 0) {
it->second.s_partition_id = it->second.s_function;
ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) fall back - "
<< "detected !SPX && partition_id == 0"
<< "; function = " << std::to_string(it->second.s_function)
<< "; partition_id = " << std::to_string(it->second.s_partition_id);
LOG_DEBUG(ss);
}
ss << __PRETTY_FUNCTION__
<< " | (secondary node add #2) B4 AddToDeviceList() -->"
<< "\n[node_id = " << std::to_string(it->second.s_node_id)
+235 -57
查看文件
@@ -88,6 +88,13 @@ void TestComputePartitionReadWrite::Close() {
TestBase::Close();
}
// Number of consecutive device indices whose partition ID is checked for each
// compute partition mode (selected as max_loop in checkPartitionIdChanges).
// MAX_CPX_PARTITIONS is only a default: for CPX it is replaced by the XCD
// count when rsmi_dev_metrics_xcd_counter_get() succeeds.
// NOTE(review): MAX_UNSUPPORTED_PARTITIONS is not referenced in the visible
// part of this file - confirm it is used elsewhere.
const uint32_t MAX_UNSUPPORTED_PARTITIONS = 0;
const uint32_t MAX_SPX_PARTITIONS = 1;
const uint32_t MAX_DPX_PARTITIONS = 2;
const uint32_t MAX_TPX_PARTITIONS = 3;
const uint32_t MAX_QPX_PARTITIONS = 4;
const uint32_t MAX_CPX_PARTITIONS = 8;
static const std::string
computePartitionString(rsmi_compute_partition_type computeParitionType) {
/**
@@ -139,50 +146,186 @@ static void system_wait(int seconds) {
static const std::map<std::string, rsmi_compute_partition_type_t>
mapStringToRSMIComputePartitionTypes {
{"CPX", RSMI_COMPUTE_PARTITION_CPX},
{"SPX", RSMI_COMPUTE_PARTITION_SPX},
{"DPX", RSMI_COMPUTE_PARTITION_DPX},
{"TPX", RSMI_COMPUTE_PARTITION_TPX},
{"QPX", RSMI_COMPUTE_PARTITION_QPX}
{"QPX", RSMI_COMPUTE_PARTITION_QPX},
{"CPX", RSMI_COMPUTE_PARTITION_CPX},
{"SPX", RSMI_COMPUTE_PARTITION_SPX}
};
void TestComputePartitionReadWrite::Run(void) {
rsmi_status_t ret, err;
char orig_char_computePartition[255];
char current_char_computePartition[255];
static void checkPartitionIdChanges(
uint32_t dev, const std::string current_partition, bool isVerbose,
bool reinitialize) {
uint32_t max_loop = MAX_SPX_PARTITIONS;
TestBase::Run();
if (setup_failed_) {
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
return;
// re-initialize to ensure new device ordering is followed
if (reinitialize) {
if (isVerbose) {
std::cout << "\t**Reinitializing device list due to parition changes.\n";
}
rsmi_shut_down();
rsmi_init(0);
}
// Confirm system supports compute partition, before executing wait
ret = rsmi_dev_compute_partition_get(0, orig_char_computePartition, 255);
if (ret == RSMI_STATUS_SUCCESS) {
system_wait(25);
}
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
if (dv_ind != 0) {
IF_VERB(STANDARD) {
std::cout << std::endl;
if (current_partition == "DPX") {
max_loop = MAX_DPX_PARTITIONS;
} else if (current_partition == "TPX") {
max_loop = MAX_TPX_PARTITIONS;
} else if (current_partition == "QPX") {
max_loop = MAX_QPX_PARTITIONS;
} else if (current_partition == "CPX") {
max_loop = MAX_CPX_PARTITIONS;
uint16_t num_xcd;
rsmi_status_t ret = rsmi_dev_metrics_xcd_counter_get(dev, &num_xcd);
if (ret == RSMI_STATUS_SUCCESS) {
max_loop = num_xcd;
if (isVerbose) {
std::cout << "\t**Expecting num_xcd = " << num_xcd << " to equal "
"total CPX nodes\n";
}
}
PrintDeviceHeader(dv_ind);
bool devicePartitionUpdated = false;
}
// Standard checks to see if API is supported, before running full tests
ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition,
255);
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
IF_VERB(STANDARD) {
std::cout << "\t**" << ": "
<< "Not supported on this device" << std::endl;
for (uint32_t i = dev; i < dev + max_loop; i++) {
uint32_t partition_id;
rsmi_status_t ret = rsmi_dev_partition_id_get(i, &partition_id);
std::cout << "\t** Checking Partition ID | Device: " << std::to_string(i)
<< "; Current Partition: " << current_partition
<< " ; Max partition IDs to check: " << max_loop << "\n";
ASSERT_EQ(ret, RSMI_STATUS_SUCCESS);
if (ret == RSMI_STATUS_SUCCESS && current_partition == "SPX") {
ASSERT_LT(partition_id, max_loop);
if (isVerbose) {
std::cout << "\n\t**Confirmed partition_id < " << max_loop
<< " for SPX"
<< "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) +
", &partition_id); partition_id = "
<< static_cast<uint32_t>(partition_id) << std::endl;
}
} else if (ret == RSMI_STATUS_SUCCESS && current_partition == "DPX") {
ASSERT_LT(partition_id, max_loop);
if (isVerbose) {
std::cout << "\n\t**Confirmed partition_id < " << max_loop
<< " for DPX"
<< "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) +
", &partition_id); partition_id = "
<< static_cast<uint32_t>(partition_id) << std::endl;
}
} else if (ret == RSMI_STATUS_SUCCESS && current_partition == "TPX") {
ASSERT_LT(partition_id, max_loop);
if (isVerbose) {
std::cout << "\n\t**Confirmed partition_id < "
<< max_loop << " for TPX"
<< "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) +
", &partition_id); partition_id = "
<< static_cast<uint32_t>(partition_id) << std::endl;
}
} else if (ret == RSMI_STATUS_SUCCESS && current_partition == "QPX") {
ASSERT_LT(partition_id, max_loop);
if (isVerbose) {
std::cout << "\n\t**Confirmed partition_id < "
<< max_loop << " for QPX"
<< "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) +
", &partition_id); partition_id = "
<< static_cast<uint32_t>(partition_id) << std::endl;
}
} else if (ret == RSMI_STATUS_SUCCESS && current_partition == "CPX") {
ASSERT_LT(partition_id, max_loop);
if (isVerbose) {
std::cout << "\n\t**Confirmed partition_id < "
<< max_loop << " for CPX"
<< "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) +
", &partition_id); partition_id = "
<< static_cast<uint32_t>(partition_id) << std::endl;
}
} else if (ret == RSMI_STATUS_SUCCESS && current_partition == "UNKNOWN") {
ASSERT_EQ(partition_id, max_loop - 1);
if (isVerbose) {
std::cout << "\n\t**Confirmed partition_id = "
<< (max_loop - 1)
<< " for current_partition = UNKNOWN"
<< "\n\t**rsmi_dev_partition_id_get(" + std::to_string(i) +
", &partition_id); partition_id = "
<< static_cast<uint32_t>(partition_id) << std::endl;
}
}
}
}
void TestComputePartitionReadWrite::Run(void) {
rsmi_status_t ret, err;
char orig_char_computePartition[255];
orig_char_computePartition[0] = '\0';
char current_char_computePartition[255];
current_char_computePartition[0] = '\0';
TestBase::Run();
if (setup_failed_) {
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
return;
}
bool isVerbose = (this->verbosity() &&
this->verbosity() >= (this->TestBase::VERBOSE_STANDARD)) ? true: false;
// Confirm system supports compute partition, before executing wait
ret = rsmi_dev_compute_partition_get(0, orig_char_computePartition, 255);
if (ret == RSMI_STATUS_SUCCESS) {
system_wait(15);
}
// initial_num_devices - keep this value static, due to partition changes
// fluctuating # of devices. We should end up with same # of devices at
// end of test.
uint32_t initial_num_devices = num_monitor_devs();
for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) {
if (dv_ind >= 0) {
IF_VERB(STANDARD) {
std::cout << std::endl;
std::cout << "\t**"
<< "========= LOOP THROUGH DEVICES - DEVICE #"
<< std::to_string(dv_ind) << " =============="
<< std::endl;
}
}
PrintDeviceHeader(dv_ind);
bool devicePartitionUpdated = false;
ret = rsmi_dev_partition_id_get(dv_ind, nullptr);
ASSERT_EQ(ret, RSMI_STATUS_INVALID_ARGS);
IF_VERB(STANDARD) {
if (ret == RSMI_STATUS_INVALID_ARGS) {
std::cout << "\t**" << "Confirmed rsmi_dev_partition_id_get(..,nullptr): "
<< "RSMI_STATUS_INVALID_ARGS" << std::endl;
}
}
std::string partitionStr = "";
ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, 255);
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
IF_VERB(STANDARD) {
std::cout << "\t**rsmi_dev_compute_partition_get(): "
<< "Not supported on this device"
<< std::endl;
}
partitionStr = orig_char_computePartition;
if (partitionStr.empty()) {
partitionStr = computePartitionString(
rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_INVALID);
}
// Regardless of partition support - no changes made, so no device
// refresh needed (ie. rsmi_dev_compute_partition_set(..))
checkPartitionIdChanges(dv_ind, partitionStr, isVerbose, false);
continue;
} else {
CHK_ERR_ASRT(ret)
std::string partitionStr = orig_char_computePartition;
if (partitionStr.empty()) {
partitionStr = computePartitionString(
rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_INVALID);
}
// Regardless of partition support - no changes made, so no device
// refresh needed (ie. rsmi_dev_compute_partition_set(..))
checkPartitionIdChanges(dv_ind, partitionStr, isVerbose, false);
}
IF_VERB(STANDARD) {
std::cout << std::endl << "\t**"
@@ -236,32 +379,12 @@ void TestComputePartitionReadWrite::Run(void) {
}
}
// Verify api support checking functionality is working
rsmi_compute_partition_type_t breakMe;
err = rsmi_dev_compute_partition_set(dv_ind, breakMe);
std::cout << "\t**rsmi_dev_compute_partition_set(null ptr): "
<< amd::smi::getRSMIStatusString(err, false) << "\n";
ASSERT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
(err == RSMI_STATUS_NOT_SUPPORTED) ||
(err == RSMI_STATUS_PERMISSION));
IF_VERB(STANDARD) {
if (err == RSMI_STATUS_INVALID_ARGS) {
std::cout << "\t**"
<< "Confirmed RSMI_STATUS_INVALID_ARGS was returned."
<< std::endl;
} else if (err == RSMI_STATUS_PERMISSION) {
DISPLAY_RSMI_ERR(err)
// tests should not continue if err is a permission issue
ASSERT_FALSE(err == RSMI_STATUS_PERMISSION);
} else {
DISPLAY_RSMI_ERR(err)
}
}
// Re-run original get, so we can reset to later
ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition,
255);
ASSERT_EQ(RSMI_STATUS_SUCCESS, ret);
std::cout << "\t**rsmi_dev_compute_partition_get(" << dv_ind
<< ", " << orig_char_computePartition << ")\n";
/**
* RSMI_COMPUTE_PARTITION_INVALID = 0,
@@ -277,8 +400,27 @@ void TestComputePartitionReadWrite::Run(void) {
* //!< work together with shared memory
*/
for (int partition = static_cast<int>(RSMI_COMPUTE_PARTITION_CPX);
partition <= static_cast<int>(RSMI_COMPUTE_PARTITION_QPX);
/**
* General Loop Logic:
* [0:SPX, 1:SPX, 2:SPX, 3:SPX]
* [0:DPX, 1:DPX, 2: SPX, 3:SPX, 4:SPX] <- set 0 to DPX
* [0:TPX, 1:TPX, 2:TPX, 3:SPX, 4:SPX, 5:SPX] <- set 0 to TPX
* [0:QPX, 1:QPX, 2:QPX, 3:QPX, 4:SPX, 5:SPX, 6:SPX] <- set 0 to QPX
* [0:CPX, 1:CPX, 2:CPX, 3:CPX, 4:CPX, 5:SPX, 6:SPX, 7:SPX] <- set 0 to CPX
* [0:SPX, 1:SPX, 2:SPX, 3:SPX] <- reset(0)
* +1 index
* [0:SPX, 1:SPX, 2:SPX, 3:SPX]
* [0:SPX, 1:DPX, 2: DPX, 3:SPX, 4:SPX] <- set 1 to DPX
* [0:SPX, 1:TPX, 2:TPX, 3:TPX, 4:SPX, 5:SPX] <- set 1 to TPX
* [0:SPX, 1:QPX, 2:QPX, 3:QPX, 4:QPX, 5:SPX, 6:SPX] <- set 1 to QPX
* [0:SPX, 1:CPX, 2:CPX, 3:CPX, 4:CPX, 5:CPX, 6:SPX, 7:SPX] <- set 1 to CPX
* [0:SPX, 1:SPX, 2:SPX, 3:SPX] <- reset(1)
* ...
*
*/
std::string final_partition_state = "UNKNOWN";
for (int partition = static_cast<int>(RSMI_COMPUTE_PARTITION_SPX);
partition <= static_cast<int>(RSMI_COMPUTE_PARTITION_CPX);
partition++) {
rsmi_compute_partition_type_t updatePartition
= static_cast<rsmi_compute_partition_type_t>(partition);
@@ -292,7 +434,8 @@ void TestComputePartitionReadWrite::Run(void) {
ret = rsmi_dev_compute_partition_set(dv_ind, updatePartition);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "rsmi_dev_compute_partition_set(dv_ind, updatePartition): "
<< "rsmi_dev_compute_partition_set(" << dv_ind
<< ", updatePartition): "
<< amd::smi::getRSMIStatusString(ret, false) << "\n"
<< "\t**New Partition (set): "
<< computePartitionString(updatePartition) << "\n";
@@ -341,6 +484,7 @@ void TestComputePartitionReadWrite::Run(void) {
if (strcmp(orig_char_computePartition, current_char_computePartition) !=
0) {
devicePartitionUpdated = true;
final_partition_state = current_char_computePartition;
} else {
devicePartitionUpdated = false;
}
@@ -356,8 +500,13 @@ void TestComputePartitionReadWrite::Run(void) {
<< computePartitionString(updatePartition) << ")"
<< std::endl;
}
checkPartitionIdChanges(dv_ind, computePartitionString(updatePartition),
isVerbose, true);
}
} // END looping through partition changes
std::cout << "\t**=========== END PARTITION LOOP (dev = "
<< std::to_string(dv_ind) << ") ===========\n";
/* TEST RETURN TO BOOT COMPUTE PARTITION SETTING */
IF_VERB(STANDARD) {
@@ -371,7 +520,7 @@ void TestComputePartitionReadWrite::Run(void) {
ret = rsmi_dev_compute_partition_reset(dv_ind);
IF_VERB(STANDARD) {
std::cout << "\t**"
<< "rsmi_dev_compute_partition_reset(dv_ind): "
<< "rsmi_dev_compute_partition_reset(" << dv_ind << "): "
<< amd::smi::getRSMIStatusString(ret, false) << "\n";
}
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
@@ -393,6 +542,14 @@ void TestComputePartitionReadWrite::Run(void) {
<< "\t**" << "Partitions Updated: "
<< (devicePartitionUpdated ? "TRUE" : "FALSE") << "\n";
}
if (final_partition_state != std::string(current_char_computePartition)) {
checkPartitionIdChanges(dv_ind, std::string(current_char_computePartition),
isVerbose, true);
} else {
checkPartitionIdChanges(dv_ind, std::string(current_char_computePartition),
isVerbose, false);
}
if (wasResetSuccess && devicePartitionUpdated) {
ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition);
IF_VERB(STANDARD) {
@@ -401,6 +558,7 @@ void TestComputePartitionReadWrite::Run(void) {
<< "equal to current\n\t partition ("
<< current_char_computePartition << ")" << std::endl;
}
final_partition_state = std::string(current_char_computePartition);
} else {
ASSERT_STREQ(oldPartition.c_str(), current_char_computePartition);
IF_VERB(STANDARD) {
@@ -424,7 +582,12 @@ void TestComputePartitionReadWrite::Run(void) {
ret = rsmi_dev_compute_partition_set(dv_ind, newPartition);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "\t**" << "Returning compute partition to: "
std::cout << "\t**"
<< "rsmi_dev_compute_partition_set("
<< std::to_string(dv_ind) << ", "
<< std::string(orig_char_computePartition) << ")" << std::endl;
std::cout << "\t**"
<< "Returning compute partition to: "
<< computePartitionString(newPartition) << std::endl;
}
ret = rsmi_dev_compute_partition_get(dv_ind, current_char_computePartition,
@@ -440,5 +603,20 @@ void TestComputePartitionReadWrite::Run(void) {
ASSERT_EQ(RSMI_STATUS_SUCCESS, ret);
ASSERT_STREQ(computePartitionString(newPartition).c_str(),
current_char_computePartition);
// only refresh (rsmi_shut_down() -> rsmi_init(0)) device list
// if there was a partition change
if (final_partition_state != std::string(current_char_computePartition)) {
checkPartitionIdChanges(dv_ind, computePartitionString(newPartition),
isVerbose, true);
} else {
checkPartitionIdChanges(dv_ind, computePartitionString(newPartition),
isVerbose, false);
}
std::cout << "\t**"
<< "========= END LOOP THROUGH DEVICES - DEVICE #"
<< std::to_string(dv_ind) << " =============="
<< std::endl;
} // END looping through devices
std::cout << "\t**=========== END TEST ===========\n";
}
+1 -1
查看文件
@@ -229,7 +229,7 @@ const char *FreqEnumToStr(rsmi_clk_type rsmi_clk) {
}
void printRSMIError(rsmi_status_t err) {
std::cout << "err = " << amd::smi::getRSMIStatusString(err);
std::cout << "err = " << amd::smi::getRSMIStatusString(err) << "\n";
}
#if ENABLE_SMI