[SWDEV-451104] Update static --board + amdsmi_get_gpu_board_info()

Updates:
    * Expanded `amdsmi_get_gpu_board_info()` amdsmi_board_info_t structure size
      Updated sizes that work for retrieving relevant board
      information across AMD's ASIC products.
    * Fixed `amdsmi_get_gpu_board_info()` to no longer return junk char strings

Change-Id: Ie1553c6109d678d283d82c24e9284f8e19cd6ccc
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 4295bba37f]
This commit is contained in:
Charis Poag
2024-04-16 21:26:52 -05:00
vanhempi 96f603783c
commit aaed28aaab
6 muutettua tiedostoa jossa 195 lisäystä ja 62 poistoa
+11 -1
Näytä tiedosto
@@ -60,10 +60,14 @@ GPU PCIE_BW
### Changes
- **Updated `amdsmi_get_gpu_board_info()` now has larger structure sizes for `amdsmi_board_info_t`**.
Updated sizes that work for retreiving relavant board information across AMD's
ASIC products. This requires users to update any ABIs using this structure.
- **`amdsmi_get_power_cap_info` now returns values in uW instead of W**.
`amdsmi_get_power_cap_info` will return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW). Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same).
- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**.
- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**.
Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty.
- **Updated `amd-smi metric --ecc-blocks` output**.
@@ -108,6 +112,12 @@ amdsmi_get_gpu_process_info was removed from the C library in an earlier build,
### Fixes
- **`amdsmi_get_gpu_board_info()` no longer returns junk char strings**.
Previously if there was a partial failure to retrieve character strings, we would return
garbage output to users using the API. This fix intends to populate as many values as possible.
Then any failure(s) found along the way, `\0` is provided to `amdsmi_board_info_t`
structures data members which cannot be populated. Ensuring empty char string values.
- **Fixed `amd-smi metric --power` now provides power output for Navi2x/Navi3x/MI1x**.
These systems use an older version of gpu_metrics in amdgpu. This fix only updates what CLI outputs.
No change in any of our APIs.
@@ -83,7 +83,7 @@ typedef enum {
#define AMDSMI_MAX_DEVICES 32
#define AMDSMI_MAX_NAME 32
#define AMDSMI_MAX_DRIVER_VERSION_LENGTH 80
#define AMDSMI_PRODUCT_NAME_LENGTH 128
#define AMDSMI_256_LENGTH 256
#define AMDSMI_MAX_CONTAINER_TYPE 2
#define AMDSMI_MAX_CACHE_TYPES 10
#define AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK 64
@@ -573,7 +573,7 @@ typedef struct {
} amdsmi_fw_info_t;
typedef struct {
char market_name[AMDSMI_MAX_STRING_LENGTH];
char market_name[AMDSMI_256_LENGTH];
uint32_t vendor_id; //< Use 32 bit to be compatible with other platform.
char vendor_name[AMDSMI_MAX_STRING_LENGTH];
uint32_t subvendor_id; //< The subsystem vendor id
@@ -619,11 +619,11 @@ typedef struct {
} amdsmi_driver_info_t;
typedef struct {
char model_number[AMDSMI_NORMAL_STRING_LENGTH];
char model_number[AMDSMI_256_LENGTH];
char product_serial[AMDSMI_NORMAL_STRING_LENGTH];
char fru_id[AMDSMI_NORMAL_STRING_LENGTH];
char product_name[AMDSMI_PRODUCT_NAME_LENGTH];
char manufacturer_name[AMDSMI_NORMAL_STRING_LENGTH];
char product_name[AMDSMI_256_LENGTH];
char manufacturer_name[AMDSMI_MAX_STRING_LENGTH];
uint32_t reserved[32];
} amdsmi_board_info_t;
@@ -539,6 +539,21 @@ def _make_amdsmi_bdf_from_list(bdf):
amdsmi_bdf.fields.domain_number = bdf[0]
return amdsmi_bdf
def _padHexValue(value, length):
""" Pad a hexadecimal value with a given length of zeros
:param value: A hexadecimal value to be padded with zeros
:param length: Number of zeros to pad the hexadecimal value
:param return original string string or
padded hex of confirmed hex output (using length provided)
"""
# Ensure value entered meets the minimum length and is hexadecimal
if len(value) > 2 and length > 1 and value[:2].lower() == '0x' \
and all(c in '0123456789abcdefABCDEF' for c in value[2:]):
# Pad with zeros after '0x' prefix
return '0x' + value[2:].zfill(length)
return value
def amdsmi_get_socket_handles() -> List[amdsmi_wrapper.amdsmi_socket_handle]:
"""
@@ -1585,12 +1600,12 @@ def amdsmi_get_gpu_asic_info(
)
asic_info = {
"market_name": asic_info_struct.market_name.decode("utf-8"),
"market_name": _padHexValue(asic_info_struct.market_name.decode("utf-8"), 4),
"vendor_id": asic_info_struct.vendor_id,
"vendor_name": asic_info_struct.vendor_name.decode("utf-8"),
"subvendor_id": asic_info_struct.subvendor_id,
"device_id": asic_info_struct.device_id,
"rev_id": hex(asic_info_struct.rev_id),
"rev_id": _padHexValue(hex(asic_info_struct.rev_id), 2),
"asic_serial": asic_info_struct.asic_serial.decode("utf-8"),
"oam_id": asic_info_struct.oam_id
}
@@ -1863,17 +1878,16 @@ def amdsmi_get_gpu_board_info(
)
board_info_dict = {
"model_number": board_info.model_number.decode("utf-8").strip(),
"model_number": _padHexValue(board_info.model_number.decode("utf-8").strip(), 4),
"product_serial": board_info.product_serial.decode("utf-8").strip(),
"fru_id": board_info.fru_id.decode("utf-8").strip(),
"product_name": board_info.product_name.decode("utf-8").strip(),
"product_name": _padHexValue(board_info.product_name.decode("utf-8").strip(), 4),
"manufacturer_name": board_info.manufacturer_name.decode("utf-8").strip()
}
for key, value in board_info_dict.items():
if value == "":
board_info_dict[key] = "N/A"
return board_info_dict
@@ -882,7 +882,7 @@ class struct_amdsmi_asic_info_t(Structure):
struct_amdsmi_asic_info_t._pack_ = 1 # source:False
struct_amdsmi_asic_info_t._fields_ = [
('market_name', ctypes.c_char * 64),
('market_name', ctypes.c_char * 256),
('vendor_id', ctypes.c_uint32),
('vendor_name', ctypes.c_char * 64),
('subvendor_id', ctypes.c_uint32),
@@ -961,11 +961,11 @@ class struct_amdsmi_board_info_t(Structure):
struct_amdsmi_board_info_t._pack_ = 1 # source:False
struct_amdsmi_board_info_t._fields_ = [
('model_number', ctypes.c_char * 32),
('model_number', ctypes.c_char * 256),
('product_serial', ctypes.c_char * 32),
('fru_id', ctypes.c_char * 32),
('product_name', ctypes.c_char * 128),
('manufacturer_name', ctypes.c_char * 32),
('product_name', ctypes.c_char * 256),
('manufacturer_name', ctypes.c_char * 64),
('reserved', ctypes.c_uint32 * 32),
]
+74 -6
Näytä tiedosto
@@ -389,7 +389,6 @@ amdsmi_get_gpu_device_bdf(amdsmi_processor_handle processor_handle, amdsmi_bdf_t
}
amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_handle, amdsmi_board_info_t *board_info) {
AMDSMI_CHECK_INIT();
if (board_info == nullptr) {
@@ -405,17 +404,86 @@ amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_hand
if (gpu_device->check_if_drm_is_supported()) {
// Populate product_serial, product_name, & product_number from sysfs
status = smi_amdgpu_get_board_info(gpu_device, board_info);
}
else {
} else {
// ignore the errors so that it can populate as many fields as possible.
// call rocm-smi which search multiple places for device name
status = rsmi_wrapper(rsmi_dev_name_get, processor_handle,
board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH);
board_info->product_name, AMDSMI_256_LENGTH);
status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle,
board_info->product_serial, AMDSMI_NORMAL_STRING_LENGTH);
}
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << "[Before rocm smi correction] "
<< "Returning status = AMDSMI_STATUS_SUCCESS"
<< "\n; info->model_number: |" << board_info->model_number << "|"
<< "\n; info->product_serial: |" << board_info->product_serial << "|"
<< "\n; info->fru_id: |" << board_info->fru_id << "|"
<< "\n; info->manufacturer_name: |" << board_info->manufacturer_name << "|"
<< "\n; info->product_name: |" << board_info->product_name << "|";
LOG_INFO(ss);
// Correct any missing details
if (board_info->model_number[0] == '\0') {
status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, board_info->model_number,
AMDSMI_256_LENGTH);
if (status != AMDSMI_STATUS_SUCCESS) {
memset(board_info->model_number, 0,
AMDSMI_256_LENGTH * sizeof(board_info->model_number[0]));
}
ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->model_number= |"
<< board_info->model_number << "|";
LOG_INFO(ss);
}
if (board_info->product_serial[0] == '\0') {
status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle,
board_info->product_serial, AMDSMI_NORMAL_STRING_LENGTH);
if (status != AMDSMI_STATUS_SUCCESS) {
memset(board_info->product_serial, 0,
AMDSMI_NORMAL_STRING_LENGTH * sizeof(board_info->product_serial[0]));
}
ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->product_serial= |"
<< board_info->product_serial << "|";
LOG_INFO(ss);
}
if (board_info->product_name[0] == '\0') {
status = rsmi_wrapper(rsmi_dev_subsystem_name_get,
processor_handle, board_info->product_name,
AMDSMI_256_LENGTH);
if (status != AMDSMI_STATUS_SUCCESS) {
memset(board_info->product_name, 0,
AMDSMI_256_LENGTH * sizeof(board_info->product_name[0]));
}
ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->product_name= |"
<< board_info->product_name << "|";
LOG_INFO(ss);
}
if (board_info->manufacturer_name[0] == '\0') {
status = rsmi_wrapper(rsmi_dev_vendor_name_get,
processor_handle, board_info->manufacturer_name,
AMDSMI_MAX_STRING_LENGTH);
if (status != AMDSMI_STATUS_SUCCESS) {
memset(board_info->manufacturer_name, 0,
AMDSMI_MAX_STRING_LENGTH * sizeof(board_info->manufacturer_name[0]));
}
ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->manufacturer_name= |"
<< board_info->manufacturer_name << "|";
LOG_INFO(ss);
}
ss << __PRETTY_FUNCTION__ << "[After rocm smi correction] "
<< "Returning status = AMDSMI_STATUS_SUCCESS"
<< "\n; info->model_number: |" << board_info->model_number << "|"
<< "\n; info->product_serial: |" << board_info->product_serial << "|"
<< "\n; info->fru_id: |" << board_info->fru_id << "|"
<< "\n; info->manufacturer_name: |" << board_info->manufacturer_name << "|"
<< "\n; info->product_name: |" << board_info->product_name << "|";
LOG_INFO(ss);
return AMDSMI_STATUS_SUCCESS;
}
@@ -654,7 +722,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
status = smi_amdgpu_get_market_name_from_dev_id(dev_info.device_id, info->market_name);
if (status != AMDSMI_STATUS_SUCCESS) {
rsmi_wrapper(rsmi_dev_brand_get, processor_handle,
info->market_name, AMDSMI_NORMAL_STRING_LENGTH);
info->market_name, AMDSMI_256_LENGTH);
}
info->device_id = dev_info.device_id;
@@ -667,7 +735,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
if (status == AMDSMI_STATUS_SUCCESS) snprintf(info->asic_serial, sizeof(info->asic_serial), "%lu", dv_uid);
status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle,
info->market_name, AMDSMI_NORMAL_STRING_LENGTH);
info->market_name, AMDSMI_256_LENGTH);
status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle,
&vendor_id);
@@ -36,7 +36,9 @@
#include <fstream>
#include <iostream>
#include <regex>
#include <cstdio>
#include <sstream>
#include <iterator>
#include <sys/ioctl.h>
#include <algorithm>
#include <limits.h>
@@ -45,6 +47,69 @@
#include "shared_mutex.h" // NOLINT
#include "rocm_smi/rocm_smi_logger.h"
std::string leftTrim(const std::string &s) {
if (!s.empty()) {
return std::regex_replace(s, std::regex("^\\s+"), "");
}
return s;
}
std::string rightTrim(const std::string &s) {
if (!s.empty()) {
return std::regex_replace(s, std::regex("\\s+$"), "");
}
return s;
}
std::string removeNewLines(const std::string &s) {
if (!s.empty()) {
return std::regex_replace(s, std::regex("\n+"), "");
}
return s;
}
std::string trim(const std::string &s) {
if (!s.empty()) {
// remove new lines -> trim white space at ends
std::string noNewLines = removeNewLines(s);
return leftTrim(rightTrim(noNewLines));
}
return s;
}
// Given original string and string to remove (removeMe)
// Return will provide the resulting modified string with the removed string(s)
std::string removeString(const std::string origStr,
const std::string &removeMe) {
std::string modifiedStr = origStr;
std::string::size_type l = removeMe.length();
for (std::string::size_type i = modifiedStr.find(removeMe);
i != std::string::npos;
i = modifiedStr.find(removeMe)) {
modifiedStr.erase(i, l);
}
return modifiedStr;
}
void openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff) {
bool errorDiscovered = false;
std::ifstream file(path, std::ifstream::in);
std::string contents = {std::istreambuf_iterator<char>{file}, std::istreambuf_iterator<char>{}};
memset(buff, 0, sizeof(char) * sizeOfBuff);
if (!file.is_open()) {
errorDiscovered = true;
} else {
contents = trim(contents);
}
file.close();
if (!errorDiscovered && file.good() && !file.bad() && !file.fail() && !file.eof()
&& !contents.empty()) {
std::strncpy(buff, contents.c_str(), sizeOfBuff-1);
buff[sizeOfBuff-1] = '\0';
}
}
static const uint32_t kAmdGpuId = 0x1002;
static bool isAMDGPU(std::string dev_path) {
@@ -123,50 +188,26 @@ amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amd
std::string manufacturer_name_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/manufacturer");
std::string product_name_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/product_name");
FILE *fp;
openFileAndModifyBuffer(model_number_path, info->model_number, AMDSMI_256_LENGTH);
openFileAndModifyBuffer(product_serial_path, info->product_serial, AMDSMI_NORMAL_STRING_LENGTH);
openFileAndModifyBuffer(fru_id_path, info->fru_id, AMDSMI_NORMAL_STRING_LENGTH);
openFileAndModifyBuffer(manufacturer_name_path, info->manufacturer_name,
AMDSMI_MAX_STRING_LENGTH);
openFileAndModifyBuffer(product_name_path, info->product_name, AMDSMI_256_LENGTH);
fp = fopen(model_number_path.c_str(), "rb");
if (fp) {
fgets(info->model_number, sizeof(info->model_number), fp);
fclose(fp);
}
fp = fopen(product_serial_path.c_str(), "rb");
if (fp) {
fgets(info->product_serial, sizeof(info->product_serial), fp);
fclose(fp);
}
fp = fopen(fru_id_path.c_str(), "rb");
if (fp) {
fgets(info->fru_id, sizeof(info->fru_id), fp);
fclose(fp);
}
fp = fopen(manufacturer_name_path.c_str(), "rb");
if (fp) {
fgets(info->manufacturer_name, sizeof(info->manufacturer_name), fp);
fclose(fp);
}
fp = fopen(product_name_path.c_str(), "rb");
if (fp) {
fgets(info->product_name, sizeof(info->product_name), fp);
fclose(fp);
}
std::ostringstream ss;
ss << __PRETTY_FUNCTION__
ss << __PRETTY_FUNCTION__ << "[Before correction] "
<< "Returning status = AMDSMI_STATUS_SUCCESS"
<< " | model_number_path = " << model_number_path
<< "; info->model_number: " << info->model_number
<< "\n product_serial_path = " << product_serial_path
<< "; info->product_serial: " << info->product_serial
<< "\n fru_id_path = " << fru_id_path
<< "; info->fru_id: " << info->fru_id
<< "\n manufacturer_name_path = " << manufacturer_name_path
<< "; info->manufacturer_name: " << info->manufacturer_name
<< "\n product_name_path = " << product_name_path
<< "; info->product_name: " << info->product_name;
<< " | model_number_path = |" << model_number_path << "|\n"
<< "; info->model_number: |" << info->model_number << "|\n"
<< "\n product_serial_path = |" << product_serial_path << "|\n"
<< "; info->product_serial: |" << info->product_serial << "|\n"
<< "\n fru_id_path = |" << fru_id_path << "|\n"
<< "; info->fru_id: |" << info->fru_id << "|\n"
<< "\n manufacturer_name_path = |" << manufacturer_name_path << "|\n"
<< "; info->manufacturer_name: |" << info->manufacturer_name << "|\n"
<< "\n product_name_path = |" << product_name_path << "|\n"
<< "; info->product_name: |" << info->product_name << "|";
LOG_INFO(ss);
return AMDSMI_STATUS_SUCCESS;