diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 5ca0d50466..262a0bc442 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -60,10 +60,14 @@ GPU PCIE_BW ### Changes +- **Updated `amdsmi_get_gpu_board_info()` now has larger structure sizes for `amdsmi_board_info_t`**. +Updated sizes that work for retreiving relavant board information across AMD's +ASIC products. This requires users to update any ABIs using this structure. + - **`amdsmi_get_power_cap_info` now returns values in uW instead of W**. `amdsmi_get_power_cap_info` will return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW). Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same). -- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**. +- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info**. Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. - **Updated `amd-smi metric --ecc-blocks` output**. @@ -108,6 +112,12 @@ amdsmi_get_gpu_process_info was removed from the C library in an earlier build, ### Fixes +- **`amdsmi_get_gpu_board_info()` no longer returns junk char strings**. +Previously if there was a partial failure to retrieve character strings, we would return +garbage output to users using the API. This fix intends to populate as many values as possible. +Then any failure(s) found along the way, `\0` is provided to `amdsmi_board_info_t` +structures data members which cannot be populated. Ensuring empty char string values. + - **Fixed `amd-smi metric --power` now provides power output for Navi2x/Navi3x/MI1x**. These systems use an older version of gpu_metrics in amdgpu. This fix only updates what CLI outputs. No change in any of our APIs. diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index bfc9967228..d80f3260ca 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -83,7 +83,7 @@ typedef enum { #define AMDSMI_MAX_DEVICES 32 #define AMDSMI_MAX_NAME 32 #define AMDSMI_MAX_DRIVER_VERSION_LENGTH 80 -#define AMDSMI_PRODUCT_NAME_LENGTH 128 +#define AMDSMI_256_LENGTH 256 #define AMDSMI_MAX_CONTAINER_TYPE 2 #define AMDSMI_MAX_CACHE_TYPES 10 #define AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK 64 @@ -573,7 +573,7 @@ typedef struct { } amdsmi_fw_info_t; typedef struct { - char market_name[AMDSMI_MAX_STRING_LENGTH]; + char market_name[AMDSMI_256_LENGTH]; uint32_t vendor_id; //< Use 32 bit to be compatible with other platform. char vendor_name[AMDSMI_MAX_STRING_LENGTH]; uint32_t subvendor_id; //< The subsystem vendor id @@ -619,11 +619,11 @@ typedef struct { } amdsmi_driver_info_t; typedef struct { - char model_number[AMDSMI_NORMAL_STRING_LENGTH]; + char model_number[AMDSMI_256_LENGTH]; char product_serial[AMDSMI_NORMAL_STRING_LENGTH]; char fru_id[AMDSMI_NORMAL_STRING_LENGTH]; - char product_name[AMDSMI_PRODUCT_NAME_LENGTH]; - char manufacturer_name[AMDSMI_NORMAL_STRING_LENGTH]; + char product_name[AMDSMI_256_LENGTH]; + char manufacturer_name[AMDSMI_MAX_STRING_LENGTH]; uint32_t reserved[32]; } amdsmi_board_info_t; diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index b5d6f3ecf0..9c2aa42b89 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -539,6 +539,21 @@ def _make_amdsmi_bdf_from_list(bdf): amdsmi_bdf.fields.domain_number = bdf[0] return amdsmi_bdf +def _padHexValue(value, length): + """ Pad a hexadecimal value with a given length of zeros + + :param value: A hexadecimal value to be padded with zeros + :param length: Number of zeros to pad the hexadecimal value + :param return original string string or + padded hex of confirmed hex output (using length provided) + """ + # Ensure value entered meets the minimum length and is hexadecimal + if len(value) > 2 and length > 1 and value[:2].lower() == '0x' \ + and all(c in '0123456789abcdefABCDEF' for c in value[2:]): + # Pad with zeros after '0x' prefix + return '0x' + value[2:].zfill(length) + return value + def amdsmi_get_socket_handles() -> List[amdsmi_wrapper.amdsmi_socket_handle]: """ @@ -1585,12 +1600,12 @@ def amdsmi_get_gpu_asic_info( ) asic_info = { - "market_name": asic_info_struct.market_name.decode("utf-8"), + "market_name": _padHexValue(asic_info_struct.market_name.decode("utf-8"), 4), "vendor_id": asic_info_struct.vendor_id, "vendor_name": asic_info_struct.vendor_name.decode("utf-8"), "subvendor_id": asic_info_struct.subvendor_id, "device_id": asic_info_struct.device_id, - "rev_id": hex(asic_info_struct.rev_id), + "rev_id": _padHexValue(hex(asic_info_struct.rev_id), 2), "asic_serial": asic_info_struct.asic_serial.decode("utf-8"), "oam_id": asic_info_struct.oam_id } @@ -1863,17 +1878,16 @@ def amdsmi_get_gpu_board_info( ) board_info_dict = { - "model_number": board_info.model_number.decode("utf-8").strip(), + "model_number": _padHexValue(board_info.model_number.decode("utf-8").strip(), 4), "product_serial": board_info.product_serial.decode("utf-8").strip(), "fru_id": board_info.fru_id.decode("utf-8").strip(), - "product_name": board_info.product_name.decode("utf-8").strip(), + "product_name": _padHexValue(board_info.product_name.decode("utf-8").strip(), 4), "manufacturer_name": board_info.manufacturer_name.decode("utf-8").strip() } for key, value in board_info_dict.items(): if value == "": board_info_dict[key] = "N/A" - return board_info_dict diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 912199d186..11206dad40 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -882,7 +882,7 @@ class struct_amdsmi_asic_info_t(Structure): struct_amdsmi_asic_info_t._pack_ = 1 # source:False struct_amdsmi_asic_info_t._fields_ = [ - ('market_name', ctypes.c_char * 64), + ('market_name', ctypes.c_char * 256), ('vendor_id', ctypes.c_uint32), ('vendor_name', ctypes.c_char * 64), ('subvendor_id', ctypes.c_uint32), @@ -961,11 +961,11 @@ class struct_amdsmi_board_info_t(Structure): struct_amdsmi_board_info_t._pack_ = 1 # source:False struct_amdsmi_board_info_t._fields_ = [ - ('model_number', ctypes.c_char * 32), + ('model_number', ctypes.c_char * 256), ('product_serial', ctypes.c_char * 32), ('fru_id', ctypes.c_char * 32), - ('product_name', ctypes.c_char * 128), - ('manufacturer_name', ctypes.c_char * 32), + ('product_name', ctypes.c_char * 256), + ('manufacturer_name', ctypes.c_char * 64), ('reserved', ctypes.c_uint32 * 32), ] diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index cc504ea3b9..d9c9b1244c 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -389,7 +389,6 @@ amdsmi_get_gpu_device_bdf(amdsmi_processor_handle processor_handle, amdsmi_bdf_t } amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_handle, amdsmi_board_info_t *board_info) { - AMDSMI_CHECK_INIT(); if (board_info == nullptr) { @@ -405,17 +404,86 @@ amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_hand if (gpu_device->check_if_drm_is_supported()) { // Populate product_serial, product_name, & product_number from sysfs status = smi_amdgpu_get_board_info(gpu_device, board_info); - } - else { + } else { // ignore the errors so that it can populate as many fields as possible. // call rocm-smi which search multiple places for device name status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, - board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH); + board_info->product_name, AMDSMI_256_LENGTH); status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, board_info->product_serial, AMDSMI_NORMAL_STRING_LENGTH); } + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "[Before rocm smi correction] " + << "Returning status = AMDSMI_STATUS_SUCCESS" + << "\n; info->model_number: |" << board_info->model_number << "|" + << "\n; info->product_serial: |" << board_info->product_serial << "|" + << "\n; info->fru_id: |" << board_info->fru_id << "|" + << "\n; info->manufacturer_name: |" << board_info->manufacturer_name << "|" + << "\n; info->product_name: |" << board_info->product_name << "|"; + LOG_INFO(ss); + + // Correct any missing details + if (board_info->model_number[0] == '\0') { + status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, board_info->model_number, + AMDSMI_256_LENGTH); + if (status != AMDSMI_STATUS_SUCCESS) { + memset(board_info->model_number, 0, + AMDSMI_256_LENGTH * sizeof(board_info->model_number[0])); + } + ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->model_number= |" + << board_info->model_number << "|"; + LOG_INFO(ss); + } + + if (board_info->product_serial[0] == '\0') { + status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, + board_info->product_serial, AMDSMI_NORMAL_STRING_LENGTH); + if (status != AMDSMI_STATUS_SUCCESS) { + memset(board_info->product_serial, 0, + AMDSMI_NORMAL_STRING_LENGTH * sizeof(board_info->product_serial[0])); + } + ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->product_serial= |" + << board_info->product_serial << "|"; + LOG_INFO(ss); + } + + if (board_info->product_name[0] == '\0') { + status = rsmi_wrapper(rsmi_dev_subsystem_name_get, + processor_handle, board_info->product_name, + AMDSMI_256_LENGTH); + if (status != AMDSMI_STATUS_SUCCESS) { + memset(board_info->product_name, 0, + AMDSMI_256_LENGTH * sizeof(board_info->product_name[0])); + } + ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->product_name= |" + << board_info->product_name << "|"; + LOG_INFO(ss); + } + + if (board_info->manufacturer_name[0] == '\0') { + status = rsmi_wrapper(rsmi_dev_vendor_name_get, + processor_handle, board_info->manufacturer_name, + AMDSMI_MAX_STRING_LENGTH); + if (status != AMDSMI_STATUS_SUCCESS) { + memset(board_info->manufacturer_name, 0, + AMDSMI_MAX_STRING_LENGTH * sizeof(board_info->manufacturer_name[0])); + } + ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->manufacturer_name= |" + << board_info->manufacturer_name << "|"; + LOG_INFO(ss); + } + + ss << __PRETTY_FUNCTION__ << "[After rocm smi correction] " + << "Returning status = AMDSMI_STATUS_SUCCESS" + << "\n; info->model_number: |" << board_info->model_number << "|" + << "\n; info->product_serial: |" << board_info->product_serial << "|" + << "\n; info->fru_id: |" << board_info->fru_id << "|" + << "\n; info->manufacturer_name: |" << board_info->manufacturer_name << "|" + << "\n; info->product_name: |" << board_info->product_name << "|"; + LOG_INFO(ss); + return AMDSMI_STATUS_SUCCESS; } @@ -654,7 +722,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i status = smi_amdgpu_get_market_name_from_dev_id(dev_info.device_id, info->market_name); if (status != AMDSMI_STATUS_SUCCESS) { rsmi_wrapper(rsmi_dev_brand_get, processor_handle, - info->market_name, AMDSMI_NORMAL_STRING_LENGTH); + info->market_name, AMDSMI_256_LENGTH); } info->device_id = dev_info.device_id; @@ -667,7 +735,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i if (status == AMDSMI_STATUS_SUCCESS) snprintf(info->asic_serial, sizeof(info->asic_serial), "%lu", dv_uid); status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, - info->market_name, AMDSMI_NORMAL_STRING_LENGTH); + info->market_name, AMDSMI_256_LENGTH); status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, &vendor_id); diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 10f27ab309..0c61ba6127 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -36,7 +36,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -45,6 +47,69 @@ #include "shared_mutex.h" // NOLINT #include "rocm_smi/rocm_smi_logger.h" +std::string leftTrim(const std::string &s) { + if (!s.empty()) { + return std::regex_replace(s, std::regex("^\\s+"), ""); + } + return s; +} + +std::string rightTrim(const std::string &s) { + if (!s.empty()) { + return std::regex_replace(s, std::regex("\\s+$"), ""); + } + return s; +} + +std::string removeNewLines(const std::string &s) { + if (!s.empty()) { + return std::regex_replace(s, std::regex("\n+"), ""); + } + return s; +} + +std::string trim(const std::string &s) { + if (!s.empty()) { + // remove new lines -> trim white space at ends + std::string noNewLines = removeNewLines(s); + return leftTrim(rightTrim(noNewLines)); + } + return s; +} + +// Given original string and string to remove (removeMe) +// Return will provide the resulting modified string with the removed string(s) +std::string removeString(const std::string origStr, + const std::string &removeMe) { + std::string modifiedStr = origStr; + std::string::size_type l = removeMe.length(); + for (std::string::size_type i = modifiedStr.find(removeMe); + i != std::string::npos; + i = modifiedStr.find(removeMe)) { + modifiedStr.erase(i, l); + } + return modifiedStr; +} + +void openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff) { + bool errorDiscovered = false; + std::ifstream file(path, std::ifstream::in); + std::string contents = {std::istreambuf_iterator{file}, std::istreambuf_iterator{}}; + memset(buff, 0, sizeof(char) * sizeOfBuff); + if (!file.is_open()) { + errorDiscovered = true; + } else { + contents = trim(contents); + } + + file.close(); + if (!errorDiscovered && file.good() && !file.bad() && !file.fail() && !file.eof() + && !contents.empty()) { + std::strncpy(buff, contents.c_str(), sizeOfBuff-1); + buff[sizeOfBuff-1] = '\0'; + } +} + static const uint32_t kAmdGpuId = 0x1002; static bool isAMDGPU(std::string dev_path) { @@ -123,50 +188,26 @@ amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amd std::string manufacturer_name_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/manufacturer"); std::string product_name_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/product_name"); - FILE *fp; + openFileAndModifyBuffer(model_number_path, info->model_number, AMDSMI_256_LENGTH); + openFileAndModifyBuffer(product_serial_path, info->product_serial, AMDSMI_NORMAL_STRING_LENGTH); + openFileAndModifyBuffer(fru_id_path, info->fru_id, AMDSMI_NORMAL_STRING_LENGTH); + openFileAndModifyBuffer(manufacturer_name_path, info->manufacturer_name, + AMDSMI_MAX_STRING_LENGTH); + openFileAndModifyBuffer(product_name_path, info->product_name, AMDSMI_256_LENGTH); - fp = fopen(model_number_path.c_str(), "rb"); - if (fp) { - fgets(info->model_number, sizeof(info->model_number), fp); - fclose(fp); - } - - fp = fopen(product_serial_path.c_str(), "rb"); - if (fp) { - fgets(info->product_serial, sizeof(info->product_serial), fp); - fclose(fp); - } - - fp = fopen(fru_id_path.c_str(), "rb"); - if (fp) { - fgets(info->fru_id, sizeof(info->fru_id), fp); - fclose(fp); - } - - fp = fopen(manufacturer_name_path.c_str(), "rb"); - if (fp) { - fgets(info->manufacturer_name, sizeof(info->manufacturer_name), fp); - fclose(fp); - } - - fp = fopen(product_name_path.c_str(), "rb"); - if (fp) { - fgets(info->product_name, sizeof(info->product_name), fp); - fclose(fp); - } std::ostringstream ss; - ss << __PRETTY_FUNCTION__ + ss << __PRETTY_FUNCTION__ << "[Before correction] " << "Returning status = AMDSMI_STATUS_SUCCESS" - << " | model_number_path = " << model_number_path - << "; info->model_number: " << info->model_number - << "\n product_serial_path = " << product_serial_path - << "; info->product_serial: " << info->product_serial - << "\n fru_id_path = " << fru_id_path - << "; info->fru_id: " << info->fru_id - << "\n manufacturer_name_path = " << manufacturer_name_path - << "; info->manufacturer_name: " << info->manufacturer_name - << "\n product_name_path = " << product_name_path - << "; info->product_name: " << info->product_name; + << " | model_number_path = |" << model_number_path << "|\n" + << "; info->model_number: |" << info->model_number << "|\n" + << "\n product_serial_path = |" << product_serial_path << "|\n" + << "; info->product_serial: |" << info->product_serial << "|\n" + << "\n fru_id_path = |" << fru_id_path << "|\n" + << "; info->fru_id: |" << info->fru_id << "|\n" + << "\n manufacturer_name_path = |" << manufacturer_name_path << "|\n" + << "; info->manufacturer_name: |" << info->manufacturer_name << "|\n" + << "\n product_name_path = |" << product_name_path << "|\n" + << "; info->product_name: |" << info->product_name << "|"; LOG_INFO(ss); return AMDSMI_STATUS_SUCCESS;