diff --git a/projects/amdsmi/.gitignore b/projects/amdsmi/.gitignore index 014ca7c138..6f052a082c 100644 --- a/projects/amdsmi/.gitignore +++ b/projects/amdsmi/.gitignore @@ -28,3 +28,6 @@ _toc.yml _build/ _doxygen/ docBin/ + +# Simulated SYSFS - for early development or debug +device/ diff --git a/projects/amdsmi/DEBIAN/postinst.in b/projects/amdsmi/DEBIAN/postinst.in index a0ecb7fdb1..eed81b8ba7 100755 --- a/projects/amdsmi/DEBIAN/postinst.in +++ b/projects/amdsmi/DEBIAN/postinst.in @@ -62,7 +62,7 @@ EOF # confirm logrotate file exists in daily if [ -f /etc/cron.daily/logrotate ]; then # move logrotate daily to hourly - if [ -f /etc/cron.hourly/logrotate ]; then + if [ -d /etc/cron.hourly ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else echo "[WARNING] Could find and configure hourly cron for $packageName's"\ @@ -77,6 +77,7 @@ EOF "$packageName logs (when turned on) may not rotate properly." fi fi + return #done configuring for non-systemd timers else # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then @@ -102,6 +103,7 @@ EOF echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." 
fi + return #done configuring for systemd timers fi } diff --git a/projects/amdsmi/DEBIAN/prerm.in b/projects/amdsmi/DEBIAN/prerm.in index 2de4969580..e2d6e1443e 100755 --- a/projects/amdsmi/DEBIAN/prerm.in +++ b/projects/amdsmi/DEBIAN/prerm.in @@ -38,6 +38,7 @@ rm_pyc() { rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/amd_smi/__pycache__ } + case "$1" in ( remove | upgrade) rm_ldconfig diff --git a/projects/amdsmi/RPM/post.in b/projects/amdsmi/RPM/post.in index 4a8ed2876c..1e10be1849 100755 --- a/projects/amdsmi/RPM/post.in +++ b/projects/amdsmi/RPM/post.in @@ -62,7 +62,7 @@ EOF # confirm logrotate file exists in daily if [ -f /etc/cron.daily/logrotate ]; then # move logrotate daily to hourly - if [ -f /etc/cron.hourly/logrotate ]; then + if [ -d /etc/cron.hourly ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else echo "[WARNING] Could find and configure hourly cron for $packageName's"\ @@ -77,6 +77,7 @@ EOF "$packageName logs (when turned on) may not rotate properly." fi fi + return #done configuring for non-systemd timers else # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then @@ -102,6 +103,7 @@ EOF echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." 
fi + return #done configuring for systemd timers fi } diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index e47fd436ba..8ef2bb8a52 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -1456,6 +1456,21 @@ amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf, amdsmi_pr */ amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle, uint16_t *id); +/** + * @brief Get the device revision associated with the device + * + * @details Given a processor handle @p processor_handle and a pointer to a + * uint16_t @p revision to which the revision id will be written + * + * @param[in] processor_handle a processor handle + * + * @param[out] revision a pointer to uint16_t to which the device revision + * will be written + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle, uint16_t *revision); + /** * @brief Get the name string for a give vendor ID * diff --git a/projects/amdsmi/oam/src/amd_oam.cc b/projects/amdsmi/oam/src/amd_oam.cc new file mode 100755 index 0000000000..62d4b28287 --- /dev/null +++ b/projects/amdsmi/oam/src/amd_oam.cc @@ -0,0 +1,395 @@ +/* + * MIT License + * + * Copyright (c) 2020 Open Compute Project + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include // NOLINT +#include + +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_device.h" +#include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_counters.h" +#include "rocm_smi/rocm_smi_kfd.h" +#include "rocm_smi/rocm_smi.h" + +#include "oam/oam_mapi.h" +#include "oam/amd_oam.h" + +/* Description string for each AMDOAM status code, keyed by the status code. */ static const std::map err_map = { + { AMDOAM_STATUS_INVALID_ARGS, "Invalid arguments" }, + { AMDOAM_STATUS_NOT_SUPPORTED, "Feature not supported" }, + { AMDOAM_STATUS_FILE_ERROR, "Problem accessing a file" }, + { AMDOAM_STATUS_PERMISSION, "Permission denied" }, + { AMDOAM_STATUS_OUT_OF_RESOURCES, "Not enough memory or other resource" }, + { AMDOAM_STATUS_INTERNAL_EXCEPTION, "An internal exception was caught" }, + { AMDOAM_STATUS_INPUT_OUT_OF_BOUNDS, + "The provided input is out of allowable or safe range" }, + { AMDOAM_STATUS_INIT_ERROR, "AMDOAM is not initialized or init failed" }, + { AMDOAM_STATUS_ERROR, "Generic error" }, + { AMDOAM_STATUS_NOT_FOUND, "An item was searched for but not found" } +}; + +/* TRY and CATCH bracket a function body in try { ... } catch (...) so that any exception is turned into the error code returned by handleRSMIException(). */ #define TRY try { +#define CATCH } catch (...) 
{return handleRSMIException();} + +/* Set once amdoam_init() succeeds; amdoam_free() does not clear it. */ static bool rsmi_initialized; + +/* Maps an rsmi_status_t to the negative code returned by the amdoam_* functions: values above RSMI_STATUS_INIT_ERROR become -AMDOAM_STATUS_ERROR, every other value is returned negated. */ static int rsmi_status_to_amdoam_errorcode(rsmi_status_t status) { + if (status > RSMI_STATUS_INIT_ERROR) + return -AMDOAM_STATUS_ERROR; + else + return -1 * static_cast(status); +} + +/* Calls amd::smi::handleException() and converts the status it returns to an amdoam error code; invoked from the catch (...) handler of the CATCH macro. */ static int handleRSMIException() { + rsmi_status_t ret = amd::smi::handleException(); + return rsmi_status_to_amdoam_errorcode(ret); +} + +/* Looks up the description for code in err_map and stores it in *description. Returns -AMDOAM_STATUS_INVALID_ARGS when description is null and -AMDOAM_STATUS_NOT_FOUND when the code has no entry. NOTE(review): the other functions return negated codes while the lookup uses code exactly as passed - confirm whether callers must negate first. */ int amdoam_get_error_description(int code, const char **description) { + if (description == nullptr) + return -AMDOAM_STATUS_INVALID_ARGS; + + auto search = err_map.find(code); + if (search == err_map.end()) + return -AMDOAM_STATUS_NOT_FOUND; + + *description = search->second; + return AMDOAM_STATUS_SUCCESS; +} + +/* Calls rsmi_init(0) and records success in rsmi_initialized; a failing status is returned as an amdoam error code. */ int amdoam_init(void) { + TRY + + rsmi_status_t status = rsmi_init(0); + + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + + rsmi_initialized = true; + return AMDOAM_STATUS_SUCCESS; + + CATCH +} + +/* Calls rsmi_shut_down() and converts a failing status. NOTE(review): rsmi_initialized stays true afterwards and the call is not wrapped in TRY/CATCH - confirm intended. */ int amdoam_free(void) { + rsmi_status_t status = rsmi_shut_down(); + + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + + return AMDOAM_STATUS_SUCCESS; +} + +/* Stores the device count reported by rsmi_num_monitor_devices() in *device_count; a null pointer yields -AMDOAM_STATUS_INVALID_ARGS. */ int amdoam_discover_devices(uint32_t *device_count) { + rsmi_status_t status; + + if (device_count == nullptr) { + return -AMDOAM_STATUS_INVALID_ARGS; + } + + status = rsmi_num_monitor_devices(device_count); + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + + return AMDOAM_STATUS_SUCCESS; +} + +/* Splits the 64-bit id from rsmi_dev_pci_id_get() into pci_info: domain from bits 32 and up (16 bits), bus from bits 8-15, device from bits 3-7 and function from bits 0-2. */ int amdoam_get_pci_properties(uint32_t device_id, oam_pci_info_t *pci_info) { + uint64_t bdfid; + + TRY + if (pci_info == nullptr) { + return -AMDOAM_STATUS_INVALID_ARGS; + } + + rsmi_status_t status = rsmi_dev_pci_id_get(device_id, &bdfid); + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + + pci_info->domain = (uint16_t)(bdfid >> 32) & 0xffff; + pci_info->bus = (bdfid >> 8) & 0xff; + pci_info->device = (bdfid >> 3) & 0x1f; + pci_info->function = bdfid & 0x7; + CATCH + + return 
AMDOAM_STATUS_SUCCESS; +} + +/* Fills devices[0..num_devices) with the device index, vendor name, device name, SKU and board name (both cut from the VBIOS version string) and serial number. The results of the individual queries are not checked. Returns -AMDOAM_STATUS_INIT_ERROR unless amdoam_init() has succeeded. NOTE(review): buf is not reset per device and &buf[4] is copied whenever the string is non-empty, even if it is shorter than four characters - confirm this is safe. */ int amdoam_get_dev_properties(uint32_t num_devices, + oam_dev_properties_t *devices) { + const size_t buf_size = 32; + char buf[buf_size] = ""; + uint32_t dev_inx; + oam_dev_properties_t *dev = devices; + +TRY + if (devices == nullptr) + return -AMDOAM_STATUS_INVALID_ARGS; + if (!rsmi_initialized) + return -AMDOAM_STATUS_INIT_ERROR; + + for (dev_inx = 0; dev_inx < num_devices; dev_inx++) { + dev->device_id = dev_inx; + /* If fails to get any following properties, it's not treated as a deal + * breaker. Variable not filled means that property is not available on + * this device or AMD doesn't support that property. + */ + rsmi_dev_vendor_name_get(dev_inx, dev->device_vendor, DEVICE_VENDOR_LEN); + rsmi_dev_name_get(dev_inx, dev->device_name, DEVICE_NAME_LEN); + rsmi_dev_vbios_version_get(dev_inx, buf, buf_size); + if (std::strlen(buf) > 0) { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-truncation" + std::strncpy(dev->sku_name, &buf[4], 6); + std::strncpy(dev->board_name, buf, 12); +#pragma GCC diagnostic pop + } + rsmi_dev_serial_number_get(dev_inx, dev->board_serial_number, + BOARD_SERIAL_NUM_LEN); + ++dev; + } +CATCH + + return AMDOAM_STATUS_SUCCESS; +} + +/* Returns the largest number found among the entries of hwmon_path whose name contains fn_reg followed by digits; when fn_reg is "in" the number is incremented before the comparison. Returns 0 when nothing matches. NOTE(review): a failed opendir() is only guarded by assert(). */ static uint32_t +get_num_sensors(std::string hwmon_path, std::string fn_reg) { + uint32_t sensor_max = 0; + std::string fn_reg_ex = "\\b" + fn_reg + "([0-9]+)([^ ]*)"; + std::string fn; + std::smatch m; + int32_t temp = 0; + std::string s1("in"); + std::regex re(fn_reg_ex); + auto hwmon_dir = opendir(hwmon_path.c_str()); + assert(hwmon_dir != nullptr); + auto dentry = readdir(hwmon_dir); + while (dentry != nullptr) { + fn = dentry->d_name; + if (std::regex_search(fn, m, re)) { + std::string output = std::regex_replace( + fn, + std::regex("[^0-9]*([0-9]+).*"), + std::string("$1")); + temp = stoi(output); + + assert(temp >= 0); + + if (s1.compare(fn_reg) == 0) + ++temp; + if (static_cast(temp) > sensor_max) + sensor_max = static_cast(temp); + } + dentry = 
readdir(hwmon_dir); + } + + closedir(hwmon_dir); + return sensor_max; +} + + +/* Fills *sensor_count with the temperature, fan, voltage, power and current sensor counts obtained by calling get_num_sensors() on dev->monitor()->path() with the prefixes temp, fan, in, power and current. */ int amdoam_get_sensors_count(uint32_t device_id, + oam_sensor_count_t *sensor_count) { + uint32_t dv_ind = device_id; + + TRY + if (sensor_count == nullptr) + return -AMDOAM_STATUS_INVALID_ARGS; + GET_DEV_FROM_INDX + assert(dev->monitor() != nullptr); + std::string hwmon_path = dev->monitor()->path(); + sensor_count->num_temperature_sensors = get_num_sensors(hwmon_path, "temp"); + sensor_count->num_fans = get_num_sensors(hwmon_path, "fan"); + sensor_count->num_voltage_sensors = get_num_sensors(hwmon_path, "in"); + sensor_count->num_power_sensors = get_num_sensors(hwmon_path, "power"); + sensor_count->num_current_sensors = get_num_sensors(hwmon_path, "current"); + CATCH + + return AMDOAM_STATUS_SUCCESS; +} + +/* Fills sensor_info[0..num_sensors) with a generated name, the requested type and the value read for power, voltage, temperature or fan sensors, returning at the first failing query. Any other type below OAM_SENSOR_TYPE_UNKNOWN yields -AMDOAM_STATUS_NOT_SUPPORTED. NOTE(review): the voltage case numbers its names from 0 (the others from 1) and queries RSMI_VOLT_TYPE_VDDGFX for every index - confirm. */ int amdoam_get_sensors_info(uint32_t device_id, oam_sensor_type_t type, + uint32_t num_sensors, oam_sensor_info_t sensor_info[]) { + uint32_t dv_ind = device_id; + std::string val_str; + uint32_t i; + rsmi_status_t status; + + TRY + if ((sensor_info == nullptr) || (type >= OAM_SENSOR_TYPE_UNKNOWN)) + return -AMDOAM_STATUS_INVALID_ARGS; + GET_DEV_FROM_INDX + assert(dev->monitor() != nullptr); + switch (type) { + case OAM_SENSOR_TYPE_POWER: + for (i = 0; i < num_sensors; i++) { + snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX, + "POWER_SENSOR_%u", i+1); + sensor_info[i].sensor_type = type; + status = rsmi_dev_power_ave_get(device_id, i, + reinterpret_cast(&sensor_info[i].value)); + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + } + break; + + case OAM_SENSOR_TYPE_VOLTAGE: + for (i = 0; i < num_sensors; i++) { + snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX, + "VOLTAGE_SENSOR_%u", i); + sensor_info[i].sensor_type = type; + status = rsmi_dev_volt_metric_get(device_id, RSMI_VOLT_TYPE_VDDGFX, + RSMI_VOLT_CURRENT, &sensor_info[i].value); + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + } + 
break; + + case OAM_SENSOR_TYPE_TEMP: + for (i = 0; i < num_sensors; i++) { + snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX, + "TEMP_SENSOR_%u", i+1); + sensor_info[i].sensor_type = type; + status = rsmi_dev_temp_metric_get(device_id, i, RSMI_TEMP_CURRENT, + &sensor_info[i].value); + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + } + break; + + case OAM_SENSOR_TYPE_FAN_SPEED: + for (i = 0; i < num_sensors; i++) { + snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX, + "FAN_SENSOR_%u", i+1); + sensor_info[i].sensor_type = type; + status = rsmi_dev_fan_speed_get(device_id, i, &sensor_info[i].value); + if (status != RSMI_STATUS_SUCCESS) + return rsmi_status_to_amdoam_errorcode(status); + } + break; + + default: + return -AMDOAM_STATUS_NOT_SUPPORTED; + } + CATCH + + return AMDOAM_STATUS_SUCCESS; +} + +// TODO(x): This function doesn't work for OAM. It's just a version +// of rsmi_dev_ecc_count_get(), which has similar functionality. +// The purpose here is just to drive refactoring; e.g., making macros +// available and previously static functions global. NOTE(review): the "ue:" and "ce:" values are both read into count->total_error_count (the second overwrites the first), and failures are returned as raw RSMI status values rather than negated AMDOAM codes. +int +get_device_error_count(oam_dev_handle_t *handle, + oam_dev_error_count_t *count) { + std::vector val_vec; + rsmi_status_t ret; + + TRY + // TODO(x): replace with final code... + // Below, we are just returning errors for RSMI_GPU_BLOCK_GFX as a + // placeholder + (void)handle; // Just ignore for now + + rsmi_gpu_block_t block = RSMI_GPU_BLOCK_GFX; + + // The macro CHK_SUPPORT_VAR assumes the existence of a device index variable + // "dv_ind". Presumably, the device index will come from the "handle" + // pointer. 
Since I don't know how that will be implemented, for now we + // will just make up a device index: + uint32_t dv_ind = 0; + CHK_SUPPORT_VAR(count, block) + + amd::smi::DevInfoTypes type; + switch (block) { + case RSMI_GPU_BLOCK_UMC: + type = amd::smi::kDevErrCntUMC; + break; + + case RSMI_GPU_BLOCK_SDMA: + type = amd::smi::kDevErrCntSDMA; + break; + + case RSMI_GPU_BLOCK_GFX: + type = amd::smi::kDevErrCntGFX; + break; + + case RSMI_GPU_BLOCK_MMHUB: + type = amd::smi::kDevErrCntMMHUB; + break; + + case RSMI_GPU_BLOCK_PCIE_BIF: + type = amd::smi::kDevErrCntPCIEBIF; + break; + + case RSMI_GPU_BLOCK_HDP: + type = amd::smi::kDevErrCntHDP; + break; + + case RSMI_GPU_BLOCK_XGMI_WAFL: + type = amd::smi::kDevErrCntXGMIWAFL; + break; + + default: + return RSMI_STATUS_NOT_SUPPORTED; + } + + DEVICE_MUTEX + + ret = GetDevValueVec(type, dv_ind, &val_vec); + + if (ret == RSMI_STATUS_FILE_ERROR) { + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + return static_cast(ret); + } + + assert(val_vec.size() == 2); + + std::string junk; + std::istringstream fs1(val_vec[0]); + + fs1 >> junk; + assert(junk == "ue:"); + fs1 >> count->total_error_count; + + std::istringstream fs2(val_vec[1]); + + fs2 >> junk; + assert(junk == "ce:"); + fs2 >> count->total_error_count; + + return static_cast(ret); + CATCH +} diff --git a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc index bb456f7a0e..9e9019e2b8 100755 --- a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc +++ b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc @@ -718,6 +718,9 @@ int main() { ret = rsmi_dev_id_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl; + ret = rsmi_dev_revision_get(i, &val_ui16); + CHK_RSMI_RET_I(ret) + std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << std::endl; char current_compute_partition[256]; current_compute_partition[0] = '\0'; diff --git 
a/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h b/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h index 5817833eae..3b781ce129 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/kfd_ioctl.h @@ -36,6 +36,12 @@ struct kfd_ioctl_get_version_args { __u32 minor_version; /* from KFD */ }; +/* Argument block for AMDKFD_IOC_AVAILABLE_MEMORY (command 0x23). NOTE(review): AMDKFD_COMMAND_END in this header still reads 0x22 - confirm whether it should be raised to cover 0x23. */ struct kfd_ioctl_get_available_memory_args { + __u64 available; /* from KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; +}; + /* For kfd_ioctl_create_queue_args.queue_type. */ #define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 #define KFD_IOC_QUEUE_TYPE_SDMA 0x1 @@ -726,6 +732,10 @@ struct kfd_ioctl_cross_memory_copy_args { #define AMDKFD_IOC_CROSS_MEMORY_COPY \ AMDKFD_IOWR(0x22, struct kfd_ioctl_cross_memory_copy_args) + +#define AMDKFD_IOC_AVAILABLE_MEMORY \ + AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args) + #define AMDKFD_COMMAND_START 0x01 #undef AMDKFD_COMMAND_END #define AMDKFD_COMMAND_END 0x22 diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 8e4bacd03a..786de78eee 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -1088,6 +1088,21 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices); */ rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id); +/** + * @brief Get the device revision associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t + * @p revision, this function will write the device revision to the location + * pointed to by @p revision. + * + * @param[in] dv_ind a device index + * + * @param[inout] revision a pointer to uint16_t to which the device revision + * will be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * + */ +rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision); /** * @brief Get the SKU for a desired device associated with the device with diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h index c975baae55..a1b2809457 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -52,12 +52,14 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_power_mon.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_counters.h" +#include "rocm_smi/rocm_smi_properties.h" #include "shared_mutex.h" //NOLINT namespace amd { @@ -100,6 +102,7 @@ enum DevInfoTypes { kDevOverDriveLevel, kDevMemOverDriveLevel, kDevDevID, + kDevDevRevID, kDevDevProdName, kDevDevProdNum, kDevVendorID, @@ -172,6 +175,7 @@ typedef struct { std::vector variants; } dev_depends_t; + class Device { public: explicit Device(std::string path, RocmSMI_env_vars const *e); @@ -212,7 +216,7 @@ class Device { void set_evt_notif_anon_fd(uint32_t fd) { evt_notif_anon_fd_ = static_cast(fd);} int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;} - metrics_table_header_t & gpu_metrics_ver(void) {return gpu_metrics_ver_;} + metrics_table_header_t &gpu_metrics_ver(void) {return gpu_metrics_ver_;} void fillSupportedFuncs(void); void DumpSupportedFunctions(void); bool DeviceAPISupported(std::string name, uint64_t variant, @@ -220,6 +224,8 @@ class Device { rsmi_status_t restartAMDGpuDriver(void); rsmi_status_t storeDevicePartitions(uint32_t dv_ind); template std::string readBootPartitionState(uint32_t dv_ind); + rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type); + private: std::shared_ptr monitor_; @@ -240,6 +246,7 @@ class Device { int readDevInfoBinary(DevInfoTypes type, 
std::size_t b_size, void *p_binary_data); int writeDevInfoStr(DevInfoTypes type, std::string valStr); + rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query); uint64_t bdfid_; uint64_t kfd_gpu_id_; std::unordered_set devInfoTypesStrings; private: diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_monitor.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_monitor.h index 648e159b65..ea639eae35 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_monitor.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_monitor.h @@ -94,6 +94,44 @@ enum MonitorTypes { kMonInvalid = 0xFFFFFFFF, }; +/* Printable name for each MonitorTypes enumerator; every value spells out the enumerator it is keyed by. */ const std::map monitorTypesToString { + {MonitorTypes::kMonName, "amd::smi::kMonName"}, + {MonitorTypes::kMonTemp, "amd::smi::kMonTemp"}, + {MonitorTypes::kMonFanSpeed, "amd::smi::kMonFanSpeed"}, + {MonitorTypes::kMonMaxFanSpeed, "amd::smi::kMonMaxFanSpeed"}, + {MonitorTypes::kMonFanRPMs, "amd::smi::kMonFanRPMs"}, + {MonitorTypes::kMonFanCntrlEnable, "amd::smi::kMonFanCntrlEnable"}, + {MonitorTypes::kMonPowerCap, "amd::smi::kMonPowerCap"}, + {MonitorTypes::kMonPowerCapDefault, "amd::smi::kMonPowerCapDefault"}, + {MonitorTypes::kMonPowerCapMax, "amd::smi::kMonPowerCapMax"}, + {MonitorTypes::kMonPowerCapMin, "amd::smi::kMonPowerCapMin"}, + {MonitorTypes::kMonPowerAve, "amd::smi::kMonPowerAve"}, + {MonitorTypes::kMonTempMax, "amd::smi::kMonTempMax"}, + {MonitorTypes::kMonTempMin, "amd::smi::kMonTempMin"}, + {MonitorTypes::kMonTempMaxHyst, "amd::smi::kMonTempMaxHyst"}, + {MonitorTypes::kMonTempMinHyst, "amd::smi::kMonTempMinHyst"}, + {MonitorTypes::kMonTempCritical, "amd::smi::kMonTempCritical"}, + {MonitorTypes::kMonTempCriticalHyst, "amd::smi::kMonTempCriticalHyst"}, + {MonitorTypes::kMonTempEmergency, "amd::smi::kMonTempEmergency"}, + {MonitorTypes::kMonTempEmergencyHyst, "amd::smi::kMonTempEmergencyHyst"}, + {MonitorTypes::kMonTempCritMin, "amd::smi::kMonTempCritMin"}, + {MonitorTypes::kMonTempCritMinHyst, "amd::smi::kMonTempCritMinHyst"}, + {MonitorTypes::kMonTempOffset, "amd::smi::kMonTempOffset"}, + {MonitorTypes::kMonTempLowest, 
"amd::smi::kMonTempLowest"}, + {MonitorTypes::kMonTempHighest, "amd::smi::kMonTempHighest"}, + {MonitorTypes::kMonTempLabel, "amd::smi::kMonTempLabel"}, + {MonitorTypes::kMonVolt, "amd::smi::kMonVolt"}, + {MonitorTypes::kMonVoltMax, "amd::smi::kMonVoltMax"}, + {MonitorTypes::kMonVoltMinCrit, "amd::smi::kMonVoltMinCrit"}, + {MonitorTypes::kMonVoltMin, "amd::smi::kMonVoltMin"}, + {MonitorTypes::kMonVoltMaxCrit, "amd::smi::kMonVoltMaxCrit"}, + {MonitorTypes::kMonVoltAverage, "amd::smi::kMonVoltAverage"}, + {MonitorTypes::kMonVoltLowest, "amd::smi::kMonVoltLowest"}, + {MonitorTypes::kMonVoltHighest, "amd::smi::kMonVoltHighest"}, + {MonitorTypes::kMonVoltLabel, "amd::smi::kMonVoltLabel"}, + {MonitorTypes::kMonInvalid, "amd::smi::kMonInvalid"}, +}; + class Monitor { public: diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_properties.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_properties.h new file mode 100644 index 0000000000..67d285cbbc --- /dev/null +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_properties.h @@ -0,0 +1,160 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ +#define INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ + +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" + +#include +#include + + +namespace amd { +namespace smi { + +// +// Property reinforcement check list +// +using AMDGpuPropertyId_t = uint32_t; +using AMDGpuDevIdx_t = uint32_t; +using AMDGpuVerbId_t = uint32_t; +using AMDGpuAsicId_t = uint16_t; +using AMDGpuAsicRevId_t = uint16_t; +using AMDGpuOpModeType_t = uint8_t; + +/* Verb identifiers; judging by the enumerator names each one stands for a set, get or reset operation - confirm against the code that consumes them. */ enum class AMDGpuVerbTypes_t : AMDGpuVerbId_t +{ + kNone = 0, + kSetGpuPciBandwidth, + kSetPowerCap, + kSetGpuPowerProfile, + kSetGpuClkRange, + kSetGpuOdClkInfo, + kSetGpuOdVoltInfo, + kSetGpuPerfLevelV1, + kSetGpuPerfLevel, + kGetGpuPowerProfilePresets, + kResetGpu, + kSetGpuPerfDeterminismMode, + kSetGpuFanSpeed, + kResetGpuFan, + kSetClkFreq, + kSetGpuOverdriveLevelV1, + kSetGpuOverdriveLevel, + kGetGpuFanRpms, + kGetGpuFanSpeed, + kGetGpuFanSpeedMax, + kGetGpuVoltMetric, + kGetGpuOverDriveLevel, + kGetGpuOdVoltInfo, + kGetGpuOdVoltCurveRegions, +}; +using AMDGpuVerbList_t = 
std::map; + + +/* Per-family base offsets (0x1000 shifted left by 0 to 4). Presumably combined with a raw id by make_unique_property_id() - confirm against its definition. */ enum class AMDGpuPropertyTypesOffset_t : AMDGpuPropertyId_t +{ + kNone = 0, + kDevInfoTypes = (0x1000 << 0), + kMonitorTypes = (0x1000 << 1), + kPerfTypes = (0x1000 << 2), + kClkTypes = (0x1000 << 3), + kVoltMetricTypes = (0x1000 << 4), +}; + +using AMDGpuPropertyOffsetType = std::underlying_type::type; +using AMDGpuPropertyTypesOffsetList_t = std::map; +AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs); +AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs); + + +/* Operating modes a property applies to. NOTE(review): kBoth is a separate bit (0x4), not kBareMetal | kSrIov - confirm intended. */ enum class AMDGpuPropertyOpModeTypes_t : AMDGpuOpModeType_t +{ + kBareMetal = (0x1 << 0), + kSrIov = (0x1 << 1), + kBoth = (0x1 << 2), +}; + +using AMDGpuPropertyOpModeType = std::underlying_type::type; +using AMDGpuOpModeList_t = std::map; +AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs); +AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs); + + +/* One entry of the property list: revision id, property id, verb, operating mode and whether the property should be available. */ struct AMDGpuProperties_t +{ + AMDGpuAsicRevId_t m_pci_rev_id; + AMDGpuPropertyId_t m_property; + AMDGpuVerbTypes_t m_verb_id; + AMDGpuPropertyOpModeTypes_t m_opmode; + bool m_should_be_available; +}; +using AMDGpuPropertyList_t = std::multimap; + +/* Parameters of a single query: ASIC id, revision id, device index, property id and verb. */ struct AMDGpuPropertyQuery_t +{ + AMDGpuAsicId_t m_asic_id; + AMDGpuAsicRevId_t m_pci_rev_id; + AMDGpuDevIdx_t m_dev_idx; + AMDGpuPropertyId_t m_property; + AMDGpuVerbTypes_t m_verb_id; +}; + + +// Free functions; only the declarations live in this header. +AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id); +AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id); + +rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, + AMDGpuVerbTypes_t dev_info_type, + rsmi_status_t actual_error_code); + +void dump_amdgpu_property_reinforcement_list(); + + +} // namespace smi +} // namespace amd + +#endif // INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ diff --git 
a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h index a4558dbd2d..697fcb3723 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -48,6 +48,9 @@ #include #include #include +#include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -84,6 +87,8 @@ std::tuple readTmpFile( std::string stateName, std::string parameterName); void displayAppTmpFilesContent(void); +std::string debugVectorContent(std::vector v); +std::string displayAllDevicePaths(std::vector> v); rsmi_status_t handleException(); rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type, @@ -94,8 +99,53 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type, rsmi_status_t ErrnoToRsmiStatus(int err); std::string getRSMIStatusString(rsmi_status_t ret); std::tuple getSystemDetails(void); + std::string, std::string, std::string, std::string> + getSystemDetails(void); void logSystemDetails(void); +rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); +void logHexDump(const char *desc, const void *addr, const size_t len, + size_t perLine); +bool isSystemBigEndian(); +template +std::string print_int_as_hex(T i, bool showHexNotation=true) { + std::stringstream ss; + if (showHexNotation) { + ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; + } else { + ss << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; + } + + if (std::is_same::value) { + ss << static_cast(i|0); + } else if (std::is_same::value) { + ss << static_cast(static_cast(i|0)); + } else if (std::is_signed::value) { + ss << static_cast(i | 0); + } else { + ss << static_cast(i | 0); + } + ss << std::dec; + return ss.str(); +}; + +template +std::string print_unsigned_int(T i) { + std::stringstream ss; + ss << static_cast(i | 0); + return ss.str(); +} + +template +std::string print_unsigned_hex_and_int(T i, std::string heading="") { + 
std::stringstream ss; + if (heading.empty() == false) { + ss << "\n" << heading << " = "; + } + ss << "Hex (MSB): " << print_int_as_hex(i) << ", " + << "Unsigned int: " << print_unsigned_int(i) << ", " + << "Byte Size: " << sizeof(T); + return ss.str(); +} struct pthread_wrap { public: diff --git a/projects/amdsmi/rocm_smi/python_smi_tools/rocm_smi.py b/projects/amdsmi/rocm_smi/python_smi_tools/rocm_smi.py index 5c12624142..003efadc66 100755 --- a/projects/amdsmi/rocm_smi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/rocm_smi/python_smi_tools/rocm_smi.py @@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface ' footerString = ' End of ROCm SMI Log ' # Output formatting -appWidth = 84 +appWidth = 100 deviceList = [] # Enable or disable serialized format @@ -112,19 +112,10 @@ def formatCsv(deviceList): if outputType == 'system': jsonobj = json.loads(jsondata) keylist = header - for record in jsonobj: - my_string += str(record) - for key in keylist: - if key == 'system': - tempstr = str(jsonobj[record]) - tempstr = tempstr[tempstr.find('\'')+1:] - tempstr = tempstr[:tempstr.find('\'')] - # Force output device type to 'system' - my_string += ',%s\nsystem,%s' % (tempstr, jsonobj[record][tempstr]) - my_string += '\n' - # Force output device type to 'system' - if my_string.startswith('system'): - my_string = 'device' + my_string[6:] + for record in jsonobj['system']: + my_string += "\"%s\", \"%s\"\n" % (record, jsonobj['system'][record]) + # add header + my_string = "name, value\n" + my_string return my_string headerkeys = [] # Separate device-specific information from system-level information @@ -249,6 +240,17 @@ def getId(device): return hex(dv_id.value) +def getRev(device): + """ Return the hexadecimal value of a device's Revision + + @param device: DRM device identifier + """ + dv_rev = c_short() + ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev)) + if rsmi_ret_ok(ret, device, 'get_device_rev'): + return hex(dv_rev.value) + + def 
getMaxPower(device): """ Return the maximum power cap of a given device @@ -391,6 +393,25 @@ def getTemp(device, sensor): return temp.value / 1000 return 'N/A' +def findFirstAvailableTemp(device): + """ Discovers the first available device temperature to display + + Returns a tuple of (temp_type, temp_value) for the device specified + @param device: DRM device identifier + """ + temp = c_int64(0) + metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT + ret_temp = "N/A" + ret_temp_type = "(Unknown)" + for i, templist_val in enumerate(temp_type_lst): + ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), i, metric, byref(temp)) + if rsmi_ret_ok(ret, device, 'get_temp_metric_' + templist_val, silent=True): + ret_temp = temp.value / 1000 + ret_temp_type = '(' + templist_val.capitalize() + ')' + break + else: + continue + return (ret_temp_type, ret_temp) def getVbiosVersion(device): """ Returns the VBIOS version for a given device @@ -399,7 +420,9 @@ def getVbiosVersion(device): """ vbios = create_string_buffer(256) ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256) - if rsmi_ret_ok(ret, device): + if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + return "Unsupported" + elif rsmi_ret_ok(ret, device): return vbios.value.decode() @@ -425,7 +448,7 @@ def getComputePartition(device): ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256) if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode(): return str(currentComputePartition.value.decode()) - return "UNKNOWN" + return "N/A" def getMemoryPartition(device): @@ -437,7 +460,7 @@ def getMemoryPartition(device): ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256) if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode(): return str(currentNPSMode.value.decode()) - return "UNKNOWN" + return "N/A" def print2DArray(dataArray): @@ -537,16 +560,23 @@ def printEventList(device, delay, 
eventList): data = rsmi_evt_notification_data_t(1) rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data)) if len(data.message) > 0: - print2DArray([['\rGPU[%d]:\t' % (device), ctime().split()[3], notification_type_names[data.event.value - 1], + print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1], data.message.decode('utf8') + '\r']]) -def printLog(device, metricName, value=None, extraSpace=False): +def printLog(device, metricName, value=None, extraSpace=False, useItalics=False): """ Print out to the SMI log @param device: DRM device identifier @param metricName: Title of the item to print to the log @param value: The item's value to print to the log """ + red = '\033[91m' + green = '\033[92m' + blue = '\033[94m' + bold = '\033[1m' + italics = '\033[3m' + underline = '\033[4m' + end = '\033[0m' global PRINT_JSON if PRINT_JSON: if value is not None and device is not None: @@ -563,6 +593,8 @@ def printLog(device, metricName, value=None, extraSpace=False): # Force thread safe printing lock = multiprocessing.Lock() lock.acquire() + if useItalics: + logstr = italics + logstr + end if extraSpace: print('\n' + logstr + '\n', end='', flush=True) else: @@ -1353,7 +1385,7 @@ def setPowerOverDrive(deviceList, value, autoRespond): RETCODE = 1 continue if new_power_cap.value == current_power_cap.value: - printErrLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000)) + printLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000)) if current_power_cap.value < default_power_cap.value: current_power_cap.value = default_power_cap.value @@ -1540,18 +1572,39 @@ def showAllConcise(deviceList): print('ERROR: Cannot print JSON/CSV output for concise output') sys.exit(1) printLogSpacer(' Concise Info ') - header = ['GPU', 'Temp (DieEdge)', 'AvgPwr', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%'] + deviceList.sort() + (temp_type, _) = 
findFirstAvailableTemp(deviceList[0]) + available_temp_type = temp_type.lower() + available_temp_type = available_temp_type.replace('(', '') + available_temp_type = available_temp_type.replace(')', '') + header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%'] + subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', ''] + # add additional spaces to match header + for idx, item in enumerate(subheader): + header_size = len(header[idx]) + subheader_size = len(subheader[idx]) + if header_size != subheader_size: + numSpacesToFill_subheader = header_size - subheader_size + numSpacesToFill_header = subheader_size - header_size + #take pos spaces to mean, we need to match size of the other + if numSpacesToFill_subheader > 0: + subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader) + if numSpacesToFill_header > 0: + header[idx] = header[idx] + (' ' * numSpacesToFill_header) head_widths = [len(head) + 2 for head in header] values = {} + degree_sign = u'\N{DEGREE SIGN}' for device in deviceList: - temp = str(getTemp(device, 'edge')) - if temp != 'N/A': - temp += 'c' + temp_val = str(getTemp(device, available_temp_type)) + if temp_val != 'N/A': + temp_val += degree_sign + 'C' avgPwr = str(getPower(device)) if avgPwr != '0.0' and avgPwr != 'N/A': avgPwr += 'W' else: avgPwr = 'N/A' + combined_partition = (getMemoryPartition(device) + ", " + + getComputePartition(device)) concise = True sclk = showCurrentClocks([device], 'sclk', concise) mclk = showCurrentClocks([device], 'mclk', concise) @@ -1575,7 +1628,9 @@ def showAllConcise(deviceList): mem_use_pct='Unsupported' if vram_used != None and vram_total != None and float(vram_total) != 0: mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total))) - values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap, + values['card%s' % (str(device))] = [device, temp_val, avgPwr, + combined_partition, 
sclk, mclk, + fan, str(perf).lower(), pwrCap, mem_use_pct, gpu_busy] val_widths = {} for device in deviceList: @@ -1585,6 +1640,9 @@ def showAllConcise(deviceList): for col in range(len(val_widths[device])): max_widths[col] = max(max_widths[col], val_widths[device][col]) printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)), None) + printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)), + None, useItalics=True) + printLogSpacer(fill='=') for device in deviceList: printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), values['card%s' % (str(device))])), None) @@ -1601,19 +1659,23 @@ def showAllConciseHw(deviceList): print('ERROR: Cannot print JSON/CSV output for concise hardware output') sys.exit(1) printLogSpacer(' Concise Hardware Info ') - header = ['GPU', 'DID', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS'] + header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS'] head_widths = [len(head) + 2 for head in header] values = {} for device in deviceList: gpuid = getId(device) if str(gpuid).startswith('0x'): gpuid = str(gpuid)[2:] + gpurev = getRev(device) + if str(gpurev).startswith('0x'): + gpurev = str(gpurev)[2:] + gfxRas = getRasEnablement(device, 'GFX') sdmaRas = getRasEnablement(device, 'SDMA') umcRas = getRasEnablement(device, 'UMC') vbios = getVbiosVersion(device) bus = getBus(device) - values['card%s' % (str(device))] = [device, gpuid, gfxRas, sdmaRas, umcRas, vbios, bus] + values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus] val_widths = {} for device in deviceList: val_widths[device] = [len(str(val)) + 2 for val in values['card%s' % (str(device))]] @@ -1952,6 +2014,7 @@ def showId(deviceList): printLogSpacer(' ID ') for device in deviceList: printLog(device, 'GPU ID', getId(device)) + printLog(device, 'GPU Rev', getRev(device)) 
printLogSpacer() @@ -2272,8 +2335,12 @@ def showProductName(deviceList): # if rsmi_ret_ok(ret, device) and sku.value.decode(): # device_sku = sku.value.decode() # Retrieve the device SKU as a substring from VBIOS + device_sku = "" ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256) - if rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode(): + if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + device_sku = "Unsupported" + printLog(device, 'Card SKU', '\t\t' + device_sku) + elif rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode(): # Device SKU is just the characters in between the two '-' in vbios_version if vbios.value.decode().count('-') == 2 and len(str(vbios.value.decode().split('-')[1])) > 1: device_sku = vbios.value.decode().split('-')[1] @@ -2535,7 +2602,7 @@ def showEvents(deviceList, eventTypes): break -def printTempGraph(deviceList, delay): +def printTempGraph(deviceList, delay, temp_type): # deviceList must be in ascending order deviceList.sort() devices = 0 @@ -2549,7 +2616,7 @@ def printTempGraph(deviceList, delay): terminalWidth = os.get_terminal_size()[0] printStrings = list() for device in deviceList: - temp = getTemp(device, 'edge') + temp = getTemp(device, temp_type) if temp == 'N/A': percentage = 0 else: @@ -2622,11 +2689,16 @@ def getGraphColor(percentage): def showTempGraph(deviceList): - printLogSpacer(' Temperature Graph ') + deviceList.sort() + (temp_type, temp_value) = findFirstAvailableTemp(deviceList[0]) + printLogSpacer(' Temperature Graph ' + temp_type + ' ') + temp_type = temp_type.lower() + temp_type = temp_type.replace('(', '') + temp_type = temp_type.replace(')', '') # Start a thread for constantly printing try: # Create a thread (call print function, devices, delay in ms) - _thread.start_new_thread(printTempGraph, (deviceList, 150)) + _thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type)) except Exception as e: printErrLog(device, 'Unable to start new thread. 
%s' % (e)) # Catch user input for program termination diff --git a/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py b/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py index b6e7f2474d..9ffcac138d 100644 --- a/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py +++ b/projects/amdsmi/rocm_smi/python_smi_tools/rsmiBindings.py @@ -11,8 +11,16 @@ import os # Use ROCm installation path if running from standard installation # With File Reorg rsmiBindings.py will be installed in /opt/rocm/libexec/rocm_smi. -# relative path changed accordingly -path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' +# relative path changed accordingly. +# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location +# +path_librocm = str() +rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') +if (rocm_smi_lib_path != None): + path_librocm = rocm_smi_lib_path +else: + path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' + if not os.path.isfile(path_librocm): print('Unable to find %s . 
Trying /opt/rocm*' % path_librocm) for root, dirs, files in os.walk('/opt', followlinks=True): @@ -22,9 +30,10 @@ if not os.path.isfile(path_librocm): print('Using lib from %s' % path_librocm) else: print('Unable to find librocm_smi64.so.@VERSION_MAJOR@') +else: + print('Library loaded from: %s ' % path_librocm) # ----------> TODO: Support static libs as well as SO - try: cdll.LoadLibrary(path_librocm) rocmsmi = CDLL(path_librocm) @@ -36,7 +45,6 @@ except OSError: .format('\33[33m', '\033[0m')) exit() - # Device ID dv_id = c_uint64() # GPU ID diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index ffb89292b3..dd9e232f11 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -78,6 +78,7 @@ #include "rocm_smi/rocm_smi_logger.h" using namespace ROCmLogging; +using namespace amd::smi; static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3f; @@ -632,7 +633,7 @@ rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t *ec) { std::vector val_vec; - rsmi_status_t ret; + rsmi_status_t ret(RSMI_STATUS_NOT_SUPPORTED); std::ostringstream ss; TRY @@ -673,8 +674,8 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, default: ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", default case -> reporting RSMI_STATUS_NOT_SUPPORTED" - << amd::smi::getRSMIStatusString(ret); + << ", default case -> reporting " + << amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED); LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } @@ -682,6 +683,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, DEVICE_MUTEX ret = GetDevValueVec(type, dv_ind, &val_vec); + if (val_vec.size() != 2 ) ret = RSMI_STATUS_FILE_ERROR; if (ret == RSMI_STATUS_FILE_ERROR || val_vec.size() != 2) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" @@ -698,8 +700,6 @@ rsmi_dev_ecc_count_get(uint32_t 
dv_ind, rsmi_gpu_block_t block, return ret; } - assert(val_vec.size() == 2); - std::string junk; std::istringstream fs1(val_vec[0]); @@ -820,6 +820,21 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { return ret; } +rsmi_status_t +rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) { + std::ostringstream outss; + rsmi_status_t ret; + outss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(outss); + CHK_SUPPORT_NAME_ONLY(revision) + + ret = get_id(dv_ind, amd::smi::kDevDevRevID, revision); + outss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret); + LOG_TRACE(outss); + return ret; +} + rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) { TRY @@ -2503,7 +2518,16 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, } if (temperature == nullptr) { - return RSMI_STATUS_INVALID_ARGS; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: temperature was a null ptr reference" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; } // The HBM temperature is retreived from the gpu_metrics @@ -2512,12 +2536,32 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, || sensor_type == RSMI_TEMP_TYPE_HBM_2 || sensor_type == RSMI_TEMP_TYPE_HBM_3) { if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: To retreive HBM temp, we only support metric = " + << "RSMI_TEMP_CURRENT" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } rsmi_gpu_metrics_t gpu_metrics; ret = 
rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: rsmi_dev_gpu_metrics_info_get returned " + << getRSMIStatusString(ret) + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); return ret; } @@ -2537,11 +2581,28 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, default: return RSMI_STATUS_INVALID_ARGS; } - if (val_ui16 == UINT16_MAX) + if (val_ui16 == UINT16_MAX) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: Reached UINT16 max value, overflow" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; - else + } else *temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE; + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Data: " << *temperature + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; + LOG_INFO(ss); return RSMI_STATUS_SUCCESS; } // end HBM temperature @@ -2550,6 +2611,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, GET_DEV_FROM_INDX if (dev->monitor() == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: monitor returned nullptr" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } std::shared_ptr m = dev->monitor(); @@ -2563,6 +2633,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, 
CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index) ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature); + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Sensor_index: " << sensor_index + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Data: " << *temperature + << " | Returning = " + << getRSMIStatusString(ret) << " | "; + LOG_INFO(ss); return ret; CATCH @@ -2995,6 +3074,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t reserved, DEVICE_MUTEX rsmi_status_t ret = get_power_profiles(dv_ind, status, nullptr); + return ret; CATCH } @@ -3015,6 +3095,7 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy, return RSMI_STATUS_NOT_SUPPORTED; } rsmi_status_t ret = set_power_profile(dv_ind, profile); + return ret; CATCH } @@ -3052,6 +3133,14 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, DEVICE_MUTEX ret = get_dev_value_int(mem_type_file, dv_ind, total); + // Fallback to KFD reported memory if VRAM total is 0 + if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) { + GET_DEV_AND_KFDNODE_FROM_INDX + if (kfd_node->get_total_memory(total) == 0 && *total > 0) { + return RSMI_STATUS_SUCCESS; + } + } + return ret; CATCH } @@ -3088,6 +3177,17 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, DEVICE_MUTEX ret = get_dev_value_int(mem_type_file, dv_ind, used); + // Fallback to KFD reported memory if no VRAM + if (mem_type == RSMI_MEM_TYPE_VRAM && *used == 0) { + GET_DEV_AND_KFDNODE_FROM_INDX + uint64_t total = 0; + ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total); + if (total != 0) return ret; // do not need to fallback + if ( kfd_node->get_used_memory(used) == 0 ) { + return RSMI_STATUS_SUCCESS; + } + } + return ret; CATCH } diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 18c96b7f13..ddaf41a44a 100755 --- 
a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -59,6 +59,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" @@ -85,6 +86,7 @@ static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; static const char *kDevDevProdNameFName = "product_name"; static const char *kDevDevProdNumFName = "product_number"; static const char *kDevDevIDFName = "device"; +static const char *kDevDevRevIDFName = "revision"; static const char *kDevVendorIDFName = "vendor"; static const char *kDevSubSysDevIDFName = "subsystem_device"; static const char *kDevSubSysVendorIDFName = "subsystem_vendor"; @@ -238,6 +240,7 @@ static const std::map kDevAttribNameMap = { {kDevDevProdName, kDevDevProdNameFName}, {kDevDevProdNum, kDevDevProdNumFName}, {kDevDevID, kDevDevIDFName}, + {kDevDevRevID, kDevDevRevIDFName}, {kDevVendorID, kDevVendorIDFName}, {kDevSubSysDevID, kDevSubSysDevIDFName}, {kDevSubSysVendorID, kDevSubSysVendorIDFName}, @@ -374,12 +377,13 @@ static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, {"rsmi_dev_id_get", {{kDevDevIDFName}, {}}}, + {"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}}, {"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}}, - {"rsmi_dev_name_get", {{kDevVendorIDFName, kDevDevIDFName}, {}}}, {"rsmi_dev_sku_get", {{kDevDevProdNumFName}, {}}}, - {"rsmi_dev_brand_get", {{kDevVendorIDFName}, {}}}, + {"rsmi_dev_brand_get", {{kDevVendorIDFName, + kDevVBiosVerFName}, {}}}, {"rsmi_dev_vendor_name_get", {{kDevVendorIDFName}, {}}}, {"rsmi_dev_serial_number_get", {{kDevSerialNumberFName}, {}}}, {"rsmi_dev_subsystem_id_get", {{kDevSubSysDevIDFName}, {}}}, @@ -823,7 +827,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, } ss << "Successfully read DevInfoBinary for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) 
<< ") - SYSFS (" - << sysfs_path << "), returning binaryData = " << p_binary_data; + << sysfs_path << "), returning binaryData = " << p_binary_data + << "; byte_size = " << std::dec << static_cast(b_size); + + std::string metricDescription = "AMD SMI GPU METRICS (16-byte width), " + + sysfs_path; + logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16); LOG_INFO(ss); return 0; } @@ -888,6 +897,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { switch (type) { case kDevDevID: + case kDevDevRevID: case kDevSubSysDevID: case kDevSubSysVendorID: case kDevVendorID: @@ -1025,6 +1035,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevDevProdName: case kDevDevProdNum: case kDevDevID: + case kDevDevRevID: case kDevSubSysDevID: case kDevSubSysVendorID: case kDevVendorID: @@ -1375,6 +1386,7 @@ std::string Device::readBootPartitionState( return boot_state; } + #undef RET_IF_NONZERO } // namespace smi } // namespace amd diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc index 62281dfe4a..3e147e2f9f 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -61,6 +61,10 @@ #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_logger.h" + +using namespace ROCmLogging; +using namespace amd::smi; #define TRY try { #define CATCH } catch (...) 
{return amd::smi::handleException();} @@ -140,6 +144,196 @@ typedef struct { } rsmi_gpu_metrics_v_1_3; + +// log current gpu_metrics file content read +// any metrics value can be a nullptr +void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, + const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2, + const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, + const rsmi_gpu_metrics_t *rsmi_gpu_metrics) { + if (RocmSMI::getInstance().isLoggingOn() == false) { + return; + } + std::ostringstream ss; + if (gpu_metrics_table_header != nullptr) { + ss + /* Common Header */ + << print_unsigned_hex_and_int( + gpu_metrics_table_header->structure_size, + "gpu_metrics_table_header->structure_size") + << print_unsigned_hex_and_int( + gpu_metrics_table_header->format_revision, + "gpu_metrics_table_header->format_revision") + << print_unsigned_hex_and_int( + gpu_metrics_table_header->content_revision, + "gpu_metrics_table_header->content_revision"); + LOG_DEBUG(ss); + } + if (rsmi_gpu_metrics == nullptr) { + return; + } else { + // do nothing - continue + } + ss + /* Common Header */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.structure_size, + "rsmi_gpu_metrics->common_header.structure_size") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.format_revision, + "rsmi_gpu_metrics->common_header.format_revision") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.content_revision, + "rsmi_gpu_metrics->common_header.content_revision") + /* Temperature */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_edge, + "rsmi_gpu_metrics->temperature_edge") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_hotspot, + "rsmi_gpu_metrics->temperature_hotspot") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_mem, + "rsmi_gpu_metrics->temperature_mem") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrgfx, + "rsmi_gpu_metrics->temperature_vrgfx") + << 
print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrsoc, + "rsmi_gpu_metrics->temperature_vrsoc") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrmem, + "rsmi_gpu_metrics->temperature_vrmem") + /* Utilization */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_gfx_activity, + "rsmi_gpu_metrics->average_gfx_activity") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_umc_activity, + "rsmi_gpu_metrics->average_umc_activity") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_mm_activity, + "rsmi_gpu_metrics->average_mm_activity") + /* Power/Energy */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_socket_power, + "rsmi_gpu_metrics->average_socket_power") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->energy_accumulator, + "rsmi_gpu_metrics->energy_accumulator") + /* Driver attached timestamp (in ns) */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->system_clock_counter, + "rsmi_gpu_metrics->system_clock_counter") + /* Average clocks */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_gfxclk_frequency, + "rsmi_gpu_metrics->average_gfxclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_socclk_frequency, + "rsmi_gpu_metrics->average_socclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_uclk_frequency, + "rsmi_gpu_metrics->average_uclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_vclk0_frequency, + "rsmi_gpu_metrics->average_vclk0_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_dclk0_frequency, + "rsmi_gpu_metrics->average_dclk0_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_vclk1_frequency, + "rsmi_gpu_metrics->average_vclk1_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_dclk1_frequency, + "rsmi_gpu_metrics->average_dclk1_frequency") + /* Current clocks */ + << print_unsigned_hex_and_int( + 
rsmi_gpu_metrics->current_gfxclk, + "rsmi_gpu_metrics->current_gfxclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_socclk, + "rsmi_gpu_metrics->current_socclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_uclk, + "rsmi_gpu_metrics->current_uclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_vclk0, + "rsmi_gpu_metrics->current_vclk0") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_dclk0, + "rsmi_gpu_metrics->current_dclk0") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_vclk1, + "rsmi_gpu_metrics->current_vclk1") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_dclk1, + "rsmi_gpu_metrics->current_dclk1") + /* Throttle status */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->throttle_status, + "rsmi_gpu_metrics->throttle_status") + /* Fans */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_fan_speed, + "rsmi_gpu_metrics->current_fan_speed") + /* Link width/speed */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->pcie_link_width, + "rsmi_gpu_metrics->pcie_link_width") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->pcie_link_speed, + "rsmi_gpu_metrics->pcie_link_speed") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->padding, + "rsmi_gpu_metrics->padding") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->gfx_activity_acc, + "rsmi_gpu_metrics->gfx_activity_acc") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->mem_actvity_acc, + "rsmi_gpu_metrics->mem_actvity_acc"); + for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) { + ss << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_hbm[i], + "rsmi_gpu_metrics->temperature_hbm[" + std::to_string(i) + "]"); + } + + if (rsmi_gpu_metrics_v_1_2 != nullptr) { + /* PMFW attached timestamp (10ns resolution) */ + ss + << print_unsigned_hex_and_int( + rsmi_gpu_metrics_v_1_2->firmware_timestamp, + "rsmi_gpu_metrics_v_1_2->firmware_timestamp"); + } + + if (gpu_metrics_v_1_3 != nullptr) { + /* PMFW 
attached timestamp (10ns resolution) */ + ss + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->firmware_timestamp, + "gpu_metrics_v_1_3->firmware_timestamp") + /* Voltage (mV) */ + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->voltage_soc, + "gpu_metrics_v_1_3->voltage_soc") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->gfx_voltage, + "gpu_metrics_v_1_3->voltage_gfx") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->mem_voltage, + "gpu_metrics_v_1_3->voltage_mem") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->padding1, + "gpu_metrics_v_1_3->padding1") + /* Throttle status (ASIC independent) */ + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->indep_throttle_status, + "gpu_metrics_v_1_3->indep_throttle_status"); + } + LOG_DEBUG(ss); +} + static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, rsmi_gpu_metrics_t *data, uint8_t content_v) { assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 && @@ -269,16 +463,28 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { rsmi_gpu_metrics_v_1_3 smu_v_1_3; rsmi_status_t ret; + std::ostringstream ss; if (!dev->gpu_metrics_ver().structure_size) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver()); + log_gpu_metrics(&dev->gpu_metrics_ver(), nullptr, nullptr, nullptr); if (ret != RSMI_STATUS_SUCCESS) { + ss << "Returning = " << getRSMIStatusString(ret) + << ",\ndev->gpu_metrics_ver().structure_size = " + << print_unsigned_int(dev->gpu_metrics_ver().structure_size) + << ", could not read common header"; + LOG_ERROR(ss); return ret; } } // only supports gpu_metrics_v1_x version if (dev->gpu_metrics_ver().format_revision != 1) { + ss << "Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) + << ",\ndev->gpu_metrics_ver().format_revision = " + << print_unsigned_int(dev->gpu_metrics_ver().format_revision) + << " was not equal to 1"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } @@ 
-290,19 +496,31 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { RSMI_GPU_METRICS_API_CONTENT_VER_1) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_t), smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_1"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, nullptr, smu); } else if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_2) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2); map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_2"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, &smu_v_1_2, nullptr, smu); } else if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_3) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3); map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_3"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, &smu_v_1_3, smu); } else { ret = GetGPUMetricsFormat1(dv_ind, smu, dev->gpu_metrics_ver().content_revision); + ss << __PRETTY_FUNCTION__ << " | GetGPUMetricsFormat1"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, nullptr, smu); } if (ret != RSMI_STATUS_SUCCESS) { diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc index 13aed64588..092bcb3414 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc @@ -43,6 +43,9 @@ #include #include +#include +#include +#include #include #include @@ -770,6 +773,95 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, return 0; } +// /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties +// size_in_bytes 68702699520 +int KFDNode::get_total_memory(uint64_t* total) { + if (total 
== nullptr) return EINVAL; + *total = 0; + + std::string f_path = kKFDNodesPathRoot; + f_path += "/"; + f_path += std::to_string(node_indx_); + f_path += "/mem_banks"; + + auto kfd_node_dir = opendir(f_path.c_str()); + if (kfd_node_dir == nullptr) { + return errno; + } + auto dentry = readdir(kfd_node_dir); + while (dentry != nullptr) { + if (dentry->d_name[0] == '.') { + dentry = readdir(kfd_node_dir); + continue; + } + + if (!is_number(dentry->d_name)) { + dentry = readdir(kfd_node_dir); + continue; + } + + // read "size_in_bytes 68702699520" line + const std::string size_in_bytes_property = "size_in_bytes "; + std::string memory_bank_file = f_path + "/" + + dentry->d_name + "/properties"; + std::ifstream fs(memory_bank_file); + if (!fs) { + dentry = readdir(kfd_node_dir); + continue; + } + std::string line; + while (std::getline(fs, line)) { + if (line.substr(0, size_in_bytes_property.length()) + == size_in_bytes_property) { + auto bytes = line.substr(size_in_bytes_property.length()); + try { + *total += std::stol(bytes); + break; + } catch(...) 
{ + dentry = readdir(kfd_node_dir); + continue; + } + } + } // end loop for lines in property file + } // end loop for mem_bank directory + + if (closedir(kfd_node_dir)) { + std::string err_str = "Failed to close KFD node directory "; + err_str += f_path; + err_str += "."; + perror(err_str.c_str()); + return 1; + } + return 0; +} + +// ioctl on kfd node device +int KFDNode::get_used_memory(uint64_t* used) { + if (used == nullptr) return EINVAL; + static const char *kPathKFDIoctl = "/dev/kfd"; + + int kfd_fd = open(kPathKFDIoctl, O_RDWR | O_CLOEXEC); + if (kfd_fd <= 0) { + return 1; + } + struct kfd_ioctl_get_available_memory_args mem = {0, 0, 0}; + mem.gpu_id = gpu_id_; + if (ioctl(kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY , &mem) != 0) { + close(kfd_fd); + return 1; + } + close(kfd_fd); + + // used = total - available + uint64_t total = 0; + int ret = get_total_memory(&total); + if (ret == 0 && total > 0 && mem.available < total) { + *used = total - mem.available; + return 0; + } + + return 1; +} } // namespace smi } // namespace amd diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_logger.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_logger.cc index 9472e40717..24ddd6d6f2 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_logger.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_logger.cc @@ -177,6 +177,9 @@ void Logger::error(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -208,6 +211,9 @@ void Logger::alarm(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -239,6 +245,9 @@ void Logger::always(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + 
logIntoFile(data); } } @@ -303,6 +312,10 @@ void Logger::info(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_INFO)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_INFO)) { + logOnConsole(data); + logIntoFile(data); } } @@ -333,6 +346,10 @@ void Logger::trace(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_TRACE)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_TRACE)) { + logOnConsole(data); + logIntoFile(data); } } @@ -363,6 +380,10 @@ void Logger::debug(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_DEBUG)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_DEBUG)) { + logOnConsole(data); + logIntoFile(data); } } @@ -424,6 +445,9 @@ std::string Logger::getLogSettings() { case CONSOLE: logSettings += "LogType = CONSOLE"; break; + case BOTH_FILE_AND_CONSOLE: + logSettings += "LogType = BOTH_FILE_AND_CONSOLE"; + break; default: logSettings += "LogType = "; } @@ -471,7 +495,26 @@ void Logger::initialize_resources() { } m_File.open(logFileName.c_str(), std::ios::out | std::ios::app); m_LogLevel = LOG_LEVEL_TRACE; - m_LogType = FILE_LOG; + // RSMI_LOGGING = 1, output to logs only + // RSMI_LOGGING = 2, output to console only + // RSMI_LOGGING = 3, output to logs and console + switch (amd::smi::RocmSMI::getInstance().getLogSetting()) { + case 0: + m_LogType = NO_LOG; + break; + case 1: + m_LogType = FILE_LOG; + break; + case 2: + m_LogType = CONSOLE; + break; + case 3: + m_LogType = BOTH_FILE_AND_CONSOLE; + break; + default: + m_LogType = NO_LOG; + break; + } if (!m_File.is_open()) { std::cout << "WARNING: Issue opening log file (" << logFileName << ") to write." 
<< std::endl; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc index 3a5565dbe9..8cb95fe7f2 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc @@ -84,6 +84,7 @@ amd::smi::RocmSMI::devInfoTypesStrings = { {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, {amd::smi::kDevDevID, amdSMI + "kDevDevID"}, + {amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"}, {amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"}, {amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"}, {amd::smi::kDevVendorID, amdSMI + "kDevVendorID"}, @@ -169,6 +170,7 @@ static uint32_t GetDeviceIndex(const std::string s) { // computed for cardX. // On success, return drm_minor which is >= 128 otherwise return 0 static uint32_t GetDrmRenderMinor(const std::string s) { + std::ostringstream ss; std::string drm_path = s; int drm_minor = 0; const std::string render_file_prefix = "renderD"; @@ -194,6 +196,10 @@ static uint32_t GetDrmRenderMinor(const std::string s) { if (closedir(drm_dir)) { return 0; } + + ss << __PRETTY_FUNCTION__ << " | Discovered drmRenderMinor = " + << std::to_string(drm_minor) << " | For drm_path = " << drm_path << " | "; + LOG_DEBUG(ss); return static_cast(drm_minor); } @@ -376,11 +382,15 @@ RocmSMI::Initialize(uint64_t flags) { // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. 
+ std::ostringstream ss; auto dev_iter = devices_.begin(); while (dev_iter != devices_.end()) { uint64_t bdfid = (*dev_iter)->bdfid(); if (tmp_map.find(bdfid) == tmp_map.end()) { + ss << __PRETTY_FUNCTION__ << " | removing device = " + << (*dev_iter)->path(); dev_iter = devices_.erase(dev_iter); + LOG_DEBUG(ss); continue; } dev_iter++; @@ -410,6 +420,9 @@ RocmSMI::Initialize(uint64_t flags) { } // Leaving below to help debug temp file issues // displayAppTmpFilesContent(); + std::string amdGPUDeviceList = displayAllDevicePaths(devices_); + ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList; + LOG_DEBUG(ss); } void @@ -457,17 +470,21 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) { // provides a way to get env variable detail in both debug & release // helps enable full logging -static bool getRSMIEnvVar_LoggingEnabled(const char *ev_str) { - bool isLoggingEnabled = false; +// RSMI_LOGGING = 1, output to logs only +// RSMI_LOGGING = 2, output to console only +// RSMI_LOGGING = 3, output to logs and console +static uint32_t getRSMIEnvVar_LoggingEnabled(const char *ev_str) { + uint32_t ret = 0; ev_str = getenv(ev_str); - if (ev_str != nullptr) { - isLoggingEnabled = true; + int ev_ret = atoi(ev_str); + ret = static_cast(ev_ret); } - return isLoggingEnabled; + return ret; } -static std::unordered_set GetEnvVarUIntegerSets(const char *ev_str) { +static inline std::unordered_set GetEnvVarUIntegerSets( + const char *ev_str) { std::unordered_set returnSet; #ifndef DEBUG (void)ev_str; @@ -518,7 +535,16 @@ const RocmSMI_env_vars& RocmSMI::getEnv(void) { } bool RocmSMI::isLoggingOn(void) { + bool isLoggingOn = false; GetEnvVariables(); + if (this->env_vars_.logging_on > 0 + && this->env_vars_.logging_on <= 3) { + isLoggingOn = true; + } + return isLoggingOn; +} + +uint32_t RocmSMI::getLogSetting() { return this->env_vars_.logging_on; } @@ -543,7 +569,9 @@ void RocmSMI::printEnvVarInfo(void) { << ((env_vars_.debug_inf_loop == 0) ? 
"" : std::to_string(env_vars_.debug_inf_loop)) << std::endl; - bool isLoggingOn = (env_vars_.logging_on) ? true : false; + std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " + << getLogSetting() << std::endl; + bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " << (isLoggingOn ? "true" : "false") << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; @@ -630,6 +658,9 @@ RocmSMI::FindMonitor(std::string monitor_path) { } void RocmSMI::AddToDeviceList(std::string dev_name) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); auto dev_path = std::string(kPathDRMRoot); dev_path += "/"; dev_path += dev_name; @@ -646,6 +677,10 @@ RocmSMI::AddToDeviceList(std::string dev_name) { GetSupportedEventGroups(card_indx, dev->supported_event_groups()); devices_.push_back(dev); + ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = " + << dev_name << " | path = " << dev_path + << " | card index = " << std::to_string(card_indx) << " | "; + LOG_DEBUG(ss); return; } @@ -653,16 +688,26 @@ RocmSMI::AddToDeviceList(std::string dev_name) { static const uint32_t kAmdGpuId = 0x1002; static bool isAMDGPU(std::string dev_path) { + bool isAmdGpu = false; + std::ostringstream ss; std::string vend_path = dev_path + "/device/vendor"; if (!FileExists(vend_path.c_str())) { - return false; + ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path + << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": + "is an amdgpu device - FALSE"); + LOG_DEBUG(ss); + return isAmdGpu; } std::ifstream fs; fs.open(vend_path); if (!fs.is_open()) { - return false; + ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path + << " is " << (isAmdGpu ? 
"is an amdgpu device - TRUE": + "is an amdgpu device - FALSE"); + LOG_DEBUG(ss); + return isAmdGpu; } uint32_t vendor_id; @@ -672,9 +717,13 @@ static bool isAMDGPU(std::string dev_path) { fs.close(); if (vendor_id == kAmdGpuId) { - return true; + isAmdGpu = true; } - return false; + ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path + << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": + "is an amdgpu device - FALSE"); + LOG_DEBUG(ss); + return isAmdGpu; } uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_monitor.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_monitor.cc index 82e40ff7f9..982c660447 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_monitor.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_monitor.cc @@ -313,6 +313,7 @@ int Monitor::writeMonitor(MonitorTypes type, uint32_t sensor_id, // This string version should work for all valid monitor types int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id, std::string *val) { + std::ostringstream ss; assert(val != nullptr); std::string temp_str; @@ -320,11 +321,21 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id, DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr) int ret = ReadSysfsStr(sysfs_path, val); + ss << __PRETTY_FUNCTION__ + << " | Success | Read hwmon file: " << sysfs_path + << " | Type: " << monitorTypesToString.at(type) + << " | Sensor id: " << std::to_string(sensor_id) + << " | Data: " << *val + << " | Returning: " << std::to_string(ret) << " |"; + LOG_INFO(ss); return ret; } int32_t Monitor::setTempSensorLabelMap(void) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); std::string type_str; int ret; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc index e4827216b9..7e90d29209 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc @@ -52,11 +52,14 
@@ #include #include #include +#include #include #include #include #include #include +#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -103,7 +106,7 @@ bool FileExists(char const *filename) { return (stat(filename, &buf) == 0); } -static void debugFilesDiscovered(std::vector files) { +static inline void debugFilesDiscovered(std::vector files) { std::ostringstream ss; int numberOfFilesFound = static_cast(files.size()); ss << "fileName.size() = " << numberOfFilesFound @@ -204,9 +207,10 @@ int ReadSysfsStr(std::string path, std::string *retStr) { if (!fs.is_open()) { ret = errno; errno = 0; - oss << "Could not read SYSFS file (" << path << ")" - << ", returning " << std::to_string(ret) << " (" - << std::strerror(ret) << ")"; + oss << __PRETTY_FUNCTION__ + << " | Fail | Cause: file does not exist or permissions issue" + << " | SYSFS file: " << path + << " | Returning: " << std::strerror(ret) << " |"; LOG_ERROR(oss); return ret; } @@ -457,9 +461,13 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, } chmod(fileName, S_IRUSR|S_IRGRP|S_IROTH); - write(fd, storageData.c_str(), storageData.size()); + ssize_t rc_write = write(fd, storageData.c_str(), storageData.size()); close(fd); - return RSMI_STATUS_SUCCESS; + if (rc_write == -1) { + return RSMI_STATUS_FILE_ERROR; + } else { + return RSMI_STATUS_SUCCESS; + } } std::vector getListOfAppTmpFiles() { @@ -531,19 +539,39 @@ void displayAppTmpFilesContent() { } // Used to debug vector string list and their content -void displayVectorContent(std::vector v) { - std::cout << "Vector = {"; +std::string debugVectorContent(std::vector v) { + std::ostringstream ss; + ss << "Vector = {"; if (v.size() > 0) { for (auto it=v.begin(); it < v.end(); it++) { - std::cout << *it; + ss << *it; auto temp_it = it; if(++temp_it != v.end()) { - std::cout << ", "; + ss << ", "; } } - } else { - std::cout << "}" << std::endl; } + ss << "}" << std::endl; + + return ss.str(); +} + +// Used 
to debug vector string list and their content +std::string displayAllDevicePaths(std::vector> v) { + std::ostringstream ss; + ss << "Vector = {"; + if (v.size() > 0) { + for (auto it=v.begin(); it < v.end(); it++) { + ss << (*it)->path(); + auto temp_it = it; + if(++temp_it != v.end()) { + ss << ", "; + } + } + } + ss << "}" << std::endl; + + return ss.str(); } // Attempts to read application specific temporary file @@ -595,14 +623,20 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // string domainName = domain name of the the system's node on the network // string os_distribution = pretty name of os distribution // (typically found in /etc/*-release file) +// string endianness = system's endianness. +// Expressed as big endian or little endian. +// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) +// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) std::tuple getSystemDetails(void) { + std::string, std::string, std::string, std::string> + getSystemDetails(void) { struct utsname buf; bool errorDetected = false; std::string temp_data; std::string sysname, nodename, release, version, machine; std::string domainName = ""; std::string os_distribution = ""; + std::string endianness = ""; if (uname(&buf) < 0) { errorDetected = true; @@ -630,8 +664,16 @@ std::tuple 64) bytesPerLine = 16; + + size_t i; + unsigned char buff[bytesPerLine + 1]; + const unsigned char *pc // ptr to data (char, 1 byte sized data) + = (const unsigned char *) addr; + + // Output description if given. + // if (desc != NULL) printf("%s:\n", desc); + if (desc != NULL) ss << "\n" << desc << "\n"; + + // Length checks. 
+ if (len == 0) { + // printf(" ZERO LENGTH\n"); + ss << " ZERO LENGTH\n"; + LOG_ERROR(ss); + return; + } + std::string endianness = ""; + if (isSystemBigEndian()) { + endianness = "** System is Big Endian, multi-bit symbols encoded as" + " big endian (MSB first) **"; + } else { + endianness = "** System is Little Endian, multi-bit symbols encoded as" + " little endian (LSB first) **"; + } + ss << "\t" << endianness << "\n"; + + // Process every byte in the data. + for (i = 0; i < len; i++) { + // Multiple of bytesPerLine means new or first line (with line offset). + if ((i % bytesPerLine) == 0) { + // Only print previous-line ASCII buffer for lines beyond first. + // if (i != 0) printf(" %s\n", buff); + if (i != 0) ss << " " << buff << "\n"; + // Output the offset of current line. + // printf(" %08lx ", i); + ss << " " << std::setw(8) << std::setfill('0') << std::hex << i << " "; + } + + // Now the hex code for the specific character. + // printf(" %02x", pc[i]); + + ss << " " << std::setw(2) << std::setfill('0') << std::hex + << static_cast(pc[i]); + + // And buffer a printable ASCII character for later. + // x20 = 32 || x7e = 126 (ascii table range) + if ((pc[i] < 0x20) || (pc[i] > 0x7e)) { // isprint() may be better. + buff[i % bytesPerLine] = '.'; + } else { + buff[i % bytesPerLine] = pc[i]; + } + buff[(i % bytesPerLine) + 1] = '\0'; + } + + // Pad out last line if not exactly bytesPerLine characters. + while ((i % bytesPerLine) != 0) { + // printf(" "); + ss << " "; + i++; + } + + // And print the final ASCII buffer. 
+ // printf(" %s\n", buff); + ss << " " << buff << "\n"; + LOG_DEBUG(ss); +} + +bool isSystemBigEndian() { + int n = 1; + bool isBigEndian = true; + if (*(char *)&n == 1) { + isBigEndian = false; + } + return isBigEndian; +} + +rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str) +{ + auto result = rsmi_status_t::RSMI_STATUS_SUCCESS; + auto bus_id = static_cast((bdf_id & 0x0000FF00) >> 8); + auto dev_id = static_cast((bdf_id & 0x000000F8) >> 3); + auto func_id = static_cast(bdf_id & 0x00000003); + + bfd_str = std::string(); + if (!(bus_id > 0)) { + result = rsmi_status_t::RSMI_STATUS_NO_DATA; + return result; + } + + std::stringstream bdf_sstream; + bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +bus_id << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +dev_id << "."; + bdf_sstream << std::hex << std::setfill('0') << +func_id; + bfd_str = bdf_sstream.str(); + return result; +} + + } // namespace smi } // namespace amd diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index f087cb5391..fa2e3edb29 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -578,6 +578,11 @@ amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle, return rsmi_wrapper(rsmi_dev_id_get, processor_handle, id); } +amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle, + uint16_t *revision) { + return rsmi_wrapper(rsmi_dev_revision_get, processor_handle, revision); +} + // TODO(bliu) : add fw info from libdrm amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle, amdsmi_fw_info_t *info) { diff --git a/projects/amdsmi/src/rocm_smi_properties.cc b/projects/amdsmi/src/rocm_smi_properties.cc new file mode 100644 index 0000000000..0e606e6874 --- /dev/null +++ b/projects/amdsmi/src/rocm_smi_properties.cc @@ -0,0 +1,560 @@ +/* + * 
============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. 
+ * + */ + +#include "rocm_smi/rocm_smi_properties.h" +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_device.h" +#include "rocm_smi/rocm_smi_logger.h" + +#include +#include +#include + + +// +// Property reinforcement check list +// +// NOTE: This is a *temporary solution* until we get a better approach, likely +// a driver API that can give us the capabilities of a GPU in question. +// +namespace amd { +namespace smi { + +const AMDGpuOpModeList_t amdgpu_opmode_check_list { + {AMDGpuPropertyOpModeTypes_t::kBareMetal, "Bare Metal"}, + {AMDGpuPropertyOpModeTypes_t::kSrIov, "SR-IOV"}, + {AMDGpuPropertyOpModeTypes_t::kBoth, "Both"}, +}; + +const AMDGpuPropertyTypesOffsetList_t amdgpu_typeoffset_check_list { + {AMDGpuPropertyTypesOffset_t::kNone, "None"}, + {AMDGpuPropertyTypesOffset_t::kDevInfoTypes, "Device Info Type"}, + {AMDGpuPropertyTypesOffset_t::kMonitorTypes, "Monitor Type"}, + {AMDGpuPropertyTypesOffset_t::kPerfTypes, "Performance Type"}, + {AMDGpuPropertyTypesOffset_t::kClkTypes, "Clock Type"}, + {AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, "Volt Metric Type"}, +}; + + +AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id) { + return (static_cast(type_offset) | (property_id)); +} + +AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) { + const auto property_type_offset_mask = + static_cast(AMDGpuPropertyTypesOffset_t::kDevInfoTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kMonitorTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kPerfTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kClkTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes); + + auto property_type_offset = (static_cast(property_type_offset_mask) & (property_id)); + auto property_type_id = (static_cast(property_id) & ~(property_type_offset_mask)); + + return property_type_id; +} + 
+AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyTypesOffset_t(static_cast(lhs) | static_cast(rhs)); +} + +AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyTypesOffset_t(static_cast(lhs) & static_cast(rhs)); +} + +AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyOpModeTypes_t(static_cast(lhs) | static_cast(rhs)); +} + +AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyOpModeTypes_t(static_cast(lhs) & static_cast(rhs)); +} + + +// +// Note: Due to the fact that we have different enum elements with the same +// number, keying a hash by the number is not an option; ie: +// - DevInfoTypes::kDevVendorID = 7 +// - MonitorTypes::kMonPowerCapDefault = 7 +// So, we are keying it by a unique key, based on their info types +// +const AMDGpuVerbList_t amdgpu_verb_check_list { + { AMDGpuVerbTypes_t::kNone, "None" }, + { AMDGpuVerbTypes_t::kSetGpuPciBandwidth, "amdsmi_set_gpu_pci_bandwidth" }, + { AMDGpuVerbTypes_t::kSetPowerCap, "amdsmi_set_power_cap" }, + { AMDGpuVerbTypes_t::kSetGpuPowerProfile, "amdsmi_set_gpu_power_profile" }, + { AMDGpuVerbTypes_t::kSetGpuClkRange, "amdsmi_set_gpu_clk_range" }, + { AMDGpuVerbTypes_t::kSetGpuOdClkInfo, "amdsmi_set_gpu_od_clk_info" }, + { AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, "amdsmi_set_gpu_od_volt_info" }, + { AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, "amdsmi_set_gpu_perf_level_v1" }, + { AMDGpuVerbTypes_t::kSetGpuPerfLevel, "amdsmi_set_gpu_perf_level" }, + { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, "amdsmi_get_gpu_power_profile_presets" }, + { 
AMDGpuVerbTypes_t::kResetGpu, "amdsmi_reset_gpu" }, + { AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, "amdsmi_set_gpu_perf_determinism_mode" }, + { AMDGpuVerbTypes_t::kSetGpuFanSpeed, "amdsmi_set_gpu_fan_speed" }, + { AMDGpuVerbTypes_t::kResetGpuFan, "amdsmi_reset_gpu_fan" }, + { AMDGpuVerbTypes_t::kSetClkFreq, "amdsmi_set_clk_freq" }, + { AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, "amdsmi_set_gpu_overdrive_level_v1" }, + { AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, "amdsmi_set_gpu_overdrive_level" }, + { AMDGpuVerbTypes_t::kGetGpuFanRpms, "amdsmi_get_gpu_fan_rpms" }, + { AMDGpuVerbTypes_t::kGetGpuFanSpeed, "amdsmi_get_gpu_fan_speed" }, + { AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, "amdsmi_get_gpu_fan_speed_max" }, + { AMDGpuVerbTypes_t::kGetGpuVoltMetric, "amdsmi_get_temp_metric" }, + { AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, "amdsmi_get_gpu_overdrive_level" }, + { AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, "amdsmi_get_gpu_od_volt_info" }, + { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" } +}; + +const uint16_t kDevRevIDAll(0xFFFF); +const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { + // + // {"Asic ID", {"Asic Rev. 
ID", "Unique Property ID", "Property Op.Mode", "Availability Flag"}} + // DevInfoTypes::kDevPCIEClk = rsmi_dev_pci_bandwidth_get; rsmi_dev_pci_bandwidth_set + // MonitorTypes::kMonPowerCapDefault = rsmi_dev_power_cap_default_get; + // DevInfoTypes::kDevPowerProfileMode = + // rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set; + // + + // AMD Instinct MI210 + {0x740F, {0x02, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kSetGpuPowerProfile, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + + // AMD MIxxx + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPCIEClk), + AMDGpuVerbTypes_t::kSetGpuPciBandwidth, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonPowerCapDefault), + AMDGpuVerbTypes_t::kSetPowerCap, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kSetGpuPowerProfile, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuClkRange, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuOdClkInfo, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, 
+ {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_AUTO), + AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuPerfLevel, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevGpuReset), + AMDGpuVerbTypes_t::kResetGpu, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM), + AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanSpeed), + AMDGpuVerbTypes_t::kSetGpuFanSpeed, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanCntrlEnable), + AMDGpuVerbTypes_t::kResetGpuFan, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kClkTypes, + rsmi_clk_type::RSMI_CLK_TYPE_FIRST), + AMDGpuVerbTypes_t::kSetClkFreq, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, + 
AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanRPMs), + AMDGpuVerbTypes_t::kGetGpuFanRpms, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanSpeed), + AMDGpuVerbTypes_t::kGetGpuFanSpeed, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonMaxFanSpeed), + AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, + rsmi_voltage_metric_t::RSMI_VOLT_CURRENT), + AMDGpuVerbTypes_t::kGetGpuVoltMetric, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerODVoltage), + AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerODVoltage), + AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + } +}; + + +rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbTypes_t verb_type, 
rsmi_status_t actual_error_code) +{ + std::ostringstream osstream; + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + osstream << __PRETTY_FUNCTION__ << " actual error code: " << actual_error_code << "\n"; + LOG_TRACE(osstream); + + if (actual_error_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + return actual_error_code; + } + + // + // For property reinforcement query, the possible return values are: + // RSMI_STATUS_SUCCESS: + // - Property found in the reinforcement table, and it *should exist* + // RSMI_STATUS_NOT_SUPPORTED: + // - Property found in the reinforcement table, and it *should not* exist + // RSMI_STATUS_NO_DATA: + // - Could not find the correct dev_id and dev_revision info to build the filter + // RSMI_STATUS_UNKNOWN_ERROR: + // - The results are initialized with that. If that is returned, + // likely the reinforcement table does not contain any entries/rules for the + // dev_id in question. + // + auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) { + switch (query_result) { + case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR): + case (rsmi_status_t::RSMI_STATUS_NO_DATA): + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + break; + + case (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED): + case (rsmi_status_t::RSMI_STATUS_SUCCESS): + return query_result; + break; + + default: + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + break; + } + }; + + /// + GET_DEV_FROM_INDX + osstream << __PRETTY_FUNCTION__ << "| ======= about to run property query =======" + << " [query filters: ]" + << " device: " << dv_ind + << " property/verb: " << static_cast(verb_type) << amdgpu_verb_check_list.at(verb_type); + auto reinforcement_query_result = dev->check_amdgpu_property_reinforcement_query(dv_ind, verb_type); + osstream << __PRETTY_FUNCTION__ << "| ======= result from property query =======" + << " query result: " << reinforcement_query_result; + + reinforcement_query_result = amdgpu_property_query_result_hdlr(reinforcement_query_result); 
+ osstream << __PRETTY_FUNCTION__ << "| ======= result from property query =======" + << " query result: " << reinforcement_query_result; + + return reinforcement_query_result; +} + +void dump_amdgpu_property_reinforcement_list() +{ + std::ostringstream osstream; + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + if (!amdgpu_property_reinforcement_list.empty()) { + for (const auto& property : amdgpu_property_reinforcement_list) { + osstream << __PRETTY_FUNCTION__ + << " Asic ID: " << property.first + << " Asic Rev.ID: " << property.second.m_pci_rev_id + << " Property ID: " << property.second.m_property + << " Verb ID : " << static_cast(property.second.m_verb_id) + << " Verb Desc: " << amdgpu_verb_check_list.at(property.second.m_verb_id) + << " OpMode: " << static_cast(property.second.m_opmode) + << " OpMode Desc: " << amdgpu_opmode_check_list.at(property.second.m_opmode) + << " Flag Avail.: " << property.second.m_should_be_available; + } + osstream << __PRETTY_FUNCTION__ << "| ======= end ======="; + return; + } + + osstream << __PRETTY_FUNCTION__ << "amdgpu_property_reinforcement_list is empty"; + LOG_TRACE(osstream); +} + + +rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type) +{ + std::ostringstream osstream; + auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); + + AMDGpuPropertyQuery_t amdgpu_property_query = [&]() { + AMDGpuPropertyQuery_t amdgpu_property_query_init{}; + amdgpu_property_query_init.m_asic_id = 0; + amdgpu_property_query_init.m_pci_rev_id = 0; + amdgpu_property_query_init.m_dev_idx = dev_idx; + amdgpu_property_query_init.m_property = 0; + amdgpu_property_query_init.m_verb_id = verb_type; + return amdgpu_property_query_init; + }(); + + auto build_asic_id_filters = [&](const AMDGpuPropertyQuery_t& amdgpu_query_validate, bool& is_filter_good) { + auto tmp_amdgpu_query = amdgpu_query_validate; + auto id_filter_result(rsmi_status_t::RSMI_STATUS_SUCCESS); + if 
(amdgpu_query_validate.m_asic_id == 0) { + id_filter_result = rsmi_dev_id_get(dev_idx, &tmp_amdgpu_query.m_asic_id); + if (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) { + id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id); + } + } + is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? true : false; + return tmp_amdgpu_query; + }; + + // If the original amdgpu_query is missing parts of the filter, such as; + // asic_id, revision_id, we try to retrieve them based on the dev_idx. + // the property we are searching for, *must be present* . + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(osstream); + + bool is_proper_query(false); + amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query); + if (!is_proper_query) { + rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA; + osstream << __PRETTY_FUNCTION__ << "| ======= end =======" + << ", Missing Query Filters were not successfully retrieved: " + << " [query filters: ]" + << " device: " << dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " property: " << amdgpu_property_query.m_property + << " verb: " << static_cast(amdgpu_property_query.m_verb_id) + << " proper_query: " << is_proper_query + << " error: " << rsmi_status; + LOG_TRACE(osstream); + return rsmi_status; + } + + return run_amdgpu_property_reinforcement_query(amdgpu_property_query); +} + +rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query) +{ + std::ostringstream osstream; + auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); + + auto contains = [](const uint16_t asic_id) { + return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end()); + }; + + auto ends_with = [](const std::string& value, const std::string& ending) { + if (value.size() < ending.size()) { + 
return false; + } + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); + }; + + // Traverse through all values for a given key + osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; + LOG_TRACE(osstream); + if (contains(amdgpu_property_query.m_asic_id)) { + osstream << __PRETTY_FUNCTION__ << " asic id found in table: " << amdgpu_property_query.m_asic_id << "\n"; + auto itr_begin = amdgpu_property_reinforcement_list.lower_bound(amdgpu_property_query.m_asic_id); + auto itr_end = amdgpu_property_reinforcement_list.upper_bound(amdgpu_property_query.m_asic_id); + while (itr_begin != itr_end) { + // Still same key, and... + if (itr_begin->first == amdgpu_property_query.m_asic_id) { + osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n"; + // Pci_rev_id matches the filter or ALL Revisions + if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) || + (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { + osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n"; + // Do we have the property we are looking for? 
+ if (((amdgpu_property_query.m_property != 0) && + (itr_begin->second.m_property == amdgpu_property_query.m_property)) || + ((amdgpu_property_query.m_verb_id != AMDGpuVerbTypes_t::kNone) && + (itr_begin->second.m_verb_id == amdgpu_property_query.m_verb_id))) { + osstream << __PRETTY_FUNCTION__ + << " property found: " << itr_begin->second.m_property + << " verb found: " << static_cast(itr_begin->second.m_verb_id) + << " " << amdgpu_verb_check_list.at(amdgpu_property_query.m_verb_id) + << " should_be_available: " << itr_begin->second.m_should_be_available << "\n"; + // and if we do, should we consider it available, or forcefully + // considered it unavailable + osstream << __PRETTY_FUNCTION__ << "| ======= validating =======" + << ", Property found in the table for this device and flagged as *Not Available* : " + << " [query filters: ]" + << " device: " << amdgpu_property_query.m_dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " reinf.tbl.rev. 
id: " << itr_begin->second.m_pci_rev_id; + // + // The property is set in the reinforcement table to 'it should not be available' + if (!itr_begin->second.m_should_be_available) { + // If the property is found and set to not available + // (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED), + // it should be all good (rsmi_status_t::RSMI_STATUS_SUCCESS); + rsmi_status = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + osstream << __PRETTY_FUNCTION__ + << " should_be_available: " << itr_begin->second.m_should_be_available + << " result: " << rsmi_status << "\n"; + LOG_TRACE(osstream); + return rsmi_status; + } + // + // The property is set in the reinforcement table to 'it should be available' + rsmi_status = rsmi_status_t::RSMI_STATUS_SUCCESS; + osstream << __PRETTY_FUNCTION__ + << " should_be_available: " << itr_begin->second.m_should_be_available + << " result: " << rsmi_status << "\n"; + LOG_TRACE(osstream); + return rsmi_status; + } + } + } + itr_begin++; + } + } + + osstream << __PRETTY_FUNCTION__ << "| ======= end =======" + << "Done searching for the Property in reinforcement table for this device: " + << " device: " << amdgpu_property_query.m_dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " property id: " << amdgpu_property_query.m_property + << " error: " << rsmi_status; + LOG_TRACE(osstream); + return rsmi_status; +} + + +} // namespace smi +} // namespace amd diff --git a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc index 9157bf35f9..10b8297138 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc @@ -106,7 +106,7 @@ void TestSysInfoRead::Run(void) { err = amdsmi_get_gpu_vbios_info(processor_handles_[i], &info); if (err != AMDSMI_STATUS_SUCCESS) { - if (err == AMDSMI_STATUS_FILE_ERROR) { + if ((err == 
AMDSMI_STATUS_FILE_ERROR) || (err == AMDSMI_STATUS_NOT_SUPPORTED)) { IF_VERB(STANDARD) { std::cout << "\t**VBIOS read: Not supported on this machine" << std::endl; diff --git a/projects/amdsmi/tests/amd_smi_test/rsmitst.exclude b/projects/amdsmi/tests/amd_smi_test/rsmitst.exclude index 3de139d45f..4b09cd5b29 100644 --- a/projects/amdsmi/tests/amd_smi_test/rsmitst.exclude +++ b/projects/amdsmi/tests/amd_smi_test/rsmitst.exclude @@ -55,6 +55,20 @@ FILTER[sienna_cichlid]=\ $BLACKLIST_ALL_ASICS\ "rsmitstReadWrite.TestPerfLevelReadWrite" +# SWDEV-391407 +FILTER[90400]=\ +$BLACKLIST_ALL_ASICS\ +"rsmitstReadOnly.TestVoltCurvRead:"\ +"rsmitstReadOnly.TestFrequenciesRead:"\ +"rsmitstReadWrite.TestFrequenciesReadWrite:"\ +"rsmitstReadWrite.TestPowerReadWrite" +FILTER[90401]=\ +$BLACKLIST_ALL_ASICS\ +"rsmitstReadOnly.TestVoltCurvRead:"\ +"rsmitstReadOnly.TestFrequenciesRead:"\ +"rsmitstReadWrite.TestFrequenciesReadWrite:"\ +"rsmitstReadWrite.TestPowerReadWrite" + # SWDEV-321166 FILTER[virtualization]=\ $BLACKLIST_ALL_ASICS\ @@ -63,4 +77,4 @@ $BLACKLIST_ALL_ASICS\ "rsmitstReadWrite.FanReadWrite:"\ "rsmitstReadWrite.TestOverdriveReadWrite:"\ "rsmitstReadWrite.TestPowerReadWrite:"\ -"rsmitstReadWrite.TestPowerCapReadWrite" \ No newline at end of file +"rsmitstReadWrite.TestPowerCapReadWrite" diff --git a/projects/amdsmi/tests/amd_smi_test/test_base.cc b/projects/amdsmi/tests/amd_smi_test/test_base.cc index b83f6a7a96..3069c102dc 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_base.cc +++ b/projects/amdsmi/tests/amd_smi_test/test_base.cc @@ -173,6 +173,13 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl; } + err = amdsmi_get_gpu_revision(dv_ind, &val_ui16); + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device Revision ID: 0x" << std::hex << + val_ui16 << std::endl; + } + amdsmi_board_info_t board_info; err = amdsmi_get_gpu_board_info(dv_ind, &board_info); 
CHK_ERR_ASRT(err) diff --git a/projects/amdsmi/tests/amd_smi_test/test_common.cc b/projects/amdsmi/tests/amd_smi_test/test_common.cc index 2aec2708bc..d880c109c6 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_common.cc +++ b/projects/amdsmi/tests/amd_smi_test/test_common.cc @@ -283,6 +283,7 @@ void DumpMonitorInfo(const TestBase *test) { }; print_val_str(amd::smi::kDevDevID, "Device ID: "); + print_val_str(amd::smi::kDevDevRevID, "Dev.Rev.ID: "); print_val_str(amd::smi::kDevPerfLevel, "Performance Level: "); print_val_str(amd::smi::kDevOverDriveLevel, "OverDrive Level: "); print_vector(amd::smi::kDevGPUMClk, diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc new file mode 100755 index 0000000000..1988d951a1 --- /dev/null +++ b/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc @@ -0,0 +1,325 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2019, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#include +#include + +#include +#include + +#include "gtest/gtest.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi_test/functional/id_info_read.h" +#include "rocm_smi_test/test_common.h" + +TestIdInfoRead::TestIdInfoRead() : TestBase() { + set_title("RSMI ID Info Read Test"); + set_description("This test verifies that ID information such as the " + "device, subsystem and vendor IDs can be read properly."); +} + +TestIdInfoRead::~TestIdInfoRead(void) { +} + +void TestIdInfoRead::SetUp(void) { + TestBase::SetUp(); + + return; +} + +void TestIdInfoRead::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestIdInfoRead::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestIdInfoRead::Close() { + // This will close handles opened within rsmitst utility calls and call + // rsmi_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + +static const uint32_t kBufferLen = 80; + +void TestIdInfoRead::Run(void) { + rsmi_status_t err; + uint16_t id; + uint64_t val_ui64; + uint32_t drm_render_minor; 
+ + char buffer[kBufferLen]; + + TestBase::Run(); + if (setup_failed_) { + std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl; + return; + } + + for (uint32_t i = 0; i < num_monitor_devs(); ++i) { + IF_VERB(STANDARD) { + std::cout << "\t*************************" << std::endl; + std::cout << "\t**Device index: " << i << std::endl; + } + + // Get the device ID, name, vendor ID and vendor name for the device + err = rsmi_dev_id_get(i, &id); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + rsmi_status_t ret; + // Verify api support checking functionality is working + ret = rsmi_dev_id_get(i, nullptr); + ASSERT_EQ(ret, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + + IF_VERB(STANDARD) { + std::cout << "\t**Device ID: 0x" << std::hex << id << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + // Get device Revision + err = rsmi_dev_revision_get(i, &id); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + rsmi_status_t ret; + // Verify api support checking functionality is working + ret = rsmi_dev_revision_get(i, nullptr); + ASSERT_EQ(ret, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + + IF_VERB(STANDARD) { + std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << id << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_revision_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + + err = rsmi_dev_name_get(i, buffer, kBufferLen); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Device Marketing name not found on this system." 
<< + std::endl; + // Verify api support checking functionality is working + err = rsmi_dev_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device Marketing name: " << buffer << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_brand_get(i, buffer, kBufferLen); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + // Verify api support checking functionality is working + err = rsmi_dev_brand_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device Brand name: " << buffer << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_brand_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_vram_vendor_get(i, buffer, kBufferLen); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**Vram Vendor string not supported on this system." 
<< std::endl; + err = rsmi_dev_vram_vendor_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device Vram Vendor name: " << buffer << std::endl; + } + err = rsmi_dev_vram_vendor_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_vendor_id_get(i, &id); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + // Verify api support checking functionality is working + err = rsmi_dev_vendor_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Vendor ID: 0x" << std::hex << id << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_vendor_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_drm_render_minor_get(i, &drm_render_minor); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + // Verify api support checking functionality is working + err = rsmi_dev_drm_render_minor_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**DRM Render Minor: " << drm_render_minor << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_drm_render_minor_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_vendor_name_get(i, buffer, kBufferLen); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Device Vendor name string not found on this system." 
<< + std::endl; + // Verify api support checking functionality is working + err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device Vendor name: " << buffer << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + + // Get the device ID, name, vendor ID and vendor name for the sub-device + err = rsmi_dev_subsystem_id_get(i, &id); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + // Verify api support checking functionality is working + err = rsmi_dev_subsystem_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Subsystem ID: 0x" << std::hex << id << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_subsystem_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_subsystem_name_get(i, buffer, kBufferLen); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Subsystem name string not found on this system." 
<< + std::endl; + // Verify api support checking functionality is working + err = rsmi_dev_subsystem_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Subsystem name: " << buffer << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_subsystem_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_subsystem_vendor_id_get(i, &id); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + // Verify api support checking functionality is working + err = rsmi_dev_subsystem_vendor_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Sub-system Vendor ID: 0x" << std::hex << + id << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_subsystem_vendor_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_vendor_name_get(i, buffer, kBufferLen); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**Subsystem Vendor name string not found on this system." << + std::endl; + // Verify api support checking functionality is working + err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Subsystem Vendor name: " << buffer << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + + err = rsmi_dev_pci_id_get(i, &val_ui64); + // Don't check for RSMI_STATUS_NOT_SUPPORTED since this should always be + // supported. It is not based on a sysfs file. 
+ CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**PCI ID (BDFID): 0x" << std::hex << val_ui64; + std::cout << " (" << std::dec << val_ui64 << ")" << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_pci_id_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + + err = rsmi_dev_serial_number_get(i, buffer, kBufferLen); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + // Verify api support checking functionality is working + err = rsmi_dev_serial_number_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + + std::cout << + "\t**Serial Number string not supported on this system." << std::endl; + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Device Serial Number:" << buffer << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_serial_number_get(i, nullptr, kBufferLen); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + } +}