Merge remote-tracking branch 'rocmsmi/amd-staging' into amd-dev

Change-Id: I9c38b4facd472b877d1ad133f3176a023c890955
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/amdsmi commit: 936719eeb6]
Tento commit je obsažen v:
Galantsev, Dmitrii
2023-08-22 10:42:51 -05:00
33 změnil soubory, kde provedl 2462 přidání a 82 odebrání
+3
Zobrazit soubor
@@ -28,3 +28,6 @@ _toc.yml
_build/
_doxygen/
docBin/
# Simulated SYSFS - for early development or debug
device/
+3 -1
Zobrazit soubor
@@ -62,7 +62,7 @@ EOF
# confirm logrotate file exists in daily
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -f /etc/cron.hourly/logrotate ]; then
if [ -d /etc/cron.hourly ]; then
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
else
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
@@ -77,6 +77,7 @@ EOF
"$packageName logs (when turned on) may not rotate properly."
fi
fi
return #done configuring for non-systemd timers
else
# Configure systemd timers - the typical setup for modern Linux logrotation setups
if [ -f /lib/systemd/system/logrotate.timer ]; then
@@ -102,6 +103,7 @@ EOF
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
"$packageName logs (when turned on) will not rotate properly."
fi
return #done configuring for systemd timers
fi
}
+1
Zobrazit soubor
@@ -38,6 +38,7 @@ rm_pyc() {
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/amd_smi/__pycache__
}
case "$1" in
( remove | upgrade)
rm_ldconfig
+3 -1
Zobrazit soubor
@@ -62,7 +62,7 @@ EOF
# confirm logrotate file exists in daily
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -f /etc/cron.hourly/logrotate ]; then
if [ -d /etc/cron.hourly ]; then
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
else
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
@@ -77,6 +77,7 @@ EOF
"$packageName logs (when turned on) may not rotate properly."
fi
fi
return #done configuring for non-systemd timers
else
# Configure systemd timers - the typical setup for modern Linux logrotation setups
if [ -f /lib/systemd/system/logrotate.timer ]; then
@@ -102,6 +103,7 @@ EOF
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
"$packageName logs (when turned on) will not rotate properly."
fi
return #done configuring for systemd timers
fi
}
+15
Zobrazit soubor
@@ -1456,6 +1456,21 @@ amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf, amdsmi_pr
*/
amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle, uint16_t *id);
/**
 * @brief Get the device revision associated with the device
 *
 * @details Given a processor handle @p processor_handle and a pointer to a
 * uint16_t @p revision, this function will write the revision id to the
 * uint16_t pointed to by @p revision.
 *
 * @param[in] processor_handle a processor handle
 *
 * @param[out] revision a pointer to uint16_t to which the device revision
 * will be written
 *
 * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
 */
amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle, uint16_t *revision);
/**
* @brief Get the name string for a given vendor ID
*
+395
Zobrazit soubor
@@ -0,0 +1,395 @@
/*
* MIT License
*
* Copyright (c) 2020 Open Compute Project
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <assert.h>
#include <dirent.h>
#include <sstream>
#include <cstring>
#include <iostream>
#include <regex> // NOLINT
#include <map>
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_counters.h"
#include "rocm_smi/rocm_smi_kfd.h"
#include "rocm_smi/rocm_smi.h"
#include "oam/oam_mapi.h"
#include "oam/amd_oam.h"
// Lookup table from AMDOAM status code to a human-readable description.
// Consulted by amdoam_get_error_description().
// NOTE(review): the keys are the plain AMDOAM_STATUS_* values, while the API
// functions in this file return the negated values (-AMDOAM_STATUS_*) --
// confirm which sign callers are expected to pass when looking up a code.
static const std::map<int, const char *> err_map = {
{ AMDOAM_STATUS_INVALID_ARGS, "Invalid arguments" },
{ AMDOAM_STATUS_NOT_SUPPORTED, "Feature not supported" },
{ AMDOAM_STATUS_FILE_ERROR, "Problem accessing a file" },
{ AMDOAM_STATUS_PERMISSION, "Permission denied" },
{ AMDOAM_STATUS_OUT_OF_RESOURCES, "Not enough memory or other resource" },
{ AMDOAM_STATUS_INTERNAL_EXCEPTION, "An internal exception was caught" },
{ AMDOAM_STATUS_INPUT_OUT_OF_BOUNDS,
"The provided input is out of allowable or safe range" },
{ AMDOAM_STATUS_INIT_ERROR, "AMDOAM is not initialized or init failed" },
{ AMDOAM_STATUS_ERROR, "Generic error" },
{ AMDOAM_STATUS_NOT_FOUND, "An item was searched for but not found" }
};
// TRY / CATCH bracket the body of an API entry point: any exception thrown
// between them is caught and turned into an integer error code by
// handleRSMIException() instead of propagating to the caller.
#define TRY try {
#define CATCH } catch (...) {return handleRSMIException();}
// Set to true by amdoam_init() after rsmi_init() succeeds; checked by
// amdoam_get_dev_properties().
static bool rsmi_initialized;
// Convert an rsmi_status_t into the negative error-code convention returned
// by the amdoam_* functions. Statuses up to and including
// RSMI_STATUS_INIT_ERROR are returned negated; anything larger collapses to
// the generic -AMDOAM_STATUS_ERROR.
static int rsmi_status_to_amdoam_errorcode(rsmi_status_t status) {
  if (!(status > RSMI_STATUS_INIT_ERROR)) {
    return -static_cast<int>(status);
  }
  return -AMDOAM_STATUS_ERROR;
}
static int handleRSMIException() {
rsmi_status_t ret = amd::smi::handleException();
return rsmi_status_to_amdoam_errorcode(ret);
}
// Look up the description string for an error code.
//
// @param code         key searched for in err_map
// @param description  out-parameter receiving a pointer to the static
//                     description string; left untouched on failure
// @return AMDOAM_STATUS_SUCCESS when found, -AMDOAM_STATUS_INVALID_ARGS when
//         description is null, -AMDOAM_STATUS_NOT_FOUND for an unknown code
int amdoam_get_error_description(int code, const char **description) {
  if (!description) {
    return -AMDOAM_STATUS_INVALID_ARGS;
  }

  const auto entry = err_map.find(code);
  if (entry != err_map.end()) {
    *description = entry->second;
    return AMDOAM_STATUS_SUCCESS;
  }
  return -AMDOAM_STATUS_NOT_FOUND;
}
int amdoam_init(void) {
TRY
rsmi_status_t status = rsmi_init(0);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
rsmi_initialized = true;
return AMDOAM_STATUS_SUCCESS;
CATCH
}
int amdoam_free(void) {
rsmi_status_t status = rsmi_shut_down();
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
return AMDOAM_STATUS_SUCCESS;
}
int amdoam_discover_devices(uint32_t *device_count) {
rsmi_status_t status;
if (device_count == nullptr) {
return -AMDOAM_STATUS_INVALID_ARGS;
}
status = rsmi_num_monitor_devices(device_count);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
return AMDOAM_STATUS_SUCCESS;
}
int amdoam_get_pci_properties(uint32_t device_id, oam_pci_info_t *pci_info) {
uint64_t bdfid;
TRY
if (pci_info == nullptr) {
return -AMDOAM_STATUS_INVALID_ARGS;
}
rsmi_status_t status = rsmi_dev_pci_id_get(device_id, &bdfid);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
pci_info->domain = (uint16_t)(bdfid >> 32) & 0xffff;
pci_info->bus = (bdfid >> 8) & 0xff;
pci_info->device = (bdfid >> 3) & 0x1f;
pci_info->function = bdfid & 0x7;
CATCH
return AMDOAM_STATUS_SUCCESS;
}
// Fill one oam_dev_properties_t entry per device index in [0, num_devices).
//
// Individual property queries are best-effort: a failing query leaves the
// corresponding field as the caller provided it.
//
// @param num_devices  number of entries available in @p devices
// @param devices      array to fill; must not be null
// @return AMDOAM_STATUS_SUCCESS, -AMDOAM_STATUS_INVALID_ARGS when devices is
//         null, -AMDOAM_STATUS_INIT_ERROR when amdoam_init() has not
//         succeeded, or the code produced by the CATCH macro on an exception
int amdoam_get_dev_properties(uint32_t num_devices,
                              oam_dev_properties_t *devices) {
  const size_t buf_size = 32;
  char buf[buf_size] = "";
  oam_dev_properties_t *dev = devices;

  TRY
  if (devices == nullptr)
    return -AMDOAM_STATUS_INVALID_ARGS;

  if (!rsmi_initialized)
    return -AMDOAM_STATUS_INIT_ERROR;

  for (uint32_t dev_inx = 0; dev_inx < num_devices; ++dev_inx, ++dev) {
    dev->device_id = dev_inx;
    /* If fails to get any following properties, it's not treated as a deal
     * breaker. Variable not filled means that property is not available on
     * this device or AMD doesn't support that property.
     */
    rsmi_dev_vendor_name_get(dev_inx, dev->device_vendor, DEVICE_VENDOR_LEN);
    rsmi_dev_name_get(dev_inx, dev->device_name, DEVICE_NAME_LEN);

    // Clear the scratch buffer for every device so that a failed VBIOS query
    // cannot leak the previous device's version string into this entry.
    std::memset(buf, 0, buf_size);
    rsmi_dev_vbios_version_get(dev_inx, buf, buf_size);
    buf[buf_size - 1] = '\0';  // guarantee termination before strlen()

    const size_t vbios_len = std::strlen(buf);
    if (vbios_len > 0) {
      // The SKU name is taken from 6 characters starting at offset 4 of the
      // VBIOS string; for strings too short to reach that offset copy an
      // empty string instead of reading past the terminator.
      const char *sku_src = (vbios_len > 4) ? &buf[4] : "";
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-truncation"
      std::strncpy(dev->sku_name, sku_src, 6);
      std::strncpy(dev->board_name, buf, 12);
#pragma GCC diagnostic pop
    }
    rsmi_dev_serial_number_get(dev_inx, dev->board_serial_number,
                               BOARD_SERIAL_NUM_LEN);
  }
  CATCH

  return AMDOAM_STATUS_SUCCESS;
}
// Scan a hwmon directory and return the highest sensor index found among
// entries whose name contains "<fn_reg><number>" at a word boundary
// (e.g. "temp3_input" for fn_reg == "temp").
//
// For fn_reg == "in" the index is incremented by one before comparison, so
// an "in0_*" entry counts as one sensor.
//
// @param hwmon_path  directory to scan
// @param fn_reg      file-name prefix (inserted verbatim into a regex)
// @return the largest index seen, or 0 when nothing matches or the directory
//         cannot be opened
static uint32_t
get_num_sensors(std::string hwmon_path, std::string fn_reg) {
  // Build both patterns once; regex construction is expensive.
  const std::regex name_re("\\b" + fn_reg + "([0-9]+)([^ ]*)");
  const std::regex first_number_re("[^0-9]*([0-9]+).*");
  const bool zero_based = (fn_reg == "in");

  DIR *hwmon_dir = opendir(hwmon_path.c_str());
  if (hwmon_dir == nullptr) {
    // Previously only guarded by assert(); with NDEBUG a null handle would
    // have been handed to readdir().
    return 0;
  }

  // Close the directory on every exit path, including a throwing std::stoi.
  struct DirCloser {
    DIR *dir;
    ~DirCloser() { closedir(dir); }
  } dir_guard{hwmon_dir};

  uint32_t sensor_max = 0;
  for (auto dentry = readdir(hwmon_dir); dentry != nullptr;
       dentry = readdir(hwmon_dir)) {
    const std::string fn = dentry->d_name;
    if (!std::regex_search(fn, name_re)) {
      continue;
    }

    // Reduce the file name to its first run of digits.
    const std::string digits =
        std::regex_replace(fn, first_number_re, std::string("$1"));
    int32_t index = std::stoi(digits);
    if (zero_based) {
      ++index;
    }
    if (static_cast<uint32_t>(index) > sensor_max) {
      sensor_max = static_cast<uint32_t>(index);
    }
  }

  return sensor_max;
}
// Count the sensors of each kind exposed in a device's hwmon directory.
//
// @param device_id     device index; copied into dv_ind for the macro below
// @param sensor_count  out-parameter receiving one count per sensor kind;
//                      must not be null
// @return AMDOAM_STATUS_SUCCESS, -AMDOAM_STATUS_INVALID_ARGS when
//         sensor_count is null, or the code produced by the CATCH macro
int amdoam_get_sensors_count(uint32_t device_id,
oam_sensor_count_t *sensor_count) {
uint32_t dv_ind = device_id;
TRY
if (sensor_count == nullptr)
return -AMDOAM_STATUS_INVALID_ARGS;
// NOTE(review): GET_DEV_FROM_INDX is defined elsewhere; it presumably uses
// dv_ind and introduces the `dev` variable used below -- confirm.
GET_DEV_FROM_INDX
assert(dev->monitor() != nullptr);
std::string hwmon_path = dev->monitor()->path();
// Each count is the highest index found for that file-name prefix.
sensor_count->num_temperature_sensors = get_num_sensors(hwmon_path, "temp");
sensor_count->num_fans = get_num_sensors(hwmon_path, "fan");
sensor_count->num_voltage_sensors = get_num_sensors(hwmon_path, "in");
sensor_count->num_power_sensors = get_num_sensors(hwmon_path, "power");
sensor_count->num_current_sensors = get_num_sensors(hwmon_path, "current");
CATCH
return AMDOAM_STATUS_SUCCESS;
}
// Fill sensor_info[0..num_sensors) with a generated name, the sensor type and
// the current reading for each sensor of the requested type.
//
// @param device_id    device index
// @param type         sensor kind; values >= OAM_SENSOR_TYPE_UNKNOWN are
//                     rejected
// @param num_sensors  number of entries to fill in @p sensor_info
// @param sensor_info  output array; must not be null
// @return AMDOAM_STATUS_SUCCESS, -AMDOAM_STATUS_INVALID_ARGS for a null array
//         or out-of-range type, -AMDOAM_STATUS_NOT_SUPPORTED for a type
//         without a case below, or the translated code of the first failing
//         RSMI query (entries after the failing one are not filled)
int amdoam_get_sensors_info(uint32_t device_id, oam_sensor_type_t type,
uint32_t num_sensors, oam_sensor_info_t sensor_info[]) {
uint32_t dv_ind = device_id;
std::string val_str;
uint32_t i;
rsmi_status_t status;
TRY
if ((sensor_info == nullptr) || (type >= OAM_SENSOR_TYPE_UNKNOWN))
return -AMDOAM_STATUS_INVALID_ARGS;
// NOTE(review): GET_DEV_FROM_INDX is defined elsewhere; it presumably uses
// dv_ind and introduces the `dev` variable used below -- confirm.
GET_DEV_FROM_INDX
assert(dev->monitor() != nullptr);
switch (type) {
case OAM_SENSOR_TYPE_POWER:
for (i = 0; i < num_sensors; i++) {
// Names are 1-based: POWER_SENSOR_1, POWER_SENSOR_2, ...
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"POWER_SENSOR_%u", i+1);
sensor_info[i].sensor_type = type;
// NOTE(review): the cast assumes `value` has the same size as uint64_t --
// confirm against the oam_sensor_info_t declaration.
status = rsmi_dev_power_ave_get(device_id, i,
reinterpret_cast<uint64_t*>(&sensor_info[i].value));
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
case OAM_SENSOR_TYPE_VOLTAGE:
for (i = 0; i < num_sensors; i++) {
// Unlike the other kinds, voltage sensor names are 0-based.
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"VOLTAGE_SENSOR_%u", i);
sensor_info[i].sensor_type = type;
// NOTE(review): the query does not depend on i, so every entry receives
// the RSMI_VOLT_TYPE_VDDGFX reading -- confirm this is intended.
status = rsmi_dev_volt_metric_get(device_id, RSMI_VOLT_TYPE_VDDGFX,
RSMI_VOLT_CURRENT, &sensor_info[i].value);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
case OAM_SENSOR_TYPE_TEMP:
for (i = 0; i < num_sensors; i++) {
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"TEMP_SENSOR_%u", i+1);
sensor_info[i].sensor_type = type;
// The loop index is passed as the RSMI sensor selector.
status = rsmi_dev_temp_metric_get(device_id, i, RSMI_TEMP_CURRENT,
&sensor_info[i].value);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
case OAM_SENSOR_TYPE_FAN_SPEED:
for (i = 0; i < num_sensors; i++) {
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
"FAN_SENSOR_%u", i+1);
sensor_info[i].sensor_type = type;
status = rsmi_dev_fan_speed_get(device_id, i, &sensor_info[i].value);
if (status != RSMI_STATUS_SUCCESS)
return rsmi_status_to_amdoam_errorcode(status);
}
break;
default:
return -AMDOAM_STATUS_NOT_SUPPORTED;
}
CATCH
return AMDOAM_STATUS_SUCCESS;
}
// TODO(x): This function doesn't work for OAM. It's just a version
// of rsmi_dev_ecc_count_get(), which has similar functionality.
// The purpose here is just to drive refactoring; e.g., making macros
// available and previously static functions global.
// Placeholder: report the error count of the GFX block of device index 0.
//
// The device values are expected as two lines, "ue: <n>" and "ce: <n>".
// Both counts are added into count->total_error_count (previously the
// "ce" value overwrote the "ue" value, so "ue" errors were silently dropped).
//
// @param handle  currently ignored
// @param count   out-parameter; validated by CHK_SUPPORT_VAR
// @return an rsmi_status_t value cast to int (RSMI_STATUS_SUCCESS on
//         success). NOTE(review): unlike the amdoam_* functions this returns
//         positive RSMI codes -- confirm the intended convention once the
//         final implementation lands.
int
get_device_error_count(oam_dev_handle_t *handle,
                       oam_dev_error_count_t *count) {
  std::vector<std::string> val_vec;
  rsmi_status_t ret;

  TRY
  // TODO(x): replace with final code...
  // Below, we are just returning errors for RSMI_GPU_BLOCK_GFX as a
  // placeholder
  (void)handle;  // Just ignore for now
  rsmi_gpu_block_t block = RSMI_GPU_BLOCK_GFX;

  // The macro CHK_SUPPORT_VAR assumes the existence of a device index variable
  // "dv_ind". Presumably, the device index will come from the "handle"
  // pointer. Since I don't know how that will be implemented, for now we
  // will just make up a device index:
  uint32_t dv_ind = 0;
  CHK_SUPPORT_VAR(count, block)

  amd::smi::DevInfoTypes type;
  switch (block) {
    case RSMI_GPU_BLOCK_UMC:
      type = amd::smi::kDevErrCntUMC;
      break;

    case RSMI_GPU_BLOCK_SDMA:
      type = amd::smi::kDevErrCntSDMA;
      break;

    case RSMI_GPU_BLOCK_GFX:
      type = amd::smi::kDevErrCntGFX;
      break;

    case RSMI_GPU_BLOCK_MMHUB:
      type = amd::smi::kDevErrCntMMHUB;
      break;

    case RSMI_GPU_BLOCK_PCIE_BIF:
      type = amd::smi::kDevErrCntPCIEBIF;
      break;

    case RSMI_GPU_BLOCK_HDP:
      type = amd::smi::kDevErrCntHDP;
      break;

    case RSMI_GPU_BLOCK_XGMI_WAFL:
      type = amd::smi::kDevErrCntXGMIWAFL;
      break;

    default:
      return RSMI_STATUS_NOT_SUPPORTED;
  }

  DEVICE_MUTEX

  ret = GetDevValueVec(type, dv_ind, &val_vec);

  if (ret == RSMI_STATUS_FILE_ERROR) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }
  if (ret != RSMI_STATUS_SUCCESS) {
    return static_cast<int>(ret);
  }

  assert(val_vec.size() == 2);

  std::string junk;
  // Parse into locals of the field's own type so both counts survive.
  decltype(count->total_error_count) uncorrectable{};
  decltype(count->total_error_count) correctable{};

  std::istringstream fs1(val_vec[0]);
  fs1 >> junk;
  assert(junk == "ue:");
  fs1 >> uncorrectable;

  std::istringstream fs2(val_vec[1]);
  fs2 >> junk;
  assert(junk == "ce:");
  fs2 >> correctable;

  count->total_error_count =
      static_cast<decltype(uncorrectable)>(uncorrectable + correctable);

  return static_cast<int>(ret);
  CATCH
}
+3
Zobrazit soubor
@@ -718,6 +718,9 @@ int main() {
ret = rsmi_dev_id_get(i, &val_ui16);
CHK_RSMI_RET_I(ret)
std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl;
ret = rsmi_dev_revision_get(i, &val_ui16);
CHK_RSMI_RET_I(ret)
std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << std::endl;
char current_compute_partition[256];
current_compute_partition[0] = '\0';
+10
Zobrazit soubor
@@ -36,6 +36,12 @@ struct kfd_ioctl_get_version_args {
__u32 minor_version; /* from KFD */
};
/* Argument block for the AMDKFD_IOC_AVAILABLE_MEMORY ioctl. */
struct kfd_ioctl_get_available_memory_args {
__u64 available; /* from KFD */
__u32 gpu_id; /* to KFD */
__u32 pad;
};
/* For kfd_ioctl_create_queue_args.queue_type. */
#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0
#define KFD_IOC_QUEUE_TYPE_SDMA 0x1
@@ -726,6 +732,10 @@ struct kfd_ioctl_cross_memory_copy_args {
#define AMDKFD_IOC_CROSS_MEMORY_COPY \
AMDKFD_IOWR(0x22, struct kfd_ioctl_cross_memory_copy_args)
#define AMDKFD_IOC_AVAILABLE_MEMORY \
AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
#define AMDKFD_COMMAND_START 0x01
#undef AMDKFD_COMMAND_END
#define AMDKFD_COMMAND_END 0x22
+15
Zobrazit soubor
@@ -1088,6 +1088,21 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices);
*/
rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id);
/**
 * @brief Get the device revision associated with the device
 *
 * @details Given a device index @p dv_ind and a pointer to a uint16_t
 * @p revision, this function will write the device revision to the uint16_t
 * pointed to by @p revision.
 *
 * @param[in] dv_ind a device index
 *
 * @param[inout] revision a pointer to uint16_t to which the device revision
 * will be written
 *
 * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
 *
 */
rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision);
/**
* @brief Get the SKU for a desired device associated with the device with
+9 -1
Zobrazit soubor
@@ -52,12 +52,14 @@
#include <vector>
#include <unordered_set>
#include <map>
#include <type_traits>
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_power_mon.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_counters.h"
#include "rocm_smi/rocm_smi_properties.h"
#include "shared_mutex.h" //NOLINT
namespace amd {
@@ -100,6 +102,7 @@ enum DevInfoTypes {
kDevOverDriveLevel,
kDevMemOverDriveLevel,
kDevDevID,
kDevDevRevID,
kDevDevProdName,
kDevDevProdNum,
kDevVendorID,
@@ -172,6 +175,7 @@ typedef struct {
std::vector<DevInfoTypes> variants;
} dev_depends_t;
class Device {
public:
explicit Device(std::string path, RocmSMI_env_vars const *e);
@@ -212,7 +216,7 @@ class Device {
void set_evt_notif_anon_fd(uint32_t fd) {
evt_notif_anon_fd_ = static_cast<int>(fd);}
int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;}
metrics_table_header_t & gpu_metrics_ver(void) {return gpu_metrics_ver_;}
metrics_table_header_t &gpu_metrics_ver(void) {return gpu_metrics_ver_;}
void fillSupportedFuncs(void);
void DumpSupportedFunctions(void);
bool DeviceAPISupported(std::string name, uint64_t variant,
@@ -220,6 +224,8 @@ class Device {
rsmi_status_t restartAMDGpuDriver(void);
rsmi_status_t storeDevicePartitions(uint32_t dv_ind);
template <typename T> std::string readBootPartitionState(uint32_t dv_ind);
rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type);
private:
std::shared_ptr<Monitor> monitor_;
@@ -240,6 +246,7 @@ class Device {
int readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
void *p_binary_data);
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query);
uint64_t bdfid_;
uint64_t kfd_gpu_id_;
std::unordered_set<rsmi_event_group_t,
@@ -252,6 +259,7 @@ class Device {
struct metrics_table_header_t gpu_metrics_ver_;
};
} // namespace smi
} // namespace amd
+4
Zobrazit soubor
@@ -80,6 +80,10 @@ class KFDNode {
uint32_t amdgpu_dev_index(void) const {return amdgpu_dev_index_;}
void set_amdgpu_dev_index(uint32_t val) {amdgpu_dev_index_ = val;}
// Get memory from kfd
int get_total_memory(uint64_t* total);
int get_used_memory(uint64_t* used);
private:
uint32_t node_indx_;
uint32_t amdgpu_dev_index_;
+1
Zobrazit soubor
@@ -100,6 +100,7 @@ typedef enum LOG_TYPE {
NO_LOG = 1,
CONSOLE = 2,
FILE_LOG = 3,
BOTH_FILE_AND_CONSOLE = 4
} LogType;
class Logger {
+1
Zobrazit soubor
@@ -115,6 +115,7 @@ class RocmSMI {
const RocmSMI_env_vars& getEnv(void);
void printEnvVarInfo(void);
bool isLoggingOn(void);
uint32_t getLogSetting(void);
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
private:
+38
Zobrazit soubor
@@ -94,6 +94,44 @@ enum MonitorTypes {
kMonInvalid = 0xFFFFFFFF,
};
// Maps every MonitorTypes enumerator to its fully-qualified name, for use in
// log/debug output. Each value spells the enumerator of its own key
// (previously every entry carried the copy-pasted text "amd::smi::kMonName").
const std::map<MonitorTypes, std::string> monitorTypesToString {
  {MonitorTypes::kMonName, "amd::smi::kMonName"},
  {MonitorTypes::kMonTemp, "amd::smi::kMonTemp"},
  {MonitorTypes::kMonFanSpeed, "amd::smi::kMonFanSpeed"},
  {MonitorTypes::kMonMaxFanSpeed, "amd::smi::kMonMaxFanSpeed"},
  {MonitorTypes::kMonFanRPMs, "amd::smi::kMonFanRPMs"},
  {MonitorTypes::kMonFanCntrlEnable, "amd::smi::kMonFanCntrlEnable"},
  {MonitorTypes::kMonPowerCap, "amd::smi::kMonPowerCap"},
  {MonitorTypes::kMonPowerCapDefault, "amd::smi::kMonPowerCapDefault"},
  {MonitorTypes::kMonPowerCapMax, "amd::smi::kMonPowerCapMax"},
  {MonitorTypes::kMonPowerCapMin, "amd::smi::kMonPowerCapMin"},
  {MonitorTypes::kMonPowerAve, "amd::smi::kMonPowerAve"},
  {MonitorTypes::kMonTempMax, "amd::smi::kMonTempMax"},
  {MonitorTypes::kMonTempMin, "amd::smi::kMonTempMin"},
  {MonitorTypes::kMonTempMaxHyst, "amd::smi::kMonTempMaxHyst"},
  {MonitorTypes::kMonTempMinHyst, "amd::smi::kMonTempMinHyst"},
  {MonitorTypes::kMonTempCritical, "amd::smi::kMonTempCritical"},
  {MonitorTypes::kMonTempCriticalHyst, "amd::smi::kMonTempCriticalHyst"},
  {MonitorTypes::kMonTempEmergency, "amd::smi::kMonTempEmergency"},
  {MonitorTypes::kMonTempEmergencyHyst, "amd::smi::kMonTempEmergencyHyst"},
  {MonitorTypes::kMonTempCritMin, "amd::smi::kMonTempCritMin"},
  {MonitorTypes::kMonTempCritMinHyst, "amd::smi::kMonTempCritMinHyst"},
  {MonitorTypes::kMonTempOffset, "amd::smi::kMonTempOffset"},
  {MonitorTypes::kMonTempLowest, "amd::smi::kMonTempLowest"},
  {MonitorTypes::kMonTempHighest, "amd::smi::kMonTempHighest"},
  {MonitorTypes::kMonTempLabel, "amd::smi::kMonTempLabel"},
  {MonitorTypes::kMonVolt, "amd::smi::kMonVolt"},
  {MonitorTypes::kMonVoltMax, "amd::smi::kMonVoltMax"},
  {MonitorTypes::kMonVoltMinCrit, "amd::smi::kMonVoltMinCrit"},
  {MonitorTypes::kMonVoltMin, "amd::smi::kMonVoltMin"},
  {MonitorTypes::kMonVoltMaxCrit, "amd::smi::kMonVoltMaxCrit"},
  {MonitorTypes::kMonVoltAverage, "amd::smi::kMonVoltAverage"},
  {MonitorTypes::kMonVoltLowest, "amd::smi::kMonVoltLowest"},
  {MonitorTypes::kMonVoltHighest, "amd::smi::kMonVoltHighest"},
  {MonitorTypes::kMonVoltLabel, "amd::smi::kMonVoltLabel"},
  {MonitorTypes::kMonInvalid, "amd::smi::kMonInvalid"},
};
class Monitor {
public:
+160
Zobrazit soubor
@@ -0,0 +1,160 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
#include <cstdint>
#include <map>
#include <string>
#include <type_traits>
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace smi {
//
// Property reinforcement check list
//
using AMDGpuPropertyId_t = uint32_t;
using AMDGpuDevIdx_t = uint32_t;
using AMDGpuVerbId_t = uint32_t;
using AMDGpuAsicId_t = uint16_t;
using AMDGpuAsicRevId_t = uint16_t;
using AMDGpuOpModeType_t = uint8_t;
// Identifiers for the API operations ("verbs") that a property
// reinforcement query can refer to. Enumerators are numbered consecutively
// starting at kNone = 0; the underlying type is AMDGpuVerbId_t.
enum class AMDGpuVerbTypes_t : AMDGpuVerbId_t
{
kNone = 0,
kSetGpuPciBandwidth,
kSetPowerCap,
kSetGpuPowerProfile,
kSetGpuClkRange,
kSetGpuOdClkInfo,
kSetGpuOdVoltInfo,
kSetGpuPerfLevelV1,
kSetGpuPerfLevel,
kGetGpuPowerProfilePresets,
kResetGpu,
kSetGpuPerfDeterminismMode,
kSetGpuFanSpeed,
kResetGpuFan,
kSetClkFreq,
kSetGpuOverdriveLevelV1,
kSetGpuOverdriveLevel,
kGetGpuFanRpms,
kGetGpuFanSpeed,
kGetGpuFanSpeedMax,
kGetGpuVoltMetric,
kGetGpuOverDriveLevel,
kGetGpuOdVoltInfo,
kGetGpuOdVoltCurveRegions,
};
using AMDGpuVerbList_t = std::map<AMDGpuVerbTypes_t, std::string>;
// Offsets that tag a property id with the family of enum it came from.
// Every non-zero enumerator is a distinct single bit at or above 0x1000, so
// the values can be combined and tested with the operator| / operator&
// overloads declared for this type.
enum class AMDGpuPropertyTypesOffset_t : AMDGpuPropertyId_t
{
  kNone            = 0x00000,
  kDevInfoTypes    = 0x01000,
  kMonitorTypes    = 0x02000,
  kPerfTypes       = 0x04000,
  kClkTypes        = 0x08000,
  kVoltMetricTypes = 0x10000,
};
using AMDGpuPropertyOffsetType = std::underlying_type<AMDGpuPropertyTypesOffset_t>::type;
using AMDGpuPropertyTypesOffsetList_t = std::map<AMDGpuPropertyTypesOffset_t, std::string>;
AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs);
AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs);
// Operating-mode flags attached to a property entry. Each enumerator is its
// own bit; note that kBoth is a separate bit (0x4), not the union of
// kBareMetal and kSrIov.
enum class AMDGpuPropertyOpModeTypes_t : AMDGpuOpModeType_t
{
  kBareMetal = 0x1,
  kSrIov     = 0x2,
  kBoth      = 0x4,
};
using AMDGpuPropertyOpModeType = std::underlying_type<AMDGpuPropertyOpModeTypes_t>::type;
using AMDGpuOpModeList_t = std::map<AMDGpuPropertyOpModeTypes_t, std::string>;
AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs);
AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs);
// One entry of the property reinforcement list. Entries are stored in
// AMDGpuPropertyList_t, a multimap keyed by AMDGpuAsicId_t, so the ASIC id
// itself is the map key rather than a field here.
struct AMDGpuProperties_t
{
AMDGpuAsicRevId_t m_pci_rev_id;
// NOTE(review): presumably a value produced by make_unique_property_id() -- confirm.
AMDGpuPropertyId_t m_property;
AMDGpuVerbTypes_t m_verb_id;
AMDGpuPropertyOpModeTypes_t m_opmode;
bool m_should_be_available;
};
using AMDGpuPropertyList_t = std::multimap<AMDGpuAsicId_t, AMDGpuProperties_t>;
// Describes a single lookup against the property reinforcement list: the
// ASIC id, revision id, device index, property id and verb being queried.
struct AMDGpuPropertyQuery_t
{
AMDGpuAsicId_t m_asic_id;
AMDGpuAsicRevId_t m_pci_rev_id;
AMDGpuDevIdx_t m_dev_idx;
AMDGpuPropertyId_t m_property;
AMDGpuVerbTypes_t m_verb_id;
};
//
AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id);
AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id);
rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind,
AMDGpuVerbTypes_t dev_info_type,
rsmi_status_t actual_error_code);
void dump_amdgpu_property_reinforcement_list();
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
+51 -1
Zobrazit soubor
@@ -48,6 +48,9 @@
#include <string>
#include <cstdint>
#include <vector>
#include <sstream>
#include <iomanip>
#include <type_traits>
#include "rocm_smi/rocm_smi_device.h"
@@ -84,6 +87,8 @@ std::tuple<bool, std::string> readTmpFile(
std::string stateName,
std::string parameterName);
void displayAppTmpFilesContent(void);
std::string debugVectorContent(std::vector<std::string> v);
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v);
rsmi_status_t handleException();
rsmi_status_t
GetDevValueVec(amd::smi::DevInfoTypes type,
@@ -94,8 +99,53 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type,
rsmi_status_t ErrnoToRsmiStatus(int err);
std::string getRSMIStatusString(rsmi_status_t ret);
std::tuple<bool, std::string, std::string, std::string, std::string,
std::string, std::string, std::string> getSystemDetails(void);
std::string, std::string, std::string, std::string>
getSystemDetails(void);
void logSystemDetails(void);
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
void logHexDump(const char *desc, const void *addr, const size_t len,
size_t perLine);
bool isSystemBigEndian();
// Format an integer as zero-padded hexadecimal, using exactly two hex digits
// per byte of T (e.g. uint16_t 0x1a -> "0x001a").
//
// Negative values are shown as the two's-complement bit pattern of T itself;
// previously int16_t/int32_t negatives were sign-extended to 16 digits.
//
// @param i                value to format (integral types only)
// @param showHexNotation  when true, prefix the result with "0x"
// @return the formatted string
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation=true) {
  // Number of high bits of the 64-bit widened value that lie outside T.
  const unsigned int unused_bits =
      sizeof(T) < sizeof(unsigned long long)
          ? static_cast<unsigned int>(
                (sizeof(unsigned long long) - sizeof(T)) * 8)
          : 0U;
  const unsigned long long value_mask = ~0ULL >> unused_bits;

  std::stringstream ss;
  if (showHexNotation) {
    ss << "0x";
  }
  // Widening to unsigned long long also keeps 8-bit types from being
  // streamed as characters.
  ss << std::setfill('0') << std::setw(static_cast<int>(sizeof(T) * 2))
     << std::hex
     << (static_cast<unsigned long long>(i | 0) & value_mask)
     << std::dec;
  return ss.str();
}
// Render an integer in decimal after converting it to unsigned long long.
// Negative inputs therefore appear as their 64-bit wrapped value.
template <typename T>
std::string print_unsigned_int(T i) {
  const auto widened = static_cast<unsigned long long int>(i | 0);
  std::stringstream out;
  out << widened;
  return out.str();
}
// Build a one-line summary of a value: its hex form, its unsigned decimal
// form and the byte size of T. When a heading is supplied the line is
// prefixed with a newline and "<heading> = ".
template <typename T>
std::string print_unsigned_hex_and_int(T i, std::string heading="") {
  const std::string hex_text = print_int_as_hex(i);
  const std::string dec_text = print_unsigned_int(i);

  std::stringstream ss;
  if (!heading.empty()) {
    ss << "\n" << heading << " = ";
  }
  ss << "Hex (MSB): " << hex_text << ", ";
  ss << "Unsigned int: " << dec_text << ", ";
  ss << "Byte Size: " << sizeof(T);
  return ss.str();
}
struct pthread_wrap {
public:
+104 -32
Zobrazit soubor
@@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface '
footerString = ' End of ROCm SMI Log '
# Output formatting
appWidth = 84
appWidth = 100
deviceList = []
# Enable or disable serialized format
@@ -112,19 +112,10 @@ def formatCsv(deviceList):
if outputType == 'system':
jsonobj = json.loads(jsondata)
keylist = header
for record in jsonobj:
my_string += str(record)
for key in keylist:
if key == 'system':
tempstr = str(jsonobj[record])
tempstr = tempstr[tempstr.find('\'')+1:]
tempstr = tempstr[:tempstr.find('\'')]
# Force output device type to 'system'
my_string += ',%s\nsystem,%s' % (tempstr, jsonobj[record][tempstr])
my_string += '\n'
# Force output device type to 'system'
if my_string.startswith('system'):
my_string = 'device' + my_string[6:]
for record in jsonobj['system']:
my_string += "\"%s\", \"%s\"\n" % (record, jsonobj['system'][record])
# add header
my_string = "name, value\n" + my_string
return my_string
headerkeys = []
# Separate device-specific information from system-level information
@@ -249,6 +240,17 @@ def getId(device):
return hex(dv_id.value)
def getRev(device):
    """ Return the hexadecimal value of a device's Revision

    @param device: DRM device identifier
    @return: hex string of the revision, or None when the query fails
    """
    dv_rev = c_short()
    ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev))
    if rsmi_ret_ok(ret, device, 'get_device_rev'):
        # The C API writes a uint16_t; mask so that values with the top bit
        # set are not rendered as a negative hex string by the signed c_short.
        return hex(dv_rev.value & 0xFFFF)
def getMaxPower(device):
""" Return the maximum power cap of a given device
@@ -391,6 +393,25 @@ def getTemp(device, sensor):
return temp.value / 1000
return 'N/A'
def findFirstAvailableTemp(device):
    """ Discovers the first available device temperature to display

    Walks temp_type_lst in order and stops at the first sensor whose current
    temperature can be read.

    @param device: DRM device identifier
    @return: tuple (temp_type, temp_value); temp_type is the capitalized
             sensor name in parentheses and temp_value the reading divided by
             1000, or ("(Unknown)", "N/A") when no sensor responds
    """
    temp = c_int64(0)
    metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
    for sensor_idx, sensor_name in enumerate(temp_type_lst):
        ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), sensor_idx, metric, byref(temp))
        if not rsmi_ret_ok(ret, device, 'get_temp_metric_' + sensor_name, silent=True):
            continue
        return ('(' + sensor_name.capitalize() + ')', temp.value / 1000)
    return ("(Unknown)", "N/A")
def getVbiosVersion(device):
""" Returns the VBIOS version for a given device
@@ -399,7 +420,9 @@ def getVbiosVersion(device):
"""
vbios = create_string_buffer(256)
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
if rsmi_ret_ok(ret, device):
if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
return "Unsupported"
elif rsmi_ret_ok(ret, device):
return vbios.value.decode()
@@ -425,7 +448,7 @@ def getComputePartition(device):
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
return str(currentComputePartition.value.decode())
return "UNKNOWN"
return "N/A"
def getMemoryPartition(device):
@@ -437,7 +460,7 @@ def getMemoryPartition(device):
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
return str(currentNPSMode.value.decode())
return "UNKNOWN"
return "N/A"
def print2DArray(dataArray):
@@ -537,16 +560,23 @@ def printEventList(device, delay, eventList):
data = rsmi_evt_notification_data_t(1)
rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data))
if len(data.message) > 0:
print2DArray([['\rGPU[%d]:\t' % (device), ctime().split()[3], notification_type_names[data.event.value - 1],
print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1],
data.message.decode('utf8') + '\r']])
def printLog(device, metricName, value=None, extraSpace=False):
def printLog(device, metricName, value=None, extraSpace=False, useItalics=False):
""" Print out to the SMI log
@param device: DRM device identifier
@param metricName: Title of the item to print to the log
@param value: The item's value to print to the log
"""
red = '\033[91m'
green = '\033[92m'
blue = '\033[94m'
bold = '\033[1m'
italics = '\033[3m'
underline = '\033[4m'
end = '\033[0m'
global PRINT_JSON
if PRINT_JSON:
if value is not None and device is not None:
@@ -563,6 +593,8 @@ def printLog(device, metricName, value=None, extraSpace=False):
# Force thread safe printing
lock = multiprocessing.Lock()
lock.acquire()
if useItalics:
logstr = italics + logstr + end
if extraSpace:
print('\n' + logstr + '\n', end='', flush=True)
else:
@@ -1353,7 +1385,7 @@ def setPowerOverDrive(deviceList, value, autoRespond):
RETCODE = 1
continue
if new_power_cap.value == current_power_cap.value:
printErrLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000))
printLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000))
if current_power_cap.value < default_power_cap.value:
current_power_cap.value = default_power_cap.value
@@ -1540,18 +1572,39 @@ def showAllConcise(deviceList):
print('ERROR: Cannot print JSON/CSV output for concise output')
sys.exit(1)
printLogSpacer(' Concise Info ')
header = ['GPU', 'Temp (DieEdge)', 'AvgPwr', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
deviceList.sort()
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
available_temp_type = temp_type.lower()
available_temp_type = available_temp_type.replace('(', '')
available_temp_type = available_temp_type.replace(')', '')
header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', '']
# add additional spaces to match header
for idx, item in enumerate(subheader):
header_size = len(header[idx])
subheader_size = len(subheader[idx])
if header_size != subheader_size:
numSpacesToFill_subheader = header_size - subheader_size
numSpacesToFill_header = subheader_size - header_size
#take pos spaces to mean, we need to match size of the other
if numSpacesToFill_subheader > 0:
subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader)
if numSpacesToFill_header > 0:
header[idx] = header[idx] + (' ' * numSpacesToFill_header)
head_widths = [len(head) + 2 for head in header]
values = {}
degree_sign = u'\N{DEGREE SIGN}'
for device in deviceList:
temp = str(getTemp(device, 'edge'))
if temp != 'N/A':
temp += 'c'
temp_val = str(getTemp(device, available_temp_type))
if temp_val != 'N/A':
temp_val += degree_sign + 'C'
avgPwr = str(getPower(device))
if avgPwr != '0.0' and avgPwr != 'N/A':
avgPwr += 'W'
else:
avgPwr = 'N/A'
combined_partition = (getMemoryPartition(device) + ", "
+ getComputePartition(device))
concise = True
sclk = showCurrentClocks([device], 'sclk', concise)
mclk = showCurrentClocks([device], 'mclk', concise)
@@ -1575,7 +1628,9 @@ def showAllConcise(deviceList):
mem_use_pct='Unsupported'
if vram_used != None and vram_total != None and float(vram_total) != 0:
mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total)))
values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap,
values['card%s' % (str(device))] = [device, temp_val, avgPwr,
combined_partition, sclk, mclk,
fan, str(perf).lower(), pwrCap,
mem_use_pct, gpu_busy]
val_widths = {}
for device in deviceList:
@@ -1585,6 +1640,9 @@ def showAllConcise(deviceList):
for col in range(len(val_widths[device])):
max_widths[col] = max(max_widths[col], val_widths[device][col])
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)), None)
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)),
None, useItalics=True)
printLogSpacer(fill='=')
for device in deviceList:
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
zip(range(len(max_widths)), values['card%s' % (str(device))])), None)
@@ -1601,19 +1659,23 @@ def showAllConciseHw(deviceList):
print('ERROR: Cannot print JSON/CSV output for concise hardware output')
sys.exit(1)
printLogSpacer(' Concise Hardware Info ')
header = ['GPU', 'DID', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS']
header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS']
head_widths = [len(head) + 2 for head in header]
values = {}
for device in deviceList:
gpuid = getId(device)
if str(gpuid).startswith('0x'):
gpuid = str(gpuid)[2:]
gpurev = getRev(device)
if str(gpurev).startswith('0x'):
gpurev = str(gpurev)[2:]
gfxRas = getRasEnablement(device, 'GFX')
sdmaRas = getRasEnablement(device, 'SDMA')
umcRas = getRasEnablement(device, 'UMC')
vbios = getVbiosVersion(device)
bus = getBus(device)
values['card%s' % (str(device))] = [device, gpuid, gfxRas, sdmaRas, umcRas, vbios, bus]
values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus]
val_widths = {}
for device in deviceList:
val_widths[device] = [len(str(val)) + 2 for val in values['card%s' % (str(device))]]
@@ -1952,6 +2014,7 @@ def showId(deviceList):
printLogSpacer(' ID ')
for device in deviceList:
printLog(device, 'GPU ID', getId(device))
printLog(device, 'GPU Rev', getRev(device))
printLogSpacer()
@@ -2272,8 +2335,12 @@ def showProductName(deviceList):
# if rsmi_ret_ok(ret, device) and sku.value.decode():
# device_sku = sku.value.decode()
# Retrieve the device SKU as a substring from VBIOS
device_sku = ""
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
if rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode():
if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
device_sku = "Unsupported"
printLog(device, 'Card SKU', '\t\t' + device_sku)
elif rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode():
# Device SKU is just the characters in between the two '-' in vbios_version
if vbios.value.decode().count('-') == 2 and len(str(vbios.value.decode().split('-')[1])) > 1:
device_sku = vbios.value.decode().split('-')[1]
@@ -2535,7 +2602,7 @@ def showEvents(deviceList, eventTypes):
break
def printTempGraph(deviceList, delay):
def printTempGraph(deviceList, delay, temp_type):
# deviceList must be in ascending order
deviceList.sort()
devices = 0
@@ -2549,7 +2616,7 @@ def printTempGraph(deviceList, delay):
terminalWidth = os.get_terminal_size()[0]
printStrings = list()
for device in deviceList:
temp = getTemp(device, 'edge')
temp = getTemp(device, temp_type)
if temp == 'N/A':
percentage = 0
else:
@@ -2622,11 +2689,16 @@ def getGraphColor(percentage):
def showTempGraph(deviceList):
printLogSpacer(' Temperature Graph ')
deviceList.sort()
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
temp_type = temp_type.lower()
temp_type = temp_type.replace('(', '')
temp_type = temp_type.replace(')', '')
# Start a thread for constantly printing
try:
# Create a thread (call print function, devices, delay in ms)
_thread.start_new_thread(printTempGraph, (deviceList, 150))
_thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type))
except Exception as e:
printErrLog(device, 'Unable to start new thread. %s' % (e))
# Catch user input for program termination
+12 -4
Zobrazit soubor
@@ -11,8 +11,16 @@ import os
# Use ROCm installation path if running from standard installation
# With File Reorg rsmiBindings.py will be installed in /opt/rocm/libexec/rocm_smi.
# relative path changed accordingly
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
# relative path changed accordingly.
# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location
#
path_librocm = str()
rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
if (rocm_smi_lib_path != None):
path_librocm = rocm_smi_lib_path
else:
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
if not os.path.isfile(path_librocm):
print('Unable to find %s . Trying /opt/rocm*' % path_librocm)
for root, dirs, files in os.walk('/opt', followlinks=True):
@@ -22,9 +30,10 @@ if not os.path.isfile(path_librocm):
print('Using lib from %s' % path_librocm)
else:
print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')
else:
print('Library loaded from: %s ' % path_librocm)
# ----------> TODO: Support static libs as well as SO
try:
cdll.LoadLibrary(path_librocm)
rocmsmi = CDLL(path_librocm)
@@ -36,7 +45,6 @@ except OSError:
.format('\33[33m', '\033[0m'))
exit()
# Device ID
dv_id = c_uint64()
# GPU ID
+108 -8
Zobrazit soubor
@@ -78,6 +78,7 @@
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
using namespace amd::smi;
static const uint32_t kMaxOverdriveLevel = 20;
static const float kEnergyCounterResolution = 15.3f;
@@ -632,7 +633,7 @@ rsmi_status_t
rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
rsmi_error_count_t *ec) {
std::vector<std::string> val_vec;
rsmi_status_t ret;
rsmi_status_t ret(RSMI_STATUS_NOT_SUPPORTED);
std::ostringstream ss;
TRY
@@ -673,8 +674,8 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
default:
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", default case -> reporting RSMI_STATUS_NOT_SUPPORTED"
<< amd::smi::getRSMIStatusString(ret);
<< ", default case -> reporting "
<< amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED);
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
@@ -682,6 +683,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
DEVICE_MUTEX
ret = GetDevValueVec(type, dv_ind, &val_vec);
if (val_vec.size() != 2 ) ret = RSMI_STATUS_FILE_ERROR;
if (ret == RSMI_STATUS_FILE_ERROR || val_vec.size() != 2) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
@@ -698,8 +700,6 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
return ret;
}
assert(val_vec.size() == 2);
std::string junk;
std::istringstream fs1(val_vec[0]);
@@ -820,6 +820,21 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
return ret;
}
// Reads the revision identifier of the device at index dv_ind into *revision.
//
// @param dv_ind    index of the device to query
// @param revision  output location handed to get_id()
// @return the status produced by get_id() for amd::smi::kDevDevRevID
//         (the CHK_SUPPORT_NAME_ONLY macro below may presumably return
//         earlier with its own status -- its definition is not visible here;
//         TODO confirm)
rsmi_status_t
rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) {
std::ostringstream outss;
rsmi_status_t ret;
// Trace entry into the API call.
outss << __PRETTY_FUNCTION__ << "| ======= start =======";
LOG_TRACE(outss);
// NOTE(review): macro defined elsewhere; looks like the usual support /
// argument check performed by the sibling rsmi_dev_*_get calls -- confirm.
CHK_SUPPORT_NAME_ONLY(revision)
// Same helper path as the other ID getters, selecting the revision attribute.
ret = get_id(dv_ind, amd::smi::kDevDevRevID, revision);
// Trace exit together with the human-readable status string.
outss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
LOG_TRACE(outss);
return ret;
}
rsmi_status_t
rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) {
TRY
@@ -2503,7 +2518,16 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
}
if (temperature == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: temperature was a null ptr reference"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_INVALID_ARGS;
}
// The HBM temperature is retrieved from the gpu_metrics
@@ -2512,12 +2536,32 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|| sensor_type == RSMI_TEMP_TYPE_HBM_2
|| sensor_type == RSMI_TEMP_TYPE_HBM_3) {
if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: To retreive HBM temp, we only support metric = "
<< "RSMI_TEMP_CURRENT"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
rsmi_gpu_metrics_t gpu_metrics;
ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: rsmi_dev_gpu_metrics_info_get returned "
<< getRSMIStatusString(ret)
<< " | Returning = "
<< getRSMIStatusString(ret) << " |";
LOG_ERROR(ss);
return ret;
}
@@ -2537,11 +2581,28 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
default:
return RSMI_STATUS_INVALID_ARGS;
}
if (val_ui16 == UINT16_MAX)
if (val_ui16 == UINT16_MAX) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: Reached UINT16 max value, overflow"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
else
} else
*temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE;
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Data: " << *temperature
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | ";
LOG_INFO(ss);
return RSMI_STATUS_SUCCESS;
} // end HBM temperature
@@ -2550,6 +2611,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
GET_DEV_FROM_INDX
if (dev->monitor() == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Cause: monitor returned nullptr"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
std::shared_ptr<amd::smi::Monitor> m = dev->monitor();
@@ -2563,6 +2633,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index)
ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature);
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
<< " | Success "
<< " | Device #: " << dv_ind
<< " | Sensor_index: " << sensor_index
<< " | Type: " << monitorTypesToString.at(mon_type)
<< " | Data: " << *temperature
<< " | Returning = "
<< getRSMIStatusString(ret) << " | ";
LOG_INFO(ss);
return ret;
CATCH
@@ -2995,6 +3074,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t reserved,
DEVICE_MUTEX
rsmi_status_t ret = get_power_profiles(dv_ind, status, nullptr);
return ret;
CATCH
}
@@ -3015,6 +3095,7 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy,
return RSMI_STATUS_NOT_SUPPORTED;
}
rsmi_status_t ret = set_power_profile(dv_ind, profile);
return ret;
CATCH
}
@@ -3052,6 +3133,14 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
DEVICE_MUTEX
ret = get_dev_value_int(mem_type_file, dv_ind, total);
// Fallback to KFD reported memory if VRAM total is 0
if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) {
GET_DEV_AND_KFDNODE_FROM_INDX
if (kfd_node->get_total_memory(total) == 0 && *total > 0) {
return RSMI_STATUS_SUCCESS;
}
}
return ret;
CATCH
}
@@ -3088,6 +3177,17 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
DEVICE_MUTEX
ret = get_dev_value_int(mem_type_file, dv_ind, used);
// Fallback to KFD reported memory if no VRAM
if (mem_type == RSMI_MEM_TYPE_VRAM && *used == 0) {
GET_DEV_AND_KFDNODE_FROM_INDX
uint64_t total = 0;
ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total);
if (total != 0) return ret; // do not need to fallback
if ( kfd_node->get_used_memory(used) == 0 ) {
return RSMI_STATUS_SUCCESS;
}
}
return ret;
CATCH
}
+15 -3
Zobrazit soubor
@@ -59,6 +59,7 @@
#include <algorithm>
#include <iterator>
#include <cstring>
#include <type_traits>
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
@@ -85,6 +86,7 @@ static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
static const char *kDevDevProdNameFName = "product_name";
static const char *kDevDevProdNumFName = "product_number";
static const char *kDevDevIDFName = "device";
static const char *kDevDevRevIDFName = "revision";
static const char *kDevVendorIDFName = "vendor";
static const char *kDevSubSysDevIDFName = "subsystem_device";
static const char *kDevSubSysVendorIDFName = "subsystem_vendor";
@@ -238,6 +240,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevDevProdName, kDevDevProdNameFName},
{kDevDevProdNum, kDevDevProdNumFName},
{kDevDevID, kDevDevIDFName},
{kDevDevRevID, kDevDevRevIDFName},
{kDevVendorID, kDevVendorIDFName},
{kDevSubSysDevID, kDevSubSysDevIDFName},
{kDevSubSysVendorID, kDevSubSysVendorIDFName},
@@ -374,12 +377,13 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
// Functions with only mandatory dependencies
{"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}},
{"rsmi_dev_id_get", {{kDevDevIDFName}, {}}},
{"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}},
{"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}},
{"rsmi_dev_name_get", {{kDevVendorIDFName,
kDevDevIDFName}, {}}},
{"rsmi_dev_sku_get", {{kDevDevProdNumFName}, {}}},
{"rsmi_dev_brand_get", {{kDevVendorIDFName}, {}}},
{"rsmi_dev_brand_get", {{kDevVendorIDFName,
kDevVBiosVerFName}, {}}},
{"rsmi_dev_vendor_name_get", {{kDevVendorIDFName}, {}}},
{"rsmi_dev_serial_number_get", {{kDevSerialNumberFName}, {}}},
{"rsmi_dev_subsystem_id_get", {{kDevSubSysDevIDFName}, {}}},
@@ -823,7 +827,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
}
ss << "Successfully read DevInfoBinary for DevInfoType ("
<< RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS ("
<< sysfs_path << "), returning binaryData = " << p_binary_data;
<< sysfs_path << "), returning binaryData = " << p_binary_data
<< "; byte_size = " << std::dec << static_cast<int>(b_size);
std::string metricDescription = "AMD SMI GPU METRICS (16-byte width), "
+ sysfs_path;
logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16);
LOG_INFO(ss);
return 0;
}
@@ -888,6 +897,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
switch (type) {
case kDevDevID:
case kDevDevRevID:
case kDevSubSysDevID:
case kDevSubSysVendorID:
case kDevVendorID:
@@ -1025,6 +1035,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
case kDevDevProdName:
case kDevDevProdNum:
case kDevDevID:
case kDevDevRevID:
case kDevSubSysDevID:
case kDevSubSysVendorID:
case kDevVendorID:
@@ -1375,6 +1386,7 @@ std::string Device::readBootPartitionState<rsmi_nps_mode_type_t>(
return boot_state;
}
#undef RET_IF_NONZERO
} // namespace smi
} // namespace amd
+218
Zobrazit soubor
@@ -61,6 +61,10 @@
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_logger.h"
using namespace ROCmLogging;
using namespace amd::smi;
#define TRY try {
#define CATCH } catch (...) {return amd::smi::handleException();}
@@ -140,6 +144,196 @@ typedef struct {
} rsmi_gpu_metrics_v_1_3;
// Dumps the gpu_metrics content that was just read to the debug log.
//
// Every pointer argument is optional (may be nullptr):
//  - gpu_metrics_table_header: when present, its three fields are logged
//    on their own via LOG_DEBUG.
//  - rsmi_gpu_metrics: when absent, nothing further is logged (the version
//    specific tables below are then skipped as well).
//  - rsmi_gpu_metrics_v_1_2 / gpu_metrics_v_1_3: when present, their extra
//    fields are appended after the common metrics.
// Does nothing at all when logging is turned off.
void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
                     const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2,
                     const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3,
                     const rsmi_gpu_metrics_t *rsmi_gpu_metrics) {
  if (!RocmSMI::getInstance().isLoggingOn()) {
    return;
  }

  std::ostringstream ss;

  /* Stand-alone table header (logged separately) */
  if (gpu_metrics_table_header != nullptr) {
    ss << print_unsigned_hex_and_int(
              gpu_metrics_table_header->structure_size,
              "gpu_metrics_table_header->structure_size");
    ss << print_unsigned_hex_and_int(
              gpu_metrics_table_header->format_revision,
              "gpu_metrics_table_header->format_revision");
    ss << print_unsigned_hex_and_int(
              gpu_metrics_table_header->content_revision,
              "gpu_metrics_table_header->content_revision");
    LOG_DEBUG(ss);
  }

  // Without the common metrics table there is nothing more to report.
  if (rsmi_gpu_metrics == nullptr) {
    return;
  }

  /* Common Header */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->common_header.structure_size,
            "rsmi_gpu_metrics->common_header.structure_size")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->common_header.format_revision,
            "rsmi_gpu_metrics->common_header.format_revision")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->common_header.content_revision,
            "rsmi_gpu_metrics->common_header.content_revision");

  /* Temperature */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->temperature_edge,
            "rsmi_gpu_metrics->temperature_edge")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->temperature_hotspot,
            "rsmi_gpu_metrics->temperature_hotspot")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->temperature_mem,
            "rsmi_gpu_metrics->temperature_mem")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->temperature_vrgfx,
            "rsmi_gpu_metrics->temperature_vrgfx")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->temperature_vrsoc,
            "rsmi_gpu_metrics->temperature_vrsoc")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->temperature_vrmem,
            "rsmi_gpu_metrics->temperature_vrmem");

  /* Utilization */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_gfx_activity,
            "rsmi_gpu_metrics->average_gfx_activity")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_umc_activity,
            "rsmi_gpu_metrics->average_umc_activity")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_mm_activity,
            "rsmi_gpu_metrics->average_mm_activity");

  /* Power/Energy */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_socket_power,
            "rsmi_gpu_metrics->average_socket_power")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->energy_accumulator,
            "rsmi_gpu_metrics->energy_accumulator");

  /* Driver attached timestamp (in ns) */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->system_clock_counter,
            "rsmi_gpu_metrics->system_clock_counter");

  /* Average clocks */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_gfxclk_frequency,
            "rsmi_gpu_metrics->average_gfxclk_frequency")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_socclk_frequency,
            "rsmi_gpu_metrics->average_socclk_frequency")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_uclk_frequency,
            "rsmi_gpu_metrics->average_uclk_frequency")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_vclk0_frequency,
            "rsmi_gpu_metrics->average_vclk0_frequency")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_dclk0_frequency,
            "rsmi_gpu_metrics->average_dclk0_frequency")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_vclk1_frequency,
            "rsmi_gpu_metrics->average_vclk1_frequency")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->average_dclk1_frequency,
            "rsmi_gpu_metrics->average_dclk1_frequency");

  /* Current clocks */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_gfxclk,
            "rsmi_gpu_metrics->current_gfxclk")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_socclk,
            "rsmi_gpu_metrics->current_socclk")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_uclk,
            "rsmi_gpu_metrics->current_uclk")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_vclk0,
            "rsmi_gpu_metrics->current_vclk0")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_dclk0,
            "rsmi_gpu_metrics->current_dclk0")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_vclk1,
            "rsmi_gpu_metrics->current_vclk1")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_dclk1,
            "rsmi_gpu_metrics->current_dclk1");

  /* Throttle status */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->throttle_status,
            "rsmi_gpu_metrics->throttle_status");

  /* Fans */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->current_fan_speed,
            "rsmi_gpu_metrics->current_fan_speed");

  /* Link width/speed */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->pcie_link_width,
            "rsmi_gpu_metrics->pcie_link_width")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->pcie_link_speed,
            "rsmi_gpu_metrics->pcie_link_speed");

  /* Padding and accumulated activity counters */
  ss << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->padding,
            "rsmi_gpu_metrics->padding")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->gfx_activity_acc,
            "rsmi_gpu_metrics->gfx_activity_acc")
     << print_unsigned_hex_and_int(
            rsmi_gpu_metrics->mem_actvity_acc,
            "rsmi_gpu_metrics->mem_actvity_acc");

  /* One entry per HBM instance */
  for (int hbm_idx = 0; hbm_idx < RSMI_NUM_HBM_INSTANCES; ++hbm_idx) {
    ss << print_unsigned_hex_and_int(
              rsmi_gpu_metrics->temperature_hbm[hbm_idx],
              "rsmi_gpu_metrics->temperature_hbm[" + std::to_string(hbm_idx)
              + "]");
  }

  if (rsmi_gpu_metrics_v_1_2 != nullptr) {
    /* PMFW attached timestamp (10ns resolution) */
    ss << print_unsigned_hex_and_int(
              rsmi_gpu_metrics_v_1_2->firmware_timestamp,
              "rsmi_gpu_metrics_v_1_2->firmware_timestamp");
  }

  if (gpu_metrics_v_1_3 != nullptr) {
    /* PMFW attached timestamp (10ns resolution) */
    ss << print_unsigned_hex_and_int(
              gpu_metrics_v_1_3->firmware_timestamp,
              "gpu_metrics_v_1_3->firmware_timestamp");

    /* Voltage (mV) */
    ss << print_unsigned_hex_and_int(
              gpu_metrics_v_1_3->voltage_soc,
              "gpu_metrics_v_1_3->voltage_soc")
       << print_unsigned_hex_and_int(
              gpu_metrics_v_1_3->gfx_voltage,
              "gpu_metrics_v_1_3->voltage_gfx")
       << print_unsigned_hex_and_int(
              gpu_metrics_v_1_3->mem_voltage,
              "gpu_metrics_v_1_3->voltage_mem")
       << print_unsigned_hex_and_int(
              gpu_metrics_v_1_3->padding1,
              "gpu_metrics_v_1_3->padding1");

    /* Throttle status (ASIC independent) */
    ss << print_unsigned_hex_and_int(
              gpu_metrics_v_1_3->indep_throttle_status,
              "gpu_metrics_v_1_3->indep_throttle_status");
  }

  LOG_DEBUG(ss);
}
static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
rsmi_gpu_metrics_t *data, uint8_t content_v) {
assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 &&
@@ -269,16 +463,28 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
rsmi_gpu_metrics_v_1_3 smu_v_1_3;
rsmi_status_t ret;
std::ostringstream ss;
if (!dev->gpu_metrics_ver().structure_size) {
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver());
log_gpu_metrics(&dev->gpu_metrics_ver(), nullptr, nullptr, nullptr);
if (ret != RSMI_STATUS_SUCCESS) {
ss << "Returning = " << getRSMIStatusString(ret)
<< ",\ndev->gpu_metrics_ver().structure_size = "
<< print_unsigned_int(dev->gpu_metrics_ver().structure_size)
<< ", could not read common header";
LOG_ERROR(ss);
return ret;
}
}
// only supports gpu_metrics_v1_x version
if (dev->gpu_metrics_ver().format_revision != 1) {
ss << "Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED)
<< ",\ndev->gpu_metrics_ver().format_revision = "
<< print_unsigned_int(dev->gpu_metrics_ver().format_revision)
<< " was not equal to 1";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
@@ -290,19 +496,31 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
RSMI_GPU_METRICS_API_CONTENT_VER_1) {
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
sizeof(rsmi_gpu_metrics_t), smu);
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_1";
LOG_DEBUG(ss);
log_gpu_metrics(nullptr, nullptr, nullptr, smu);
} else if (dev->gpu_metrics_ver().content_revision ==
RSMI_GPU_METRICS_API_CONTENT_VER_2) {
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2);
map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu);
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_2";
LOG_DEBUG(ss);
log_gpu_metrics(nullptr, &smu_v_1_2, nullptr, smu);
} else if (dev->gpu_metrics_ver().content_revision ==
RSMI_GPU_METRICS_API_CONTENT_VER_3) {
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3);
map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu);
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_3";
LOG_DEBUG(ss);
log_gpu_metrics(nullptr, nullptr, &smu_v_1_3, smu);
} else {
ret = GetGPUMetricsFormat1(dv_ind, smu,
dev->gpu_metrics_ver().content_revision);
ss << __PRETTY_FUNCTION__ << " | GetGPUMetricsFormat1";
LOG_DEBUG(ss);
log_gpu_metrics(nullptr, nullptr, nullptr, smu);
}
if (ret != RSMI_STATUS_SUCCESS) {
+92
Zobrazit soubor
@@ -43,6 +43,9 @@
#include <assert.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <algorithm>
@@ -770,6 +773,95 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth,
return 0;
}
// Sums the "size_in_bytes" property over every memory bank of this KFD node.
//
// Files read:
//   /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties
// Line of interest (example):
//   size_in_bytes 68702699520
//
// @param total  output; reset to 0 on entry, then incremented by each bank's
//               size_in_bytes value
// @return 0 on success; EINVAL when total is nullptr; the errno left by
//         opendir() when the mem_banks directory cannot be opened; 1 when
//         closedir() fails
int KFDNode::get_total_memory(uint64_t* total) {
  if (total == nullptr) return EINVAL;
  *total = 0;

  std::string f_path = kKFDNodesPathRoot;
  f_path += "/";
  f_path += std::to_string(node_indx_);
  f_path += "/mem_banks";

  auto kfd_node_dir = opendir(f_path.c_str());
  if (kfd_node_dir == nullptr) {
    return errno;
  }

  // read "size_in_bytes 68702699520" line
  const std::string size_in_bytes_property = "size_in_bytes ";

  // The directory stream is advanced in exactly one place (the loop header).
  // Previously a successfully parsed bank never advanced the stream, so the
  // same bank was re-read forever, and the parse-failure path advanced it
  // from inside the line loop.
  for (auto dentry = readdir(kfd_node_dir); dentry != nullptr;
       dentry = readdir(kfd_node_dir)) {
    // Skip ".", ".." and hidden entries.
    if (dentry->d_name[0] == '.') {
      continue;
    }
    // Memory banks are the numerically named sub-directories.
    if (!is_number(dentry->d_name)) {
      continue;
    }

    std::string memory_bank_file = f_path + "/"
                + dentry->d_name + "/properties";
    std::ifstream fs(memory_bank_file);
    if (!fs) {
      continue;  // unreadable bank: best effort, move on to the next one
    }

    std::string line;
    while (std::getline(fs, line)) {
      if (line.compare(0, size_in_bytes_property.length(),
                       size_in_bytes_property) != 0) {
        continue;
      }
      auto bytes = line.substr(size_in_bytes_property.length());
      try {
        // Unsigned parse to match the uint64_t accumulator.
        *total += std::stoull(bytes);
        break;  // one size_in_bytes entry per bank
      } catch (...) {
        // Malformed value: keep scanning the remaining lines of this file.
      }
    }  // end loop for lines in property file
  }  // end loop for mem_bank directory

  if (closedir(kfd_node_dir)) {
    std::string err_str = "Failed to close KFD node directory ";
    err_str += f_path;
    err_str += ".";
    perror(err_str.c_str());
    return 1;
  }
  return 0;
}
// Computes the memory in use on this KFD node as (total - available).
// "available" is obtained with the AMDKFD_IOC_AVAILABLE_MEMORY ioctl on
// /dev/kfd for gpu_id_; "total" comes from get_total_memory().
//
// @param used  output; written only when the function returns 0
// @return 0 on success; EINVAL when used is nullptr; 1 on any other failure
//         (open or ioctl failed, total could not be determined or is 0, or
//         the reported available memory is not below total)
int KFDNode::get_used_memory(uint64_t* used) {
  if (used == nullptr) return EINVAL;

  static const char *kPathKFDIoctl = "/dev/kfd";
  int kfd_fd = open(kPathKFDIoctl, O_RDWR | O_CLOEXEC);
  // open() reports failure with -1; 0 is a valid descriptor (e.g. when stdin
  // has been closed) and must not be treated as an error, nor leaked.
  if (kfd_fd < 0) {
    return 1;
  }

  struct kfd_ioctl_get_available_memory_args mem = {0, 0, 0};
  mem.gpu_id = gpu_id_;
  if (ioctl(kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY , &mem) != 0) {
    close(kfd_fd);
    return 1;
  }
  close(kfd_fd);

  // used = total - available
  uint64_t total = 0;
  int ret = get_total_memory(&total);
  if (ret == 0 && total > 0 && mem.available < total) {
    *used = total - mem.available;
    return 0;
  }
  return 1;
}
} // namespace smi
} // namespace amd
+44 -1
Zobrazit soubor
@@ -177,6 +177,9 @@ void Logger::error(const char* text) throw() {
logIntoFile(data);
} else if (m_LogType == CONSOLE) {
logOnConsole(data);
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
logOnConsole(data);
logIntoFile(data);
}
}
@@ -208,6 +211,9 @@ void Logger::alarm(const char* text) throw() {
logIntoFile(data);
} else if (m_LogType == CONSOLE) {
logOnConsole(data);
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
logOnConsole(data);
logIntoFile(data);
}
}
@@ -239,6 +245,9 @@ void Logger::always(const char* text) throw() {
logIntoFile(data);
} else if (m_LogType == CONSOLE) {
logOnConsole(data);
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
logOnConsole(data);
logIntoFile(data);
}
}
@@ -303,6 +312,10 @@ void Logger::info(const char* text) throw() {
logIntoFile(data);
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_INFO)) {
logOnConsole(data);
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
&& (m_LogLevel >= LOG_LEVEL_INFO)) {
logOnConsole(data);
logIntoFile(data);
}
}
@@ -333,6 +346,10 @@ void Logger::trace(const char* text) throw() {
logIntoFile(data);
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_TRACE)) {
logOnConsole(data);
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
&& (m_LogLevel >= LOG_LEVEL_TRACE)) {
logOnConsole(data);
logIntoFile(data);
}
}
@@ -363,6 +380,10 @@ void Logger::debug(const char* text) throw() {
logIntoFile(data);
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_DEBUG)) {
logOnConsole(data);
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
&& (m_LogLevel >= LOG_LEVEL_DEBUG)) {
logOnConsole(data);
logIntoFile(data);
}
}
@@ -424,6 +445,9 @@ std::string Logger::getLogSettings() {
case CONSOLE:
logSettings += "LogType = CONSOLE";
break;
case BOTH_FILE_AND_CONSOLE:
logSettings += "LogType = BOTH_FILE_AND_CONSOLE";
break;
default:
logSettings += "LogType = <undefined>";
}
@@ -471,7 +495,26 @@ void Logger::initialize_resources() {
}
m_File.open(logFileName.c_str(), std::ios::out | std::ios::app);
m_LogLevel = LOG_LEVEL_TRACE;
m_LogType = FILE_LOG;
// RSMI_LOGGING = 1, output to logs only
// RSMI_LOGGING = 2, output to console only
// RSMI_LOGGING = 3, output to logs and console
switch (amd::smi::RocmSMI::getInstance().getLogSetting()) {
case 0:
m_LogType = NO_LOG;
break;
case 1:
m_LogType = FILE_LOG;
break;
case 2:
m_LogType = CONSOLE;
break;
case 3:
m_LogType = BOTH_FILE_AND_CONSOLE;
break;
default:
m_LogType = NO_LOG;
break;
}
if (!m_File.is_open()) {
std::cout << "WARNING: Issue opening log file (" << logFileName
<< ") to write." << std::endl;
+60 -11
Zobrazit soubor
@@ -84,6 +84,7 @@ amd::smi::RocmSMI::devInfoTypesStrings = {
{amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"},
{amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"},
{amd::smi::kDevDevID, amdSMI + "kDevDevID"},
{amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"},
{amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"},
{amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"},
{amd::smi::kDevVendorID, amdSMI + "kDevVendorID"},
@@ -169,6 +170,7 @@ static uint32_t GetDeviceIndex(const std::string s) {
// computed for cardX.
// On success, return drm_minor which is >= 128 otherwise return 0
static uint32_t GetDrmRenderMinor(const std::string s) {
std::ostringstream ss;
std::string drm_path = s;
int drm_minor = 0;
const std::string render_file_prefix = "renderD";
@@ -194,6 +196,10 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
if (closedir(drm_dir)) {
return 0;
}
ss << __PRETTY_FUNCTION__ << " | Discovered drmRenderMinor = "
<< std::to_string(drm_minor) << " | For drm_path = " << drm_path << " | ";
LOG_DEBUG(ss);
return static_cast<uint32_t>(drm_minor);
}
@@ -376,11 +382,15 @@ RocmSMI::Initialize(uint64_t flags) {
// Remove any drm nodes that don't have a corresponding readable kfd node.
// kfd nodes will not be added if their properties file is not readable.
std::ostringstream ss;
auto dev_iter = devices_.begin();
while (dev_iter != devices_.end()) {
uint64_t bdfid = (*dev_iter)->bdfid();
if (tmp_map.find(bdfid) == tmp_map.end()) {
ss << __PRETTY_FUNCTION__ << " | removing device = "
<< (*dev_iter)->path();
dev_iter = devices_.erase(dev_iter);
LOG_DEBUG(ss);
continue;
}
dev_iter++;
@@ -410,6 +420,9 @@ RocmSMI::Initialize(uint64_t flags) {
}
// Leaving below to help debug temp file issues
// displayAppTmpFilesContent();
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList;
LOG_DEBUG(ss);
}
void
@@ -457,17 +470,21 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) {
// provides a way to get env variable detail in both debug & release
// helps enable full logging
static bool getRSMIEnvVar_LoggingEnabled(const char *ev_str) {
bool isLoggingEnabled = false;
// RSMI_LOGGING = 1, output to logs only
// RSMI_LOGGING = 2, output to console only
// RSMI_LOGGING = 3, output to logs and console
static uint32_t getRSMIEnvVar_LoggingEnabled(const char *ev_str) {
uint32_t ret = 0;
ev_str = getenv(ev_str);
if (ev_str != nullptr) {
isLoggingEnabled = true;
int ev_ret = atoi(ev_str);
ret = static_cast<uint32_t>(ev_ret);
}
return isLoggingEnabled;
return ret;
}
static std::unordered_set<uint32_t> GetEnvVarUIntegerSets(const char *ev_str) {
static inline std::unordered_set<uint32_t> GetEnvVarUIntegerSets(
const char *ev_str) {
std::unordered_set<uint32_t> returnSet;
#ifndef DEBUG
(void)ev_str;
@@ -518,7 +535,16 @@ const RocmSMI_env_vars& RocmSMI::getEnv(void) {
}
// Reports whether logging has been requested.
// Calls GetEnvVariables() first, then reads env_vars_.logging_on and treats
// the values 1, 2 and 3 as "logging enabled"; every other value is "off".
bool RocmSMI::isLoggingOn(void) {
  GetEnvVariables();
  const auto requested_mode = this->env_vars_.logging_on;
  return (requested_mode > 0) && (requested_mode <= 3);
}
uint32_t RocmSMI::getLogSetting() {
return this->env_vars_.logging_on;
}
@@ -543,7 +569,9 @@ void RocmSMI::printEnvVarInfo(void) {
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_inf_loop))
<< std::endl;
bool isLoggingOn = (env_vars_.logging_on) ? true : false;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
<< getLogSetting() << std::endl;
bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
<< (isLoggingOn ? "true" : "false") << std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
@@ -630,6 +658,9 @@ RocmSMI::FindMonitor(std::string monitor_path) {
}
void
RocmSMI::AddToDeviceList(std::string dev_name) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
auto dev_path = std::string(kPathDRMRoot);
dev_path += "/";
dev_path += dev_name;
@@ -646,6 +677,10 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
devices_.push_back(dev);
ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = "
<< dev_name << " | path = " << dev_path
<< " | card index = " << std::to_string(card_indx) << " | ";
LOG_DEBUG(ss);
return;
}
@@ -653,16 +688,26 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
static const uint32_t kAmdGpuId = 0x1002;
static bool isAMDGPU(std::string dev_path) {
bool isAmdGpu = false;
std::ostringstream ss;
std::string vend_path = dev_path + "/device/vendor";
if (!FileExists(vend_path.c_str())) {
return false;
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
std::ifstream fs;
fs.open(vend_path);
if (!fs.is_open()) {
return false;
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
uint32_t vendor_id;
@@ -672,9 +717,13 @@ static bool isAMDGPU(std::string dev_path) {
fs.close();
if (vendor_id == kAmdGpuId) {
return true;
isAmdGpu = true;
}
return false;
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
+11
Zobrazit soubor
@@ -313,6 +313,7 @@ int Monitor::writeMonitor(MonitorTypes type, uint32_t sensor_id,
// This string version should work for all valid monitor types
int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
std::string *val) {
std::ostringstream ss;
assert(val != nullptr);
std::string temp_str;
@@ -320,11 +321,21 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr)
int ret = ReadSysfsStr(sysfs_path, val);
ss << __PRETTY_FUNCTION__
<< " | Success | Read hwmon file: " << sysfs_path
<< " | Type: " << monitorTypesToString.at(type)
<< " | Sensor id: " << std::to_string(sensor_id)
<< " | Data: " << *val
<< " | Returning: " << std::to_string(ret) << " |";
LOG_INFO(ss);
return ret;
}
int32_t
Monitor::setTempSensorLabelMap(void) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
std::string type_str;
int ret;
+172 -17
Zobrazit soubor
@@ -52,11 +52,14 @@
#include <string>
#include <cstring>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <vector>
#include <regex>
#include <iomanip>
#include <type_traits>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_utils.h"
@@ -103,7 +106,7 @@ bool FileExists(char const *filename) {
return (stat(filename, &buf) == 0);
}
static void debugFilesDiscovered(std::vector<std::string> files) {
static inline void debugFilesDiscovered(std::vector<std::string> files) {
std::ostringstream ss;
int numberOfFilesFound = static_cast<int>(files.size());
ss << "fileName.size() = " << numberOfFilesFound
@@ -204,9 +207,10 @@ int ReadSysfsStr(std::string path, std::string *retStr) {
if (!fs.is_open()) {
ret = errno;
errno = 0;
oss << "Could not read SYSFS file (" << path << ")"
<< ", returning " << std::to_string(ret) << " ("
<< std::strerror(ret) << ")";
oss << __PRETTY_FUNCTION__
<< " | Fail | Cause: file does not exist or permissions issue"
<< " | SYSFS file: " << path
<< " | Returning: " << std::strerror(ret) << " |";
LOG_ERROR(oss);
return ret;
}
@@ -457,9 +461,13 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
}
chmod(fileName, S_IRUSR|S_IRGRP|S_IROTH);
write(fd, storageData.c_str(), storageData.size());
ssize_t rc_write = write(fd, storageData.c_str(), storageData.size());
close(fd);
return RSMI_STATUS_SUCCESS;
if (rc_write == -1) {
return RSMI_STATUS_FILE_ERROR;
} else {
return RSMI_STATUS_SUCCESS;
}
}
std::vector<std::string> getListOfAppTmpFiles() {
@@ -531,19 +539,39 @@ void displayAppTmpFilesContent() {
}
// Used to debug vector string list and their content
void displayVectorContent(std::vector<std::string> v) {
std::cout << "Vector = {";
std::string debugVectorContent(std::vector<std::string> v) {
std::ostringstream ss;
ss << "Vector = {";
if (v.size() > 0) {
for (auto it=v.begin(); it < v.end(); it++) {
std::cout << *it;
ss << *it;
auto temp_it = it;
if(++temp_it != v.end()) {
std::cout << ", ";
ss << ", ";
}
}
} else {
std::cout << "}" << std::endl;
}
ss << "}" << std::endl;
return ss.str();
}
// Used to debug vector string list and their content
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
std::ostringstream ss;
ss << "Vector = {";
if (v.size() > 0) {
for (auto it=v.begin(); it < v.end(); it++) {
ss << (*it)->path();
auto temp_it = it;
if(++temp_it != v.end()) {
ss << ", ";
}
}
}
ss << "}" << std::endl;
return ss.str();
}
// Attempts to read application specific temporary file
@@ -595,14 +623,20 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
// string domainName = domain name of the the system's node on the network
// string os_distribution = pretty name of os distribution
// (typically found in /etc/*-release file)
// string endianness = system's endianness.
// Expressed as big endian or little endian.
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
std::tuple<bool, std::string, std::string, std::string, std::string,
std::string, std::string, std::string> getSystemDetails(void) {
std::string, std::string, std::string, std::string>
getSystemDetails(void) {
struct utsname buf;
bool errorDetected = false;
std::string temp_data;
std::string sysname, nodename, release, version, machine;
std::string domainName = "<undefined>";
std::string os_distribution = "<undefined>";
std::string endianness = "<undefined>";
if (uname(&buf) < 0) {
errorDetected = true;
@@ -630,8 +664,16 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
}
}
}
if (isSystemBigEndian()) {
endianness = "Big Endian, multi-bit symbols encoded as"
" big endian (MSB first)";
} else {
endianness = "Little Endian, multi-bit symbols encoded as"
" little endian (LSB first)";
}
return std::make_tuple(errorDetected, sysname, nodename, release,
version, machine, domainName, os_distribution);
version, machine, domainName, os_distribution,
endianness);
}
// If logging is enabled through RSMI_LOGGING environment variable.
@@ -639,9 +681,10 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
void logSystemDetails(void) {
std::ostringstream ss;
bool errorDetected;
std::string sysname, node, release, version, machine, domain, distName;
std::string sysname, node, release, version, machine, domain, distName,
endianness;
std::tie(errorDetected, sysname, node, release, version, machine, domain,
distName) = getSystemDetails();
distName, endianness) = getSystemDetails();
if (errorDetected == false) {
ss << "====== Gathered system details ============\n"
<< "SYSTEM NAME: " << sysname << "\n"
@@ -650,7 +693,8 @@ void logSystemDetails(void) {
<< "RELEASE: " << release << "\n"
<< "VERSION: " << version << "\n"
<< "MACHINE TYPE: " << machine << "\n"
<< "DOMAIN: " << domain << "\n";
<< "DOMAIN: " << domain << "\n"
<< "ENDIANNESS: " << endianness << "\n";
LOG_INFO(ss);
} else {
ss << "====== Gathered system details ============\n"
@@ -659,5 +703,116 @@ void logSystemDetails(void) {
}
}
// Logs a hex + ASCII dump of a memory region through LOG_DEBUG.
//
// Usage:
//   logHexDump(desc, addr, len, bytesPerLine);
//     desc:         if non-NULL, written as a description before the dump.
//     addr:         the address to start dumping from.
//     len:          the number of bytes to dump.
//     bytesPerLine: number of bytes on each output line; values outside
//                   [4, 64] silently fall back to 16.
//
// A zero `len`, or a NULL `addr`, is reported through LOG_ERROR and nothing
// is dumped.
void logHexDump(
    const char *desc, const void *addr, const size_t len, size_t bytesPerLine) {
  std::ostringstream ss;

  // Silently ignore out-of-range per-line values.
  if (bytesPerLine < 4 || bytesPerLine > 64) bytesPerLine = 16;

  // Printable rendering of the bytes on the current output line.
  // A std::string replaces the former `unsigned char buff[bytesPerLine + 1]`:
  // variable-length arrays are a compiler extension, not standard C++.
  std::string asciiColumn;
  asciiColumn.reserve(bytesPerLine);

  // View the region as individual bytes.
  const unsigned char *pc = static_cast<const unsigned char *>(addr);

  // Output description if given.
  if (desc != NULL) ss << "\n" << desc << "\n";

  // Length checks.
  if (len == 0) {
    ss << " ZERO LENGTH\n";
    LOG_ERROR(ss);
    return;
  }

  // A NULL address with a non-zero length would be dereferenced below.
  if (pc == NULL) {
    ss << " NULL ADDRESS\n";
    LOG_ERROR(ss);
    return;
  }

  std::string endianness = "<undefined>";
  if (isSystemBigEndian()) {
    endianness = "** System is Big Endian, multi-bit symbols encoded as"
                 " big endian (MSB first) **";
  } else {
    endianness = "** System is Little Endian, multi-bit symbols encoded as"
                 " little endian (LSB first) **";
  }
  ss << "\t" << endianness << "\n";

  // Process every byte in the data.
  size_t i = 0;
  for (; i < len; i++) {
    // Multiple of bytesPerLine means new or first line (with line offset).
    if ((i % bytesPerLine) == 0) {
      // Only print previous-line ASCII column for lines beyond the first.
      if (i != 0) ss << " " << asciiColumn << "\n";
      asciiColumn.clear();

      // Output the offset of the current line.
      ss << " " << std::setw(8) << std::setfill('0') << std::hex << i << " ";
    }

    // Now the hex code for the specific byte.
    ss << " " << std::setw(2) << std::setfill('0') << std::hex
       << static_cast<unsigned>(pc[i]);

    // And remember a printable ASCII character for later.
    // 0x20 = 32 and 0x7e = 126 bound the printable ASCII range.
    const bool isPrintable = (pc[i] >= 0x20) && (pc[i] <= 0x7e);
    asciiColumn.push_back(isPrintable ? static_cast<char>(pc[i]) : '.');
  }

  // Pad out last line if not exactly bytesPerLine characters.
  while ((i % bytesPerLine) != 0) {
    ss << " ";
    i++;
  }

  // And print the final ASCII column.
  ss << " " << asciiColumn << "\n";
  LOG_DEBUG(ss);
}
// Detects the host byte order at run time.
// Stores the integer 1 and inspects the byte at its lowest address: on a
// little-endian host that byte holds the 1, so any other value means the
// host is reported as big endian.
bool isSystemBigEndian() {
  const int probe = 1;
  const char first_byte = *reinterpret_cast<const char *>(&probe);
  return first_byte != 1;
}
// Formats the bus/device/function part of a packed BDF id as "bb:dd.f".
//
//   bdf_id:  packed id; bits [15:8] are read as the bus, bits [7:3] as the
//            device and bits [2:0] as the function. Higher bits are ignored.
//   bfd_str: output; cleared first, then set to the lower-case hex text
//            (bus and device zero-padded to two digits).
//
// Returns RSMI_STATUS_SUCCESS on success. When the bus field is 0 the output
// string is left empty and RSMI_STATUS_NO_DATA is returned.
// NOTE(review): bus 0 is a valid PCI bus number; confirm that rejecting it
// is intended.
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
{
  const auto bus_id = static_cast<uint8_t>((bdf_id & 0x0000FF00) >> 8);
  const auto dev_id = static_cast<uint8_t>((bdf_id & 0x000000F8) >> 3);
  // The function number is a 3-bit field (0-7); the previous 0x3 mask
  // dropped its top bit and misreported functions 4-7.
  const auto func_id = static_cast<uint8_t>(bdf_id & 0x00000007);

  bfd_str = std::string();
  if (bus_id == 0) {
    return rsmi_status_t::RSMI_STATUS_NO_DATA;
  }

  // Unary plus promotes the uint8_t fields to int so they are streamed as
  // numbers rather than as characters.
  std::stringstream bdf_sstream;
  bdf_sstream << std::hex << std::setfill('0')
              << std::setw(sizeof(uint8_t) * 2) << +bus_id << ":";
  bdf_sstream << std::hex << std::setfill('0')
              << std::setw(sizeof(uint8_t) * 2) << +dev_id << ".";
  bdf_sstream << std::hex << std::setfill('0') << +func_id;
  bfd_str = bdf_sstream.str();
  return rsmi_status_t::RSMI_STATUS_SUCCESS;
}
} // namespace smi
} // namespace amd
+5
Zobrazit soubor
@@ -578,6 +578,11 @@ amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle,
return rsmi_wrapper(rsmi_dev_id_get, processor_handle, id);
}
// Retrieves the revision of the given processor by forwarding
// rsmi_dev_revision_get, the handle and the output pointer to rsmi_wrapper,
// whose status is returned unchanged.
amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle,
                                        uint16_t *revision) {
  const amdsmi_status_t wrapper_status =
      rsmi_wrapper(rsmi_dev_revision_get, processor_handle, revision);
  return wrapper_status;
}
// TODO(bliu) : add fw info from libdrm
amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle,
amdsmi_fw_info_t *info) {
+560
Zobrazit soubor
@@ -0,0 +1,560 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include "rocm_smi/rocm_smi_properties.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_logger.h"
#include <algorithm>
#include <cassert>
#include <sstream>
//
// Property reinforcement check list
//
// NOTE: This is a *temporary solution* until we get a better approach, likely
// a driver API that can give us the capabilities of a GPU in question.
//
namespace amd {
namespace smi {
// Descriptive text for each AMDGpuPropertyOpModeTypes_t value.
// dump_amdgpu_property_reinforcement_list() looks entries up with .at() to
// print the "OpMode Desc" of a table entry.
const AMDGpuOpModeList_t amdgpu_opmode_check_list {
{AMDGpuPropertyOpModeTypes_t::kBareMetal, "Bare Metal"},
{AMDGpuPropertyOpModeTypes_t::kSrIov, "SR-IOV"},
{AMDGpuPropertyOpModeTypes_t::kBoth, "Both"},
};
// Descriptive text for each AMDGpuPropertyTypesOffset_t value, i.e. for each
// family of property ids that make_unique_property_id() can tag.
const AMDGpuPropertyTypesOffsetList_t amdgpu_typeoffset_check_list {
{AMDGpuPropertyTypesOffset_t::kNone, "None"},
{AMDGpuPropertyTypesOffset_t::kDevInfoTypes, "Device Info Type"},
{AMDGpuPropertyTypesOffset_t::kMonitorTypes, "Monitor Type"},
{AMDGpuPropertyTypesOffset_t::kPerfTypes, "Performance Type"},
{AMDGpuPropertyTypesOffset_t::kClkTypes, "Clock Type"},
{AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, "Volt Metric Type"},
};
// Combines a property id with the bits of its type offset (bitwise OR), so
// that ids coming from different enum families no longer collide.
AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id) {
  const auto offset_bits = static_cast<AMDGpuPropertyOffsetType>(type_offset);
  return (offset_bits | property_id);
}
// Strips the type-offset bits from a unique property id and returns what is
// left, i.e. the inverse of the OR performed by make_unique_property_id().
AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) {
  // Union of every type-offset bit pattern that may have been OR-ed in.
  const auto property_type_offset_mask =
      static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kDevInfoTypes) |
      static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kMonitorTypes) |
      static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kPerfTypes) |
      static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kClkTypes) |
      static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes);

  // Clear the offset bits; only the original property id remains.
  // (The previously computed, never-read `property_type_offset` local was
  // removed.)
  return (static_cast<AMDGpuPropertyOffsetType>(property_id) & ~(property_type_offset_mask));
}
// Bitwise OR for AMDGpuPropertyTypesOffset_t.
// Equal operands are returned as-is; otherwise both are converted to
// AMDGpuPropertyOffsetType, OR-ed, and converted back to the enum.
AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs)
{
  if (lhs != rhs) {
    const auto combined_bits = static_cast<AMDGpuPropertyOffsetType>(lhs) |
                               static_cast<AMDGpuPropertyOffsetType>(rhs);
    return static_cast<AMDGpuPropertyTypesOffset_t>(combined_bits);
  }
  return lhs;
}
// Bitwise AND for AMDGpuPropertyTypesOffset_t.
// Equal operands are returned as-is; otherwise both are converted to
// AMDGpuPropertyOffsetType, AND-ed, and converted back to the enum.
AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs)
{
  if (lhs != rhs) {
    const auto common_bits = static_cast<AMDGpuPropertyOffsetType>(lhs) &
                             static_cast<AMDGpuPropertyOffsetType>(rhs);
    return static_cast<AMDGpuPropertyTypesOffset_t>(common_bits);
  }
  return lhs;
}
// Bitwise OR for AMDGpuPropertyOpModeTypes_t.
// Equal operands are returned as-is; otherwise both are converted to
// AMDGpuPropertyOpModeType, OR-ed, and converted back to the enum.
AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs)
{
  if (lhs != rhs) {
    const auto combined_modes = static_cast<AMDGpuPropertyOpModeType>(lhs) |
                                static_cast<AMDGpuPropertyOpModeType>(rhs);
    return static_cast<AMDGpuPropertyOpModeTypes_t>(combined_modes);
  }
  return lhs;
}
// Bitwise AND for AMDGpuPropertyOpModeTypes_t.
// Equal operands are returned as-is; otherwise both are converted to
// AMDGpuPropertyOpModeType, AND-ed, and converted back to the enum.
AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs)
{
  if (lhs != rhs) {
    const auto common_modes = static_cast<AMDGpuPropertyOpModeType>(lhs) &
                              static_cast<AMDGpuPropertyOpModeType>(rhs);
    return static_cast<AMDGpuPropertyOpModeTypes_t>(common_modes);
  }
  return lhs;
}
//
// Note: Due to the fact that we have different enum elements with the same
// number, keying a hash by the number is not an option; ie:
// - DevInfoTypes::kDevVendorID = 7
// - MonitorTypes::kMonPowerCapDefault = 7
// So, we are keying it by a unique key, based on their info types
//
// Descriptive name (the corresponding amdsmi API function) for each
// AMDGpuVerbTypes_t value. The text is looked up with .at() when trace
// messages are composed, so every verb used in the reinforcement table
// needs an entry here.
const AMDGpuVerbList_t amdgpu_verb_check_list {
  { AMDGpuVerbTypes_t::kNone, "None" },
  { AMDGpuVerbTypes_t::kSetGpuPciBandwidth, "amdsmi_set_gpu_pci_bandwidth" },
  { AMDGpuVerbTypes_t::kSetPowerCap, "amdsmi_set_power_cap" },
  { AMDGpuVerbTypes_t::kSetGpuPowerProfile, "amdsmi_set_gpu_power_profile" },
  { AMDGpuVerbTypes_t::kSetGpuClkRange, "amdsmi_set_gpu_clk_range" },
  { AMDGpuVerbTypes_t::kSetGpuOdClkInfo, "amdsmi_set_gpu_od_clk_info" },
  { AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, "amdsmi_set_gpu_od_volt_info" },
  { AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, "amdsmi_set_gpu_perf_level_v1" },
  { AMDGpuVerbTypes_t::kSetGpuPerfLevel, "amdsmi_set_gpu_perf_level" },
  { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, "amdsmi_get_gpu_power_profile_presets" },
  { AMDGpuVerbTypes_t::kResetGpu, "amdsmi_reset_gpu" },
  { AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, "amdsmi_set_gpu_perf_determinism_mode" },
  { AMDGpuVerbTypes_t::kSetGpuFanSpeed, "amdsmi_set_gpu_fan_speed" },
  { AMDGpuVerbTypes_t::kResetGpuFan, "amdsmi_reset_gpu_fan" },
  { AMDGpuVerbTypes_t::kSetClkFreq, "amdsmi_set_clk_freq" },
  { AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, "amdsmi_set_gpu_overdrive_level_v1" },
  { AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, "amdsmi_set_gpu_overdrive_level" },
  { AMDGpuVerbTypes_t::kGetGpuFanRpms, "amdsmi_get_gpu_fan_rpms" },
  { AMDGpuVerbTypes_t::kGetGpuFanSpeed, "amdsmi_get_gpu_fan_speed" },
  { AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, "amdsmi_get_gpu_fan_speed_max" },
  // Was labelled "amdsmi_get_temp_metric" (copy-paste slip); this verb is
  // the voltage-metric query.
  { AMDGpuVerbTypes_t::kGetGpuVoltMetric, "amdsmi_get_gpu_volt_metric" },
  { AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, "amdsmi_get_gpu_overdrive_level" },
  { AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, "amdsmi_get_gpu_od_volt_info" },
  { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" }
};
// Revision-id value (0xFFFF) used by reinforcement-table entries that do not
// name one specific revision.
// NOTE(review): presumably a wildcard meaning "all revisions" - confirm
// against the lookup code.
const uint16_t kDevRevIDAll(0xFFFF);
const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
//
// Entry layout (five values per Asic ID):
// {"Asic ID", {"Asic Rev. ID", "Unique Property ID", "Verb ID",
//              "Property Op.Mode", "Availability Flag"}}
// The same Asic ID appears in several entries, one per verb.
// Every entry below sets the availability flag to false.
// Unique property ids are built with make_unique_property_id() because
// different enum families reuse the same numeric values.
//
// DevInfoTypes::kDevPCIEClk = rsmi_dev_pci_bandwidth_get; rsmi_dev_pci_bandwidth_set
// MonitorTypes::kMonPowerCapDefault = rsmi_dev_power_cap_default_get;
// DevInfoTypes::kDevPowerProfileMode =
// rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set;
//
// AMD Instinct MI210
{0x740F, {0x02,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerProfileMode),
AMDGpuVerbTypes_t::kSetGpuPowerProfile,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
// AMD MIxxx
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPCIEClk),
AMDGpuVerbTypes_t::kSetGpuPciBandwidth,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonPowerCapDefault),
AMDGpuVerbTypes_t::kSetPowerCap,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerProfileMode),
AMDGpuVerbTypes_t::kSetGpuPowerProfile,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuClkRange,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuOdClkInfo,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuOdVoltInfo,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_AUTO),
AMDGpuVerbTypes_t::kSetGpuPerfLevelV1,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
AMDGpuVerbTypes_t::kSetGpuPerfLevel,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerProfileMode),
AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevGpuReset),
AMDGpuVerbTypes_t::kResetGpu,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM),
AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanSpeed),
AMDGpuVerbTypes_t::kSetGpuFanSpeed,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanCntrlEnable),
AMDGpuVerbTypes_t::kResetGpuFan,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kClkTypes,
rsmi_clk_type::RSMI_CLK_TYPE_FIRST),
AMDGpuVerbTypes_t::kSetClkFreq,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevOverDriveLevel),
AMDGpuVerbTypes_t::kSetGpuOverdriveLevel,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevOverDriveLevel),
AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanRPMs),
AMDGpuVerbTypes_t::kGetGpuFanRpms,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanSpeed),
AMDGpuVerbTypes_t::kGetGpuFanSpeed,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonMaxFanSpeed),
AMDGpuVerbTypes_t::kGetGpuFanSpeedMax,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes,
rsmi_voltage_metric_t::RSMI_VOLT_CURRENT),
AMDGpuVerbTypes_t::kGetGpuVoltMetric,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevOverDriveLevel),
AMDGpuVerbTypes_t::kGetGpuOverDriveLevel,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerODVoltage),
AMDGpuVerbTypes_t::kGetGpuOdVoltInfo,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevPowerODVoltage),
AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions,
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
}
};
// Cross-checks a failed API call against the property reinforcement table.
//
//   dv_ind:            index of the device the call was made for.
//   verb_type:         the API (verb) that produced the error.
//   actual_error_code: status the API call actually returned.
//
// Returns actual_error_code unchanged when it is RSMI_STATUS_SUCCESS.
// Otherwise the device is queried and the query result is normalised:
// RSMI_STATUS_SUCCESS and RSMI_STATUS_NOT_SUPPORTED are passed through,
// every other result becomes RSMI_STATUS_NOT_FOUND.
rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbTypes_t verb_type, rsmi_status_t actual_error_code)
{
  std::ostringstream osstream;
  osstream << __PRETTY_FUNCTION__ << "| ======= start =======";
  osstream << __PRETTY_FUNCTION__ << " actual error code: " << actual_error_code << "\n";
  LOG_TRACE(osstream);

  // Nothing to validate when the call succeeded.
  if (actual_error_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
    return actual_error_code;
  }

  //
  // For property reinforcement query, the possible return values are:
  //  RSMI_STATUS_SUCCESS:
  //    - Property found in the reinforcement table, and it *should exist*
  //  RSMI_STATUS_NOT_SUPPORTED:
  //    - Property found in the reinforcement table, and it *should not* exist
  //  RSMI_STATUS_NO_DATA:
  //    - Could not find the correct dev_id and dev_revision info to build
  //      the filter
  //  RSMI_STATUS_UNKNOWN_ERROR:
  //    - The results are initialized with that. If that is returned,
  //      likely the reinforcement table does not contain any entries/rules
  //      for the dev_id in question.
  //
  auto amdgpu_property_query_result_hdlr =
      [](rsmi_status_t query_result) -> rsmi_status_t {
    switch (query_result) {
      case (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED):
      case (rsmi_status_t::RSMI_STATUS_SUCCESS):
        return query_result;

      case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR):
      case (rsmi_status_t::RSMI_STATUS_NO_DATA):
      default:
        return rsmi_status_t::RSMI_STATUS_NOT_FOUND;
    }
  };

  // Macro defined elsewhere; it provides the `dev` used below.
  GET_DEV_FROM_INDX
  osstream << __PRETTY_FUNCTION__ << "| ======= about to run property query ======="
           << " [query filters: ]"
           << " device: " << dv_ind
           << " property/verb: " << static_cast<AMDGpuVerbId_t>(verb_type) << amdgpu_verb_check_list.at(verb_type);
  // Emit the message; previously it was composed but never logged.
  LOG_TRACE(osstream);

  auto reinforcement_query_result = dev->check_amdgpu_property_reinforcement_query(dv_ind, verb_type);
  osstream << __PRETTY_FUNCTION__ << "| ======= result from property query ======="
           << " query result: " << reinforcement_query_result;

  reinforcement_query_result = amdgpu_property_query_result_hdlr(reinforcement_query_result);
  osstream << __PRETTY_FUNCTION__ << "| ======= result from property query ======="
           << " query result: " << reinforcement_query_result;
  // Emit the raw and normalised results; previously these were discarded.
  LOG_TRACE(osstream);

  return reinforcement_query_result;
}
// Writes the whole property reinforcement table to the trace log.
// For every entry the ASIC id, revision id, property id, verb (id and
// description), operating mode (id and description) and availability flag
// are written; an empty table is reported as such.
void dump_amdgpu_property_reinforcement_list()
{
  std::ostringstream osstream;
  osstream << __PRETTY_FUNCTION__ << "| ======= start =======";

  if (amdgpu_property_reinforcement_list.empty()) {
    osstream << __PRETTY_FUNCTION__ << "amdgpu_property_reinforcement_list is empty";
    LOG_TRACE(osstream);
    return;
  }

  for (const auto& property : amdgpu_property_reinforcement_list) {
    osstream << __PRETTY_FUNCTION__
             << " Asic ID: " << property.first
             << " Asic Rev.ID: " << property.second.m_pci_rev_id
             << " Property ID: " << property.second.m_property
             << " Verb ID : " << static_cast<AMDGpuVerbId_t>(property.second.m_verb_id)
             << " Verb Desc: " << amdgpu_verb_check_list.at(property.second.m_verb_id)
             << " OpMode: " << static_cast<AMDGpuOpModeType_t>(property.second.m_opmode)
             << " OpMode Desc: " << amdgpu_opmode_check_list.at(property.second.m_opmode)
             << " Flag Avail.: " << property.second.m_should_be_available;
  }
  osstream << __PRETTY_FUNCTION__ << "| ======= end =======";
  // The populated dump used to return here without logging, so the table
  // contents never reached the log.
  LOG_TRACE(osstream);
}
/// Builds a property query for the given device/verb and runs it against the
/// reinforcement table.
///
/// The query starts with only the device index and the verb filled in. The
/// ASIC id and PCI revision id are then resolved from the device index via
/// rsmi_dev_id_get() / rsmi_dev_revision_get().
///
/// @param dev_idx    index of the device being queried
/// @param verb_type  verb (API) whose availability is being checked
/// @return RSMI_STATUS_NO_DATA when the ASIC id or revision id could not be
///         retrieved; otherwise whatever
///         run_amdgpu_property_reinforcement_query() returns for the
///         completed query.
rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type)
{
  std::ostringstream osstream;
  auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR);

  // Initial query: only the device index and the verb are known up front.
  AMDGpuPropertyQuery_t query{};
  query.m_asic_id = 0;
  query.m_pci_rev_id = 0;
  query.m_dev_idx = dev_idx;
  query.m_property = 0;
  query.m_verb_id = verb_type;

  osstream << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(osstream);

  // If the query is missing parts of the filter (asic id, revision id), try
  // to retrieve them based on the device index. The revision is only fetched
  // once the ASIC id has been read successfully.
  auto filter_status(rsmi_status_t::RSMI_STATUS_SUCCESS);
  if (query.m_asic_id == 0) {
    filter_status = rsmi_dev_id_get(dev_idx, &query.m_asic_id);
    if (filter_status == rsmi_status_t::RSMI_STATUS_SUCCESS) {
      filter_status = rsmi_dev_revision_get(dev_idx, &query.m_pci_rev_id);
    }
  }
  const bool is_proper_query = (filter_status == rsmi_status_t::RSMI_STATUS_SUCCESS);

  if (is_proper_query) {
    return run_amdgpu_property_reinforcement_query(query);
  }

  // The filters could not be completed; report and bail out.
  rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA;
  osstream << __PRETTY_FUNCTION__ << "| ======= end ======="
           << ", Missing Query Filters were not successfully retrieved: "
           << " [query filters: ]"
           << " device: " << dev_idx
           << " asic id: " << query.m_asic_id
           << " revision id: " << query.m_pci_rev_id
           << " property: " << query.m_property
           << " verb: " << static_cast<AMDGpuVerbId_t>(query.m_verb_id)
           << " proper_query: " << is_proper_query
           << " error: " << rsmi_status;
  LOG_TRACE(osstream);
  return rsmi_status;
}
/// Looks up a fully-populated query in amdgpu_property_reinforcement_list.
///
/// All table entries stored under the query's ASIC id are visited in order.
/// An entry matches when:
///   - its PCI revision id equals the query's, or is kDevRevIDAll, and
///   - either the query's property is non-zero and equals the entry's
///     property, or the query's verb is not kNone and equals the entry's verb.
/// The first matching entry decides the result.
///
/// @param amdgpu_property_query  query with asic id, revision id and
///                               property and/or verb filled in
/// @return RSMI_STATUS_SUCCESS when the first matching entry is flagged as
///         "should be available", RSMI_STATUS_NOT_SUPPORTED when it is flagged
///         as not available, and RSMI_STATUS_UNKNOWN_ERROR when no entry
///         matches.
rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query)
{
  std::ostringstream osstream;
  auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR);

  osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n";
  LOG_TRACE(osstream);

  // One lookup yields every entry registered for this ASIC id; an empty range
  // means the id is not present in the table at all.
  const auto asic_entries =
      amdgpu_property_reinforcement_list.equal_range(amdgpu_property_query.m_asic_id);
  if (asic_entries.first != asic_entries.second) {
    osstream << __PRETTY_FUNCTION__ << " asic id found in table: " << amdgpu_property_query.m_asic_id << "\n";
  }

  for (auto itr = asic_entries.first; itr != asic_entries.second; ++itr) {
    const auto& entry = itr->second;
    osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr->first << "\n";

    // Pci_rev_id matches the filter or ALL Revisions
    const bool revision_matches =
        (entry.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) ||
        (entry.m_pci_rev_id == kDevRevIDAll);
    if (!revision_matches) {
      continue;
    }
    osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << entry.m_pci_rev_id << "\n";

    // Do we have the property (or verb) we are looking for?
    const bool property_matches =
        (amdgpu_property_query.m_property != 0) &&
        (entry.m_property == amdgpu_property_query.m_property);
    const bool verb_matches =
        (amdgpu_property_query.m_verb_id != AMDGpuVerbTypes_t::kNone) &&
        (entry.m_verb_id == amdgpu_property_query.m_verb_id);
    if (!(property_matches || verb_matches)) {
      continue;
    }

    // The description is taken from the matched entry so that it always
    // belongs to the verb id printed right before it.
    osstream << __PRETTY_FUNCTION__
             << " property found: " << entry.m_property
             << " verb found: " << static_cast<AMDGpuVerbId_t>(entry.m_verb_id)
             << " " << amdgpu_verb_check_list.at(entry.m_verb_id)
             << " should_be_available: " << entry.m_should_be_available << "\n";

    // NOTE(review): this text is written before m_should_be_available is
    // examined, so it also appears for entries that are flagged as available.
    osstream << __PRETTY_FUNCTION__ << "| ======= validating ======="
             << ", Property found in the table for this device and flagged as *Not Available* : "
             << " [query filters: ]"
             << " device: " << amdgpu_property_query.m_dev_idx
             << " asic id: " << amdgpu_property_query.m_asic_id
             << " revision id: " << amdgpu_property_query.m_pci_rev_id
             << " reinf.tbl.rev. id: " << entry.m_pci_rev_id;

    // The table decides: an entry marked 'should not be available' is
    // reported as not supported, anything else as success.
    rsmi_status = entry.m_should_be_available
                      ? rsmi_status_t::RSMI_STATUS_SUCCESS
                      : rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED;
    osstream << __PRETTY_FUNCTION__
             << " should_be_available: " << entry.m_should_be_available
             << " result: " << rsmi_status << "\n";
    LOG_TRACE(osstream);
    return rsmi_status;
  }

  // Nothing in the table matched this device/property/verb combination.
  osstream << __PRETTY_FUNCTION__ << "| ======= end ======="
           << "Done searching for the Property in reinforcement table for this device: "
           << " device: " << amdgpu_property_query.m_dev_idx
           << " asic id: " << amdgpu_property_query.m_asic_id
           << " revision id: " << amdgpu_property_query.m_pci_rev_id
           << " property id: " << amdgpu_property_query.m_property
           << " error: " << rsmi_status;
  LOG_TRACE(osstream);
  return rsmi_status;
}
} // namespace smi
} // namespace amd
+1 -1
Zobrazit soubor
@@ -106,7 +106,7 @@ void TestSysInfoRead::Run(void) {
err = amdsmi_get_gpu_vbios_info(processor_handles_[i], &info);
if (err != AMDSMI_STATUS_SUCCESS) {
if (err == AMDSMI_STATUS_FILE_ERROR) {
if ((err == AMDSMI_STATUS_FILE_ERROR) || (err == AMDSMI_STATUS_NOT_SUPPORTED)) {
IF_VERB(STANDARD) {
std::cout << "\t**VBIOS read: Not supported on this machine"
<< std::endl;
+15 -1
Zobrazit soubor
@@ -55,6 +55,20 @@ FILTER[sienna_cichlid]=\
$BLACKLIST_ALL_ASICS\
"rsmitstReadWrite.TestPerfLevelReadWrite"
# SWDEV-391407
FILTER[90400]=\
$BLACKLIST_ALL_ASICS\
"rsmitstReadOnly.TestVoltCurvRead:"\
"rsmitstReadOnly.TestFrequenciesRead:"\
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
"rsmitstReadWrite.TestPowerReadWrite"
FILTER[90401]=\
$BLACKLIST_ALL_ASICS\
"rsmitstReadOnly.TestVoltCurvRead:"\
"rsmitstReadOnly.TestFrequenciesRead:"\
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
"rsmitstReadWrite.TestPowerReadWrite"
# SWDEV-321166
FILTER[virtualization]=\
$BLACKLIST_ALL_ASICS\
@@ -63,4 +77,4 @@ $BLACKLIST_ALL_ASICS\
"rsmitstReadWrite.FanReadWrite:"\
"rsmitstReadWrite.TestOverdriveReadWrite:"\
"rsmitstReadWrite.TestPowerReadWrite:"\
"rsmitstReadWrite.TestPowerCapReadWrite"
"rsmitstReadWrite.TestPowerCapReadWrite"
+7
Zobrazit soubor
@@ -173,6 +173,13 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) {
std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl;
}
err = amdsmi_get_gpu_revision(dv_ind, &val_ui16);
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Device Revision ID: 0x" << std::hex <<
val_ui16 << std::endl;
}
amdsmi_board_info_t board_info;
err = amdsmi_get_gpu_board_info(dv_ind, &board_info);
CHK_ERR_ASRT(err)
+1
Zobrazit soubor
@@ -283,6 +283,7 @@ void DumpMonitorInfo(const TestBase *test) {
};
print_val_str(amd::smi::kDevDevID, "Device ID: ");
print_val_str(amd::smi::kDevDevRevID, "Dev.Rev.ID: ");
print_val_str(amd::smi::kDevPerfLevel, "Performance Level: ");
print_val_str(amd::smi::kDevOverDriveLevel, "OverDrive Level: ");
print_vector(amd::smi::kDevGPUMClk,
+325
Zobrazit soubor
@@ -0,0 +1,325 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2019, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <stdint.h>
#include <stddef.h>
#include <iostream>
#include <string>
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/id_info_read.h"
#include "rocm_smi_test/test_common.h"
// Registers the title and description for the ID-info read test.
TestIdInfoRead::TestIdInfoRead() {
  set_title("RSMI ID Info Read Test");
  set_description(
      "This test verifies that ID information such as the device, "
      "subsystem and vendor IDs can be read properly.");
}
// Nothing to release here; the destructor body is intentionally empty.
TestIdInfoRead::~TestIdInfoRead(void) = default;
void TestIdInfoRead::SetUp(void) {
TestBase::SetUp();
return;
}
void TestIdInfoRead::DisplayTestInfo(void) {
TestBase::DisplayTestInfo();
}
void TestIdInfoRead::DisplayResults(void) const {
TestBase::DisplayResults();
return;
}
void TestIdInfoRead::Close() {
// This will close handles opened within rsmitst utility calls and call
// rsmi_shut_down(), so it should be done after other hsa cleanup
TestBase::Close();
}
// Size, in bytes, of the scratch buffer used for the string-returning queries.
static constexpr uint32_t kBufferLen{80};
// Reads every identification attribute of each monitored device and checks
// the API's argument validation.
//
// For each getter the same pattern is applied:
//   * RSMI_STATUS_NOT_SUPPORTED from the real call is tolerated; the call is
//     then repeated with a null output pointer and must again report
//     RSMI_STATUS_NOT_SUPPORTED.
//   * Any other status must be a success (CHK_ERR_ASRT); the value is printed
//     at STANDARD verbosity and the null-pointer call must then report
//     RSMI_STATUS_INVALID_ARGS.
// rsmi_dev_pci_id_get() is the exception: it is always expected to succeed.
//
// The test is skipped when TestBase set-up failed.
void TestIdInfoRead::Run(void) {
  rsmi_status_t err;
  uint16_t id = 0;
  uint64_t val_ui64 = 0;
  uint32_t drm_render_minor = 0;
  char buffer[kBufferLen] = {};

  TestBase::Run();
  if (setup_failed_) {
    std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
    return;
  }

  for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
    IF_VERB(STANDARD) {
      std::cout << "\t*************************" << std::endl;
      // std::hex is sticky on std::cout; request decimal explicitly so the
      // index does not depend on whatever was printed before.
      std::cout << "\t**Device index: " << std::dec << i << std::endl;
    }

    // Get the device ID, name, vendor ID and vendor name for the device
    err = rsmi_dev_id_get(i, &id);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Device ID: 0x" << std::hex << id << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    // Get device Revision
    err = rsmi_dev_revision_get(i, &id);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_revision_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << id << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_revision_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_name_get(i, buffer, kBufferLen);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      std::cout << "\t**Device Marketing name not found on this system." <<
                                                                    std::endl;
      // Verify api support checking functionality is working
      err = rsmi_dev_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Device Marketing name: " << buffer << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_brand_get(i, buffer, kBufferLen);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_brand_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Device Brand name: " << buffer << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_brand_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_vram_vendor_get(i, buffer, kBufferLen);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      std::cout <<
        "\t**Vram Vendor string not supported on this system." << std::endl;
      err = rsmi_dev_vram_vendor_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Device Vram Vendor name: " << buffer << std::endl;
      }
      err = rsmi_dev_vram_vendor_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_vendor_id_get(i, &id);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_vendor_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Vendor ID: 0x" << std::hex << id << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_vendor_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_drm_render_minor_get(i, &drm_render_minor);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_drm_render_minor_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        // The preceding ID prints leave std::cout in hexadecimal mode; switch
        // back so the minor number is shown in decimal, as its label (which
        // has no "0x" prefix) implies.
        std::cout << "\t**DRM Render Minor: " << std::dec << drm_render_minor
                                                                 << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_drm_render_minor_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_vendor_name_get(i, buffer, kBufferLen);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      std::cout << "\t**Device Vendor name string not found on this system." <<
                                                                     std::endl;
      // Verify api support checking functionality is working
      err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Device Vendor name: " << buffer << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    // Get the device ID, name, vendor ID and vendor name for the sub-device
    err = rsmi_dev_subsystem_id_get(i, &id);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_subsystem_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Subsystem ID: 0x" << std::hex << id << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_subsystem_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_subsystem_name_get(i, buffer, kBufferLen);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      std::cout << "\t**Subsystem name string not found on this system." <<
                                                                    std::endl;
      // Verify api support checking functionality is working
      err = rsmi_dev_subsystem_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Subsystem name: " << buffer << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_subsystem_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_subsystem_vendor_id_get(i, &id);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_subsystem_vendor_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Sub-system Vendor ID: 0x" << std::hex <<
                                                              id << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_subsystem_vendor_id_get(i, nullptr);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    // NOTE(review): this section is labelled "Subsystem Vendor name" but calls
    // rsmi_dev_vendor_name_get(), the same getter used for the device vendor
    // name above - confirm whether that is intended.
    err = rsmi_dev_vendor_name_get(i, buffer, kBufferLen);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      std::cout <<
           "\t**Subsystem Vendor name string not found on this system." <<
                                                                    std::endl;
      // Verify api support checking functionality is working
      err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Subsystem Vendor name: " << buffer << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }

    err = rsmi_dev_pci_id_get(i, &val_ui64);
    // Don't check for RSMI_STATUS_NOT_SUPPORTED since this should always be
    // supported. It is not based on a sysfs file.
    CHK_ERR_ASRT(err)
    IF_VERB(STANDARD) {
      std::cout << "\t**PCI ID (BDFID): 0x" << std::hex << val_ui64;
      std::cout << " (" << std::dec << val_ui64 << ")" << std::endl;
    }
    // Verify api support checking functionality is working
    err = rsmi_dev_pci_id_get(i, nullptr);
    ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);

    err = rsmi_dev_serial_number_get(i, buffer, kBufferLen);
    if (err == RSMI_STATUS_NOT_SUPPORTED) {
      // Verify api support checking functionality is working
      err = rsmi_dev_serial_number_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
      std::cout <<
          "\t**Serial Number string not supported on this system." << std::endl;
    } else {
      CHK_ERR_ASRT(err)
      IF_VERB(STANDARD) {
        std::cout << "\t**Device Serial Number:" << buffer << std::endl;
      }
      // Verify api support checking functionality is working
      err = rsmi_dev_serial_number_get(i, nullptr, kBufferLen);
      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
    }
  }
}