Merge remote-tracking branch 'rocmsmi/amd-staging' into amd-dev
Change-Id: I9c38b4facd472b877d1ad133f3176a023c890955
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/amdsmi commit: 936719eeb6]
Tento commit je obsažen v:
@@ -28,3 +28,6 @@ _toc.yml
|
||||
_build/
|
||||
_doxygen/
|
||||
docBin/
|
||||
|
||||
# Simulated SYSFS - for early development or debug
|
||||
device/
|
||||
|
||||
@@ -62,7 +62,7 @@ EOF
|
||||
# confirm logrotate file exists in daily
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
else
|
||||
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
|
||||
@@ -77,6 +77,7 @@ EOF
|
||||
"$packageName logs (when turned on) may not rotate properly."
|
||||
fi
|
||||
fi
|
||||
return #done configuring for non-systemd timers
|
||||
else
|
||||
# Configure systemd timers - the typical setup for modern Linux logrotation setups
|
||||
if [ -f /lib/systemd/system/logrotate.timer ]; then
|
||||
@@ -102,6 +103,7 @@ EOF
|
||||
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
fi
|
||||
return #done configuring for systemd timers
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@ rm_pyc() {
|
||||
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/amd_smi/__pycache__
|
||||
}
|
||||
|
||||
|
||||
case "$1" in
|
||||
( remove | upgrade)
|
||||
rm_ldconfig
|
||||
|
||||
@@ -62,7 +62,7 @@ EOF
|
||||
# confirm logrotate file exists in daily
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
else
|
||||
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
|
||||
@@ -77,6 +77,7 @@ EOF
|
||||
"$packageName logs (when turned on) may not rotate properly."
|
||||
fi
|
||||
fi
|
||||
return #done configuring for non-systemd timers
|
||||
else
|
||||
# Configure systemd timers - the typical setup for modern Linux logrotation setups
|
||||
if [ -f /lib/systemd/system/logrotate.timer ]; then
|
||||
@@ -102,6 +103,7 @@ EOF
|
||||
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
fi
|
||||
return #done configuring for systemd timers
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@@ -1456,6 +1456,21 @@ amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf, amdsmi_pr
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle, uint16_t *id);
|
||||
|
||||
/**
|
||||
* @brief Get the device revision associated with the device
|
||||
*
|
||||
* @details Given a processor handle @p processor_handle and a pointer to a
|
||||
* uint16_t @p revision to which the revision id will be written
|
||||
*
|
||||
* @param[in] processor_handle a processor handle
|
||||
*
|
||||
* @param[out] revision a pointer to uint16_t to which the device revision
|
||||
* will be written
|
||||
*
|
||||
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
|
||||
*/
|
||||
amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle, uint16_t *revision);
|
||||
|
||||
/**
|
||||
* @brief Get the name string for a given vendor ID
|
||||
*
|
||||
|
||||
Spustitelný soubor
+395
@@ -0,0 +1,395 @@
|
||||
/*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2020 Open Compute Project
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <dirent.h>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <regex> // NOLINT
|
||||
#include <map>
|
||||
|
||||
#include "rocm_smi/rocm_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_counters.h"
|
||||
#include "rocm_smi/rocm_smi_kfd.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
#include "oam/oam_mapi.h"
|
||||
#include "oam/amd_oam.h"
|
||||
|
||||
static const std::map<int, const char *> err_map = {
|
||||
{ AMDOAM_STATUS_INVALID_ARGS, "Invalid arguments" },
|
||||
{ AMDOAM_STATUS_NOT_SUPPORTED, "Feature not supported" },
|
||||
{ AMDOAM_STATUS_FILE_ERROR, "Problem accessing a file" },
|
||||
{ AMDOAM_STATUS_PERMISSION, "Permission denied" },
|
||||
{ AMDOAM_STATUS_OUT_OF_RESOURCES, "Not enough memory or other resource" },
|
||||
{ AMDOAM_STATUS_INTERNAL_EXCEPTION, "An internal exception was caught" },
|
||||
{ AMDOAM_STATUS_INPUT_OUT_OF_BOUNDS,
|
||||
"The provided input is out of allowable or safe range" },
|
||||
{ AMDOAM_STATUS_INIT_ERROR, "AMDOAM is not initialized or init failed" },
|
||||
{ AMDOAM_STATUS_ERROR, "Generic error" },
|
||||
{ AMDOAM_STATUS_NOT_FOUND, "An item was searched for but not found" }
|
||||
};
|
||||
|
||||
#define TRY try {
|
||||
#define CATCH } catch (...) {return handleRSMIException();}
|
||||
|
||||
static bool rsmi_initialized;
|
||||
|
||||
// Convert an RSMI status into the negative integer code returned by the
// AMDOAM entry points in this file.  Any status greater than
// RSMI_STATUS_INIT_ERROR collapses to the generic -AMDOAM_STATUS_ERROR;
// every other status is simply negated.
static int rsmi_status_to_amdoam_errorcode(rsmi_status_t status) {
  const bool maps_directly = !(status > RSMI_STATUS_INIT_ERROR);
  if (maps_directly) {
    return -static_cast<int>(status);
  }
  return -AMDOAM_STATUS_ERROR;
}
|
||||
|
||||
static int handleRSMIException() {
|
||||
rsmi_status_t ret = amd::smi::handleException();
|
||||
return rsmi_status_to_amdoam_errorcode(ret);
|
||||
}
|
||||
|
||||
int amdoam_get_error_description(int code, const char **description) {
|
||||
if (description == nullptr)
|
||||
return -AMDOAM_STATUS_INVALID_ARGS;
|
||||
|
||||
auto search = err_map.find(code);
|
||||
if (search == err_map.end())
|
||||
return -AMDOAM_STATUS_NOT_FOUND;
|
||||
|
||||
*description = search->second;
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
int amdoam_init(void) {
|
||||
TRY
|
||||
|
||||
rsmi_status_t status = rsmi_init(0);
|
||||
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
|
||||
rsmi_initialized = true;
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
int amdoam_free(void) {
|
||||
rsmi_status_t status = rsmi_shut_down();
|
||||
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
int amdoam_discover_devices(uint32_t *device_count) {
|
||||
rsmi_status_t status;
|
||||
|
||||
if (device_count == nullptr) {
|
||||
return -AMDOAM_STATUS_INVALID_ARGS;
|
||||
}
|
||||
|
||||
status = rsmi_num_monitor_devices(device_count);
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
int amdoam_get_pci_properties(uint32_t device_id, oam_pci_info_t *pci_info) {
|
||||
uint64_t bdfid;
|
||||
|
||||
TRY
|
||||
if (pci_info == nullptr) {
|
||||
return -AMDOAM_STATUS_INVALID_ARGS;
|
||||
}
|
||||
|
||||
rsmi_status_t status = rsmi_dev_pci_id_get(device_id, &bdfid);
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
|
||||
pci_info->domain = (uint16_t)(bdfid >> 32) & 0xffff;
|
||||
pci_info->bus = (bdfid >> 8) & 0xff;
|
||||
pci_info->device = (bdfid >> 3) & 0x1f;
|
||||
pci_info->function = bdfid & 0x7;
|
||||
CATCH
|
||||
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
int amdoam_get_dev_properties(uint32_t num_devices,
|
||||
oam_dev_properties_t *devices) {
|
||||
const size_t buf_size = 32;
|
||||
char buf[buf_size] = "";
|
||||
uint32_t dev_inx;
|
||||
oam_dev_properties_t *dev = devices;
|
||||
|
||||
TRY
|
||||
if (devices == nullptr)
|
||||
return -AMDOAM_STATUS_INVALID_ARGS;
|
||||
if (!rsmi_initialized)
|
||||
return -AMDOAM_STATUS_INIT_ERROR;
|
||||
|
||||
for (dev_inx = 0; dev_inx < num_devices; dev_inx++) {
|
||||
dev->device_id = dev_inx;
|
||||
/* If fails to get any following properties, it's not treated as a deal
|
||||
* breaker. Variable not filled means that property is not available on
|
||||
* this device or AMD doesn't support that property.
|
||||
*/
|
||||
rsmi_dev_vendor_name_get(dev_inx, dev->device_vendor, DEVICE_VENDOR_LEN);
|
||||
rsmi_dev_name_get(dev_inx, dev->device_name, DEVICE_NAME_LEN);
|
||||
rsmi_dev_vbios_version_get(dev_inx, buf, buf_size);
|
||||
if (std::strlen(buf) > 0) {
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wstringop-truncation"
|
||||
std::strncpy(dev->sku_name, &buf[4], 6);
|
||||
std::strncpy(dev->board_name, buf, 12);
|
||||
#pragma GCC diagnostic pop
|
||||
}
|
||||
rsmi_dev_serial_number_get(dev_inx, dev->board_serial_number,
|
||||
BOARD_SERIAL_NUM_LEN);
|
||||
++dev;
|
||||
}
|
||||
CATCH
|
||||
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Scan a hwmon directory and return the highest sensor number found for
// one sensor family.
//
// A directory entry counts when its name contains, at a word boundary,
// @p fn_reg immediately followed by digits (e.g. "temp1_input" for
// "temp").  The number used is the first run of digits in the file name.
// For the "in" family the number is incremented by one before comparison
// (presumably because hwmon numbers voltage inputs from 0).
//
// @param hwmon_path  directory to scan
// @param fn_reg      sensor-family prefix ("temp", "fan", "in", ...); it is
//                    spliced into a regular expression verbatim
// @return the largest (adjusted) sensor number, or 0 when nothing matches
//         or the directory cannot be opened.
// @throws whatever std::regex / std::stoi throw; the directory handle is
//         closed before the exception propagates.
static uint32_t
get_num_sensors(std::string hwmon_path, std::string fn_reg) {
  DIR *hwmon_dir = opendir(hwmon_path.c_str());
  if (hwmon_dir == nullptr) {
    // No readable hwmon directory means no sensors.  This used to be an
    // assert only, which let release builds call readdir() on nullptr.
    return 0;
  }

  uint32_t sensor_max = 0;
  try {
    // Both patterns are loop-invariant, so compile them once.
    const std::regex name_re("\\b" + fn_reg + "([0-9]+)([^ ]*)");
    const std::regex first_number_re("[^0-9]*([0-9]+).*");
    const bool bump_by_one = (fn_reg == "in");

    for (struct dirent *dentry = readdir(hwmon_dir); dentry != nullptr;
         dentry = readdir(hwmon_dir)) {
      const std::string fn = dentry->d_name;
      if (!std::regex_search(fn, name_re)) {
        continue;
      }

      const std::string digits =
          std::regex_replace(fn, first_number_re, std::string("$1"));
      uint32_t sensor_num = static_cast<uint32_t>(std::stoi(digits));
      if (bump_by_one) {
        ++sensor_num;
      }
      if (sensor_num > sensor_max) {
        sensor_max = sensor_num;
      }
    }
  } catch (...) {
    closedir(hwmon_dir);  // do not leak the handle on a parse failure
    throw;
  }

  closedir(hwmon_dir);
  return sensor_max;
}
|
||||
|
||||
|
||||
int amdoam_get_sensors_count(uint32_t device_id,
|
||||
oam_sensor_count_t *sensor_count) {
|
||||
uint32_t dv_ind = device_id;
|
||||
|
||||
TRY
|
||||
if (sensor_count == nullptr)
|
||||
return -AMDOAM_STATUS_INVALID_ARGS;
|
||||
GET_DEV_FROM_INDX
|
||||
assert(dev->monitor() != nullptr);
|
||||
std::string hwmon_path = dev->monitor()->path();
|
||||
sensor_count->num_temperature_sensors = get_num_sensors(hwmon_path, "temp");
|
||||
sensor_count->num_fans = get_num_sensors(hwmon_path, "fan");
|
||||
sensor_count->num_voltage_sensors = get_num_sensors(hwmon_path, "in");
|
||||
sensor_count->num_power_sensors = get_num_sensors(hwmon_path, "power");
|
||||
sensor_count->num_current_sensors = get_num_sensors(hwmon_path, "current");
|
||||
CATCH
|
||||
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
int amdoam_get_sensors_info(uint32_t device_id, oam_sensor_type_t type,
|
||||
uint32_t num_sensors, oam_sensor_info_t sensor_info[]) {
|
||||
uint32_t dv_ind = device_id;
|
||||
std::string val_str;
|
||||
uint32_t i;
|
||||
rsmi_status_t status;
|
||||
|
||||
TRY
|
||||
if ((sensor_info == nullptr) || (type >= OAM_SENSOR_TYPE_UNKNOWN))
|
||||
return -AMDOAM_STATUS_INVALID_ARGS;
|
||||
GET_DEV_FROM_INDX
|
||||
assert(dev->monitor() != nullptr);
|
||||
switch (type) {
|
||||
case OAM_SENSOR_TYPE_POWER:
|
||||
for (i = 0; i < num_sensors; i++) {
|
||||
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
|
||||
"POWER_SENSOR_%u", i+1);
|
||||
sensor_info[i].sensor_type = type;
|
||||
status = rsmi_dev_power_ave_get(device_id, i,
|
||||
reinterpret_cast<uint64_t*>(&sensor_info[i].value));
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
}
|
||||
break;
|
||||
|
||||
case OAM_SENSOR_TYPE_VOLTAGE:
|
||||
for (i = 0; i < num_sensors; i++) {
|
||||
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
|
||||
"VOLTAGE_SENSOR_%u", i);
|
||||
sensor_info[i].sensor_type = type;
|
||||
status = rsmi_dev_volt_metric_get(device_id, RSMI_VOLT_TYPE_VDDGFX,
|
||||
RSMI_VOLT_CURRENT, &sensor_info[i].value);
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
}
|
||||
break;
|
||||
|
||||
case OAM_SENSOR_TYPE_TEMP:
|
||||
for (i = 0; i < num_sensors; i++) {
|
||||
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
|
||||
"TEMP_SENSOR_%u", i+1);
|
||||
sensor_info[i].sensor_type = type;
|
||||
status = rsmi_dev_temp_metric_get(device_id, i, RSMI_TEMP_CURRENT,
|
||||
&sensor_info[i].value);
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
}
|
||||
break;
|
||||
|
||||
case OAM_SENSOR_TYPE_FAN_SPEED:
|
||||
for (i = 0; i < num_sensors; i++) {
|
||||
snprintf(sensor_info[i].sensor_name, OAM_SENSOR_NAME_MAX,
|
||||
"FAN_SENSOR_%u", i+1);
|
||||
sensor_info[i].sensor_type = type;
|
||||
status = rsmi_dev_fan_speed_get(device_id, i, &sensor_info[i].value);
|
||||
if (status != RSMI_STATUS_SUCCESS)
|
||||
return rsmi_status_to_amdoam_errorcode(status);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
return -AMDOAM_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
CATCH
|
||||
|
||||
return AMDOAM_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// TODO(x): This function doesn't work for OAM. It's just a version
|
||||
// of rsmi_dev_ecc_count_get(), which has similar functionality.
|
||||
// The purpose here is just to drive refactoring; e.g., making macros
|
||||
// available and previously static functions global.
|
||||
int
|
||||
get_device_error_count(oam_dev_handle_t *handle,
|
||||
oam_dev_error_count_t *count) {
|
||||
std::vector<std::string> val_vec;
|
||||
rsmi_status_t ret;
|
||||
|
||||
TRY
|
||||
// TODO(x): replace with final code...
|
||||
// Below, we are just returning errors for RSMI_GPU_BLOCK_GFX as a
|
||||
// placeholder
|
||||
(void)handle; // Just ignore for now
|
||||
|
||||
rsmi_gpu_block_t block = RSMI_GPU_BLOCK_GFX;
|
||||
|
||||
// The macro CHK_SUPPORT_VAR assumes the existence of a device index variable
|
||||
// "dv_ind". Presumably, the device index will come from the "handle"
|
||||
// pointer. Since I don't know how that will be implemented, for now we
|
||||
// will just make up a device index:
|
||||
uint32_t dv_ind = 0;
|
||||
CHK_SUPPORT_VAR(count, block)
|
||||
|
||||
amd::smi::DevInfoTypes type;
|
||||
switch (block) {
|
||||
case RSMI_GPU_BLOCK_UMC:
|
||||
type = amd::smi::kDevErrCntUMC;
|
||||
break;
|
||||
|
||||
case RSMI_GPU_BLOCK_SDMA:
|
||||
type = amd::smi::kDevErrCntSDMA;
|
||||
break;
|
||||
|
||||
case RSMI_GPU_BLOCK_GFX:
|
||||
type = amd::smi::kDevErrCntGFX;
|
||||
break;
|
||||
|
||||
case RSMI_GPU_BLOCK_MMHUB:
|
||||
type = amd::smi::kDevErrCntMMHUB;
|
||||
break;
|
||||
|
||||
case RSMI_GPU_BLOCK_PCIE_BIF:
|
||||
type = amd::smi::kDevErrCntPCIEBIF;
|
||||
break;
|
||||
|
||||
case RSMI_GPU_BLOCK_HDP:
|
||||
type = amd::smi::kDevErrCntHDP;
|
||||
break;
|
||||
|
||||
case RSMI_GPU_BLOCK_XGMI_WAFL:
|
||||
type = amd::smi::kDevErrCntXGMIWAFL;
|
||||
break;
|
||||
|
||||
default:
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
DEVICE_MUTEX
|
||||
|
||||
ret = GetDevValueVec(type, dv_ind, &val_vec);
|
||||
|
||||
if (ret == RSMI_STATUS_FILE_ERROR) {
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return static_cast<int>(ret);
|
||||
}
|
||||
|
||||
assert(val_vec.size() == 2);
|
||||
|
||||
std::string junk;
|
||||
std::istringstream fs1(val_vec[0]);
|
||||
|
||||
fs1 >> junk;
|
||||
assert(junk == "ue:");
|
||||
fs1 >> count->total_error_count;
|
||||
|
||||
std::istringstream fs2(val_vec[1]);
|
||||
|
||||
fs2 >> junk;
|
||||
assert(junk == "ce:");
|
||||
fs2 >> count->total_error_count;
|
||||
|
||||
return static_cast<int>(ret);
|
||||
CATCH
|
||||
}
|
||||
@@ -718,6 +718,9 @@ int main() {
|
||||
ret = rsmi_dev_id_get(i, &val_ui16);
|
||||
CHK_RSMI_RET_I(ret)
|
||||
std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl;
|
||||
ret = rsmi_dev_revision_get(i, &val_ui16);
|
||||
CHK_RSMI_RET_I(ret)
|
||||
std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << std::endl;
|
||||
|
||||
char current_compute_partition[256];
|
||||
current_compute_partition[0] = '\0';
|
||||
|
||||
@@ -36,6 +36,12 @@ struct kfd_ioctl_get_version_args {
|
||||
__u32 minor_version; /* from KFD */
|
||||
};
|
||||
|
||||
struct kfd_ioctl_get_available_memory_args {
|
||||
__u64 available; /* from KFD */
|
||||
__u32 gpu_id; /* to KFD */
|
||||
__u32 pad;
|
||||
};
|
||||
|
||||
/* For kfd_ioctl_create_queue_args.queue_type. */
|
||||
#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0
|
||||
#define KFD_IOC_QUEUE_TYPE_SDMA 0x1
|
||||
@@ -726,6 +732,10 @@ struct kfd_ioctl_cross_memory_copy_args {
|
||||
#define AMDKFD_IOC_CROSS_MEMORY_COPY \
|
||||
AMDKFD_IOWR(0x22, struct kfd_ioctl_cross_memory_copy_args)
|
||||
|
||||
|
||||
#define AMDKFD_IOC_AVAILABLE_MEMORY \
|
||||
AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args)
|
||||
|
||||
#define AMDKFD_COMMAND_START 0x01
|
||||
#undef AMDKFD_COMMAND_END
|
||||
#define AMDKFD_COMMAND_END 0x22
|
||||
|
||||
@@ -1088,6 +1088,21 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices);
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id);
|
||||
|
||||
/**
|
||||
* @brief Get the device revision associated with the device
|
||||
*
|
||||
* @details Given a device index @p dv_ind and a pointer to a uint16_t to
|
||||
* which the revision will be written
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[out] revision a pointer to uint16_t to which the device revision
|
||||
* will be written
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision);
|
||||
|
||||
/**
|
||||
* @brief Get the SKU for a desired device associated with the device with
|
||||
|
||||
@@ -52,12 +52,14 @@
|
||||
#include <vector>
|
||||
#include <unordered_set>
|
||||
#include <map>
|
||||
#include <type_traits>
|
||||
|
||||
#include "rocm_smi/rocm_smi_monitor.h"
|
||||
#include "rocm_smi/rocm_smi_power_mon.h"
|
||||
#include "rocm_smi/rocm_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_counters.h"
|
||||
#include "rocm_smi/rocm_smi_properties.h"
|
||||
#include "shared_mutex.h" //NOLINT
|
||||
|
||||
namespace amd {
|
||||
@@ -100,6 +102,7 @@ enum DevInfoTypes {
|
||||
kDevOverDriveLevel,
|
||||
kDevMemOverDriveLevel,
|
||||
kDevDevID,
|
||||
kDevDevRevID,
|
||||
kDevDevProdName,
|
||||
kDevDevProdNum,
|
||||
kDevVendorID,
|
||||
@@ -172,6 +175,7 @@ typedef struct {
|
||||
std::vector<DevInfoTypes> variants;
|
||||
} dev_depends_t;
|
||||
|
||||
|
||||
class Device {
|
||||
public:
|
||||
explicit Device(std::string path, RocmSMI_env_vars const *e);
|
||||
@@ -212,7 +216,7 @@ class Device {
|
||||
void set_evt_notif_anon_fd(uint32_t fd) {
|
||||
evt_notif_anon_fd_ = static_cast<int>(fd);}
|
||||
int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;}
|
||||
metrics_table_header_t & gpu_metrics_ver(void) {return gpu_metrics_ver_;}
|
||||
metrics_table_header_t &gpu_metrics_ver(void) {return gpu_metrics_ver_;}
|
||||
void fillSupportedFuncs(void);
|
||||
void DumpSupportedFunctions(void);
|
||||
bool DeviceAPISupported(std::string name, uint64_t variant,
|
||||
@@ -220,6 +224,8 @@ class Device {
|
||||
rsmi_status_t restartAMDGpuDriver(void);
|
||||
rsmi_status_t storeDevicePartitions(uint32_t dv_ind);
|
||||
template <typename T> std::string readBootPartitionState(uint32_t dv_ind);
|
||||
rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type);
|
||||
|
||||
|
||||
private:
|
||||
std::shared_ptr<Monitor> monitor_;
|
||||
@@ -240,6 +246,7 @@ class Device {
|
||||
int readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
void *p_binary_data);
|
||||
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
|
||||
rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query);
|
||||
uint64_t bdfid_;
|
||||
uint64_t kfd_gpu_id_;
|
||||
std::unordered_set<rsmi_event_group_t,
|
||||
@@ -252,6 +259,7 @@ class Device {
|
||||
struct metrics_table_header_t gpu_metrics_ver_;
|
||||
};
|
||||
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -80,6 +80,10 @@ class KFDNode {
|
||||
uint32_t amdgpu_dev_index(void) const {return amdgpu_dev_index_;}
|
||||
void set_amdgpu_dev_index(uint32_t val) {amdgpu_dev_index_ = val;}
|
||||
|
||||
// Get memory from kfd
|
||||
int get_total_memory(uint64_t* total);
|
||||
int get_used_memory(uint64_t* used);
|
||||
|
||||
private:
|
||||
uint32_t node_indx_;
|
||||
uint32_t amdgpu_dev_index_;
|
||||
|
||||
@@ -100,6 +100,7 @@ typedef enum LOG_TYPE {
|
||||
NO_LOG = 1,
|
||||
CONSOLE = 2,
|
||||
FILE_LOG = 3,
|
||||
BOTH_FILE_AND_CONSOLE = 4
|
||||
} LogType;
|
||||
|
||||
class Logger {
|
||||
|
||||
@@ -115,6 +115,7 @@ class RocmSMI {
|
||||
const RocmSMI_env_vars& getEnv(void);
|
||||
void printEnvVarInfo(void);
|
||||
bool isLoggingOn(void);
|
||||
uint32_t getLogSetting(void);
|
||||
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
|
||||
|
||||
private:
|
||||
|
||||
@@ -94,6 +94,44 @@ enum MonitorTypes {
|
||||
kMonInvalid = 0xFFFFFFFF,
|
||||
};
|
||||
|
||||
const std::map<MonitorTypes,std::string> monitorTypesToString {
|
||||
{MonitorTypes::kMonName, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTemp, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonFanSpeed, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonMaxFanSpeed, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonFanRPMs, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonFanCntrlEnable, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCap, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCapDefault, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCapMax, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerCapMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonPowerAve, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMax, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMaxHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempMinHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCritical, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCriticalHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempEmergency, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempEmergencyHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCritMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempCritMinHyst, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempOffset, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempLowest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempHighest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonTempLabel, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVolt, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMax, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMinCrit, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMin, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltMaxCrit, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltAverage, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltLowest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltHighest, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonVoltLabel, "amd::smi::kMonName"},
|
||||
{MonitorTypes::kMonInvalid, "amd::smi::kMonName"},
|
||||
};
|
||||
|
||||
|
||||
class Monitor {
|
||||
public:
|
||||
|
||||
@@ -0,0 +1,160 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
|
||||
#define INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
|
||||
|
||||
#include "rocm_smi/rocm_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
|
||||
//
|
||||
// Property reinforcement check list
|
||||
//
|
||||
using AMDGpuPropertyId_t = uint32_t;
|
||||
using AMDGpuDevIdx_t = uint32_t;
|
||||
using AMDGpuVerbId_t = uint32_t;
|
||||
using AMDGpuAsicId_t = uint16_t;
|
||||
using AMDGpuAsicRevId_t = uint16_t;
|
||||
using AMDGpuOpModeType_t = uint8_t;
|
||||
|
||||
enum class AMDGpuVerbTypes_t : AMDGpuVerbId_t
|
||||
{
|
||||
kNone = 0,
|
||||
kSetGpuPciBandwidth,
|
||||
kSetPowerCap,
|
||||
kSetGpuPowerProfile,
|
||||
kSetGpuClkRange,
|
||||
kSetGpuOdClkInfo,
|
||||
kSetGpuOdVoltInfo,
|
||||
kSetGpuPerfLevelV1,
|
||||
kSetGpuPerfLevel,
|
||||
kGetGpuPowerProfilePresets,
|
||||
kResetGpu,
|
||||
kSetGpuPerfDeterminismMode,
|
||||
kSetGpuFanSpeed,
|
||||
kResetGpuFan,
|
||||
kSetClkFreq,
|
||||
kSetGpuOverdriveLevelV1,
|
||||
kSetGpuOverdriveLevel,
|
||||
kGetGpuFanRpms,
|
||||
kGetGpuFanSpeed,
|
||||
kGetGpuFanSpeedMax,
|
||||
kGetGpuVoltMetric,
|
||||
kGetGpuOverDriveLevel,
|
||||
kGetGpuOdVoltInfo,
|
||||
kGetGpuOdVoltCurveRegions,
|
||||
};
|
||||
using AMDGpuVerbList_t = std::map<AMDGpuVerbTypes_t, std::string>;
|
||||
|
||||
|
||||
enum class AMDGpuPropertyTypesOffset_t : AMDGpuPropertyId_t
|
||||
{
|
||||
kNone = 0,
|
||||
kDevInfoTypes = (0x1000 << 0),
|
||||
kMonitorTypes = (0x1000 << 1),
|
||||
kPerfTypes = (0x1000 << 2),
|
||||
kClkTypes = (0x1000 << 3),
|
||||
kVoltMetricTypes = (0x1000 << 4),
|
||||
};
|
||||
|
||||
using AMDGpuPropertyOffsetType = std::underlying_type<AMDGpuPropertyTypesOffset_t>::type;
|
||||
using AMDGpuPropertyTypesOffsetList_t = std::map<AMDGpuPropertyTypesOffset_t, std::string>;
|
||||
AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs);
|
||||
AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs);
|
||||
|
||||
|
||||
enum class AMDGpuPropertyOpModeTypes_t : AMDGpuOpModeType_t
|
||||
{
|
||||
kBareMetal = (0x1 << 0),
|
||||
kSrIov = (0x1 << 1),
|
||||
kBoth = (0x1 << 2),
|
||||
};
|
||||
|
||||
using AMDGpuPropertyOpModeType = std::underlying_type<AMDGpuPropertyOpModeTypes_t>::type;
|
||||
using AMDGpuOpModeList_t = std::map<AMDGpuPropertyOpModeTypes_t, std::string>;
|
||||
AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs);
|
||||
AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs);
|
||||
|
||||
|
||||
struct AMDGpuProperties_t
|
||||
{
|
||||
AMDGpuAsicRevId_t m_pci_rev_id;
|
||||
AMDGpuPropertyId_t m_property;
|
||||
AMDGpuVerbTypes_t m_verb_id;
|
||||
AMDGpuPropertyOpModeTypes_t m_opmode;
|
||||
bool m_should_be_available;
|
||||
};
|
||||
using AMDGpuPropertyList_t = std::multimap<AMDGpuAsicId_t, AMDGpuProperties_t>;
|
||||
|
||||
struct AMDGpuPropertyQuery_t
|
||||
{
|
||||
AMDGpuAsicId_t m_asic_id;
|
||||
AMDGpuAsicRevId_t m_pci_rev_id;
|
||||
AMDGpuDevIdx_t m_dev_idx;
|
||||
AMDGpuPropertyId_t m_property;
|
||||
AMDGpuVerbTypes_t m_verb_id;
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id);
|
||||
AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id);
|
||||
|
||||
rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind,
|
||||
AMDGpuVerbTypes_t dev_info_type,
|
||||
rsmi_status_t actual_error_code);
|
||||
|
||||
void dump_amdgpu_property_reinforcement_list();
|
||||
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
#endif  // INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_
|
||||
@@ -48,6 +48,9 @@
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include <type_traits>
|
||||
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
|
||||
@@ -84,6 +87,8 @@ std::tuple<bool, std::string> readTmpFile(
|
||||
std::string stateName,
|
||||
std::string parameterName);
|
||||
void displayAppTmpFilesContent(void);
|
||||
std::string debugVectorContent(std::vector<std::string> v);
|
||||
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v);
|
||||
rsmi_status_t handleException();
|
||||
rsmi_status_t
|
||||
GetDevValueVec(amd::smi::DevInfoTypes type,
|
||||
@@ -94,8 +99,53 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type,
|
||||
rsmi_status_t ErrnoToRsmiStatus(int err);
|
||||
std::string getRSMIStatusString(rsmi_status_t ret);
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string> getSystemDetails(void);
|
||||
std::string, std::string, std::string, std::string>
|
||||
getSystemDetails(void);
|
||||
void logSystemDetails(void);
|
||||
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
|
||||
void logHexDump(const char *desc, const void *addr, const size_t len,
|
||||
size_t perLine);
|
||||
bool isSystemBigEndian();
|
||||
/**
 * @brief Format an integral value as zero-padded hexadecimal text.
 *
 * The value is always rendered with exactly two hex digits per byte of T.
 * Negative signed values are shown as the two's-complement bit pattern of T
 * (e.g. int16_t(-1) -> "ffff"), not sign-extended to 64 bits.
 *
 * @tparam T integral type; 8-bit types are printed as numbers, not characters.
 * @param i value to format
 * @param showHexNotation when true (default) the result is prefixed with "0x"
 * @return the formatted string
 */
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation=true) {
  using widest_t = unsigned long long int;

  // Number of high bits of widest_t that lie outside T. Guarded so the shift
  // below is never >= the width of widest_t.
  const unsigned int unused_bits = (sizeof(T) >= sizeof(widest_t))
      ? 0u
      : static_cast<unsigned int>(8 * (sizeof(widest_t) - sizeof(T)));

  // (i | 0) applies the usual integer promotions so that char-sized types are
  // treated as integers. Widening a negative value sign-extends it, so mask
  // the result back down to the bits that actually belong to T.
  const widest_t value = static_cast<widest_t>(i | 0)
                         & (~static_cast<widest_t>(0) >> unused_bits);

  std::stringstream ss;
  if (showHexNotation) {
    ss << "0x";
  }
  // setw only affects the next formatted output, i.e. the value itself.
  ss << std::setfill('0') << std::setw(static_cast<int>(sizeof(T) * 2))
     << std::hex << value << std::dec;
  return ss.str();
}
|
||||
|
||||
/**
 * @brief Render an integral value as unsigned decimal text.
 *
 * The value is promoted with (i | 0) and then converted to
 * unsigned long long, so a negative input appears as its wrapped-around
 * unsigned 64-bit value.
 *
 * @param i value to format
 * @return decimal string of the converted value
 */
template <typename T>
std::string print_unsigned_int(T i) {
  const auto widened = static_cast<unsigned long long int>(i | 0);
  std::ostringstream text;
  text << widened;
  return text.str();
}
|
||||
|
||||
/**
 * @brief Build a one-line summary of a value: hex form, unsigned decimal
 *        form and the byte size of its type.
 *
 * @param i value to describe
 * @param heading optional label; when non-empty the summary is preceded by
 *        a newline, the label and " = "
 * @return the assembled text
 */
template <typename T>
std::string print_unsigned_hex_and_int(T i, std::string heading="") {
  std::stringstream out;
  if (!heading.empty()) {
    out << "\n" << heading << " = ";
  }
  out << "Hex (MSB): " << print_int_as_hex(i) << ", ";
  out << "Unsigned int: " << print_unsigned_int(i) << ", ";
  out << "Byte Size: " << sizeof(T);
  return out.str();
}
|
||||
|
||||
struct pthread_wrap {
|
||||
public:
|
||||
|
||||
@@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface '
|
||||
footerString = ' End of ROCm SMI Log '
|
||||
|
||||
# Output formatting
|
||||
appWidth = 84
|
||||
appWidth = 100
|
||||
deviceList = []
|
||||
|
||||
# Enable or disable serialized format
|
||||
@@ -112,19 +112,10 @@ def formatCsv(deviceList):
|
||||
if outputType == 'system':
|
||||
jsonobj = json.loads(jsondata)
|
||||
keylist = header
|
||||
for record in jsonobj:
|
||||
my_string += str(record)
|
||||
for key in keylist:
|
||||
if key == 'system':
|
||||
tempstr = str(jsonobj[record])
|
||||
tempstr = tempstr[tempstr.find('\'')+1:]
|
||||
tempstr = tempstr[:tempstr.find('\'')]
|
||||
# Force output device type to 'system'
|
||||
my_string += ',%s\nsystem,%s' % (tempstr, jsonobj[record][tempstr])
|
||||
my_string += '\n'
|
||||
# Force output device type to 'system'
|
||||
if my_string.startswith('system'):
|
||||
my_string = 'device' + my_string[6:]
|
||||
for record in jsonobj['system']:
|
||||
my_string += "\"%s\", \"%s\"\n" % (record, jsonobj['system'][record])
|
||||
# add header
|
||||
my_string = "name, value\n" + my_string
|
||||
return my_string
|
||||
headerkeys = []
|
||||
# Separate device-specific information from system-level information
|
||||
@@ -249,6 +240,17 @@ def getId(device):
|
||||
return hex(dv_id.value)
|
||||
|
||||
|
||||
def getRev(device):
    """ Return the hexadecimal value of a device's Revision

    Returns None when the library call does not succeed.

    @param device: DRM device identifier
    """
    # rsmi_dev_revision_get() writes through a uint16_t pointer, so the buffer
    # must be unsigned 16-bit; a signed c_short would turn values >= 0x8000
    # into negative numbers (and hex() would print e.g. '-0x1').
    dv_rev = c_uint16()
    ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev))
    if rsmi_ret_ok(ret, device, 'get_device_rev'):
        return hex(dv_rev.value)
    return None
|
||||
|
||||
|
||||
def getMaxPower(device):
|
||||
""" Return the maximum power cap of a given device
|
||||
|
||||
@@ -391,6 +393,25 @@ def getTemp(device, sensor):
|
||||
return temp.value / 1000
|
||||
return 'N/A'
|
||||
|
||||
def findFirstAvailableTemp(device):
    """ Discovers the first available device temperature to display

    Walks temp_type_lst in order and stops at the first sensor whose current
    temperature can be read. Returns a tuple of (temp_type, temp_value) for
    the device specified, where temp_type is the capitalized sensor name in
    parentheses and temp_value is the raw reading divided by 1000.
    If no sensor can be read, ('(Unknown)', 'N/A') is returned.

    @param device: DRM device identifier
    """
    reading = c_int64(0)
    current_metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
    for sensor_idx, sensor_name in enumerate(temp_type_lst):
        status = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), sensor_idx,
                                                  current_metric, byref(reading))
        # Failures are checked silently; just move on to the next sensor type.
        if not rsmi_ret_ok(status, device, 'get_temp_metric_' + sensor_name, silent=True):
            continue
        return ('(' + sensor_name.capitalize() + ')', reading.value / 1000)
    return ("(Unknown)", "N/A")
|
||||
|
||||
def getVbiosVersion(device):
|
||||
""" Returns the VBIOS version for a given device
|
||||
@@ -399,7 +420,9 @@ def getVbiosVersion(device):
|
||||
"""
|
||||
vbios = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
|
||||
if rsmi_ret_ok(ret, device):
|
||||
if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
return "Unsupported"
|
||||
elif rsmi_ret_ok(ret, device):
|
||||
return vbios.value.decode()
|
||||
|
||||
|
||||
@@ -425,7 +448,7 @@ def getComputePartition(device):
|
||||
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
|
||||
return str(currentComputePartition.value.decode())
|
||||
return "UNKNOWN"
|
||||
return "N/A"
|
||||
|
||||
|
||||
def getMemoryPartition(device):
|
||||
@@ -437,7 +460,7 @@ def getMemoryPartition(device):
|
||||
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
|
||||
return str(currentNPSMode.value.decode())
|
||||
return "UNKNOWN"
|
||||
return "N/A"
|
||||
|
||||
|
||||
def print2DArray(dataArray):
|
||||
@@ -537,16 +560,23 @@ def printEventList(device, delay, eventList):
|
||||
data = rsmi_evt_notification_data_t(1)
|
||||
rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data))
|
||||
if len(data.message) > 0:
|
||||
print2DArray([['\rGPU[%d]:\t' % (device), ctime().split()[3], notification_type_names[data.event.value - 1],
|
||||
print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1],
|
||||
data.message.decode('utf8') + '\r']])
|
||||
|
||||
def printLog(device, metricName, value=None, extraSpace=False):
|
||||
def printLog(device, metricName, value=None, extraSpace=False, useItalics=False):
|
||||
""" Print out to the SMI log
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param metricName: Title of the item to print to the log
|
||||
@param value: The item's value to print to the log
|
||||
"""
|
||||
red = '\033[91m'
|
||||
green = '\033[92m'
|
||||
blue = '\033[94m'
|
||||
bold = '\033[1m'
|
||||
italics = '\033[3m'
|
||||
underline = '\033[4m'
|
||||
end = '\033[0m'
|
||||
global PRINT_JSON
|
||||
if PRINT_JSON:
|
||||
if value is not None and device is not None:
|
||||
@@ -563,6 +593,8 @@ def printLog(device, metricName, value=None, extraSpace=False):
|
||||
# Force thread safe printing
|
||||
lock = multiprocessing.Lock()
|
||||
lock.acquire()
|
||||
if useItalics:
|
||||
logstr = italics + logstr + end
|
||||
if extraSpace:
|
||||
print('\n' + logstr + '\n', end='', flush=True)
|
||||
else:
|
||||
@@ -1353,7 +1385,7 @@ def setPowerOverDrive(deviceList, value, autoRespond):
|
||||
RETCODE = 1
|
||||
continue
|
||||
if new_power_cap.value == current_power_cap.value:
|
||||
printErrLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000))
|
||||
printLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000))
|
||||
|
||||
if current_power_cap.value < default_power_cap.value:
|
||||
current_power_cap.value = default_power_cap.value
|
||||
@@ -1540,18 +1572,39 @@ def showAllConcise(deviceList):
|
||||
print('ERROR: Cannot print JSON/CSV output for concise output')
|
||||
sys.exit(1)
|
||||
printLogSpacer(' Concise Info ')
|
||||
header = ['GPU', 'Temp (DieEdge)', 'AvgPwr', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
|
||||
deviceList.sort()
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
available_temp_type = temp_type.lower()
|
||||
available_temp_type = available_temp_type.replace('(', '')
|
||||
available_temp_type = available_temp_type.replace(')', '')
|
||||
header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%']
|
||||
subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', '']
|
||||
# add additional spaces to match header
|
||||
for idx, item in enumerate(subheader):
|
||||
header_size = len(header[idx])
|
||||
subheader_size = len(subheader[idx])
|
||||
if header_size != subheader_size:
|
||||
numSpacesToFill_subheader = header_size - subheader_size
|
||||
numSpacesToFill_header = subheader_size - header_size
|
||||
# A positive difference means this cell is the shorter one and must be padded to match the other row's width
|
||||
if numSpacesToFill_subheader > 0:
|
||||
subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader)
|
||||
if numSpacesToFill_header > 0:
|
||||
header[idx] = header[idx] + (' ' * numSpacesToFill_header)
|
||||
head_widths = [len(head) + 2 for head in header]
|
||||
values = {}
|
||||
degree_sign = u'\N{DEGREE SIGN}'
|
||||
for device in deviceList:
|
||||
temp = str(getTemp(device, 'edge'))
|
||||
if temp != 'N/A':
|
||||
temp += 'c'
|
||||
temp_val = str(getTemp(device, available_temp_type))
|
||||
if temp_val != 'N/A':
|
||||
temp_val += degree_sign + 'C'
|
||||
avgPwr = str(getPower(device))
|
||||
if avgPwr != '0.0' and avgPwr != 'N/A':
|
||||
avgPwr += 'W'
|
||||
else:
|
||||
avgPwr = 'N/A'
|
||||
combined_partition = (getMemoryPartition(device) + ", "
|
||||
+ getComputePartition(device))
|
||||
concise = True
|
||||
sclk = showCurrentClocks([device], 'sclk', concise)
|
||||
mclk = showCurrentClocks([device], 'mclk', concise)
|
||||
@@ -1575,7 +1628,9 @@ def showAllConcise(deviceList):
|
||||
mem_use_pct='Unsupported'
|
||||
if vram_used != None and vram_total != None and float(vram_total) != 0:
|
||||
mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total)))
|
||||
values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap,
|
||||
values['card%s' % (str(device))] = [device, temp_val, avgPwr,
|
||||
combined_partition, sclk, mclk,
|
||||
fan, str(perf).lower(), pwrCap,
|
||||
mem_use_pct, gpu_busy]
|
||||
val_widths = {}
|
||||
for device in deviceList:
|
||||
@@ -1585,6 +1640,9 @@ def showAllConcise(deviceList):
|
||||
for col in range(len(val_widths[device])):
|
||||
max_widths[col] = max(max_widths[col], val_widths[device][col])
|
||||
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)), None)
|
||||
printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)),
|
||||
None, useItalics=True)
|
||||
printLogSpacer(fill='=')
|
||||
for device in deviceList:
|
||||
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
|
||||
zip(range(len(max_widths)), values['card%s' % (str(device))])), None)
|
||||
@@ -1601,19 +1659,23 @@ def showAllConciseHw(deviceList):
|
||||
print('ERROR: Cannot print JSON/CSV output for concise hardware output')
|
||||
sys.exit(1)
|
||||
printLogSpacer(' Concise Hardware Info ')
|
||||
header = ['GPU', 'DID', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS']
|
||||
header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS']
|
||||
head_widths = [len(head) + 2 for head in header]
|
||||
values = {}
|
||||
for device in deviceList:
|
||||
gpuid = getId(device)
|
||||
if str(gpuid).startswith('0x'):
|
||||
gpuid = str(gpuid)[2:]
|
||||
gpurev = getRev(device)
|
||||
if str(gpurev).startswith('0x'):
|
||||
gpurev = str(gpurev)[2:]
|
||||
|
||||
gfxRas = getRasEnablement(device, 'GFX')
|
||||
sdmaRas = getRasEnablement(device, 'SDMA')
|
||||
umcRas = getRasEnablement(device, 'UMC')
|
||||
vbios = getVbiosVersion(device)
|
||||
bus = getBus(device)
|
||||
values['card%s' % (str(device))] = [device, gpuid, gfxRas, sdmaRas, umcRas, vbios, bus]
|
||||
values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus]
|
||||
val_widths = {}
|
||||
for device in deviceList:
|
||||
val_widths[device] = [len(str(val)) + 2 for val in values['card%s' % (str(device))]]
|
||||
@@ -1952,6 +2014,7 @@ def showId(deviceList):
|
||||
printLogSpacer(' ID ')
|
||||
for device in deviceList:
|
||||
printLog(device, 'GPU ID', getId(device))
|
||||
printLog(device, 'GPU Rev', getRev(device))
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -2272,8 +2335,12 @@ def showProductName(deviceList):
|
||||
# if rsmi_ret_ok(ret, device) and sku.value.decode():
|
||||
# device_sku = sku.value.decode()
|
||||
# Retrieve the device SKU as a substring from VBIOS
|
||||
device_sku = ""
|
||||
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode():
|
||||
if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
device_sku = "Unsupported"
|
||||
printLog(device, 'Card SKU', '\t\t' + device_sku)
|
||||
elif rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode():
|
||||
# Device SKU is just the characters in between the two '-' in vbios_version
|
||||
if vbios.value.decode().count('-') == 2 and len(str(vbios.value.decode().split('-')[1])) > 1:
|
||||
device_sku = vbios.value.decode().split('-')[1]
|
||||
@@ -2535,7 +2602,7 @@ def showEvents(deviceList, eventTypes):
|
||||
break
|
||||
|
||||
|
||||
def printTempGraph(deviceList, delay):
|
||||
def printTempGraph(deviceList, delay, temp_type):
|
||||
# deviceList must be in ascending order
|
||||
deviceList.sort()
|
||||
devices = 0
|
||||
@@ -2549,7 +2616,7 @@ def printTempGraph(deviceList, delay):
|
||||
terminalWidth = os.get_terminal_size()[0]
|
||||
printStrings = list()
|
||||
for device in deviceList:
|
||||
temp = getTemp(device, 'edge')
|
||||
temp = getTemp(device, temp_type)
|
||||
if temp == 'N/A':
|
||||
percentage = 0
|
||||
else:
|
||||
@@ -2622,11 +2689,16 @@ def getGraphColor(percentage):
|
||||
|
||||
|
||||
def showTempGraph(deviceList):
|
||||
printLogSpacer(' Temperature Graph ')
|
||||
deviceList.sort()
|
||||
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
|
||||
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
|
||||
temp_type = temp_type.lower()
|
||||
temp_type = temp_type.replace('(', '')
|
||||
temp_type = temp_type.replace(')', '')
|
||||
# Start a thread for constantly printing
|
||||
try:
|
||||
# Create a thread (call print function, devices, delay in ms)
|
||||
_thread.start_new_thread(printTempGraph, (deviceList, 150))
|
||||
_thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type))
|
||||
except Exception as e:
|
||||
printErrLog(device, 'Unable to start new thread. %s' % (e))
|
||||
# Catch user input for program termination
|
||||
|
||||
@@ -11,8 +11,16 @@ import os
|
||||
|
||||
# Use ROCm installation path if running from standard installation
|
||||
# With File Reorg rsmiBindings.py will be installed in /opt/rocm/libexec/rocm_smi.
|
||||
# relative path changed accordingly
|
||||
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
|
||||
# relative path changed accordingly.
|
||||
# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location
|
||||
#
|
||||
path_librocm = str()
|
||||
rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
|
||||
if (rocm_smi_lib_path != None):
|
||||
path_librocm = rocm_smi_lib_path
|
||||
else:
|
||||
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
|
||||
|
||||
if not os.path.isfile(path_librocm):
|
||||
print('Unable to find %s . Trying /opt/rocm*' % path_librocm)
|
||||
for root, dirs, files in os.walk('/opt', followlinks=True):
|
||||
@@ -22,9 +30,10 @@ if not os.path.isfile(path_librocm):
|
||||
print('Using lib from %s' % path_librocm)
|
||||
else:
|
||||
print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')
|
||||
else:
|
||||
print('Library loaded from: %s ' % path_librocm)
|
||||
|
||||
# ----------> TODO: Support static libs as well as SO
|
||||
|
||||
try:
|
||||
cdll.LoadLibrary(path_librocm)
|
||||
rocmsmi = CDLL(path_librocm)
|
||||
@@ -36,7 +45,6 @@ except OSError:
|
||||
.format('\33[33m', '\033[0m'))
|
||||
exit()
|
||||
|
||||
|
||||
# Device ID
|
||||
dv_id = c_uint64()
|
||||
# GPU ID
|
||||
|
||||
@@ -78,6 +78,7 @@
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
using namespace ROCmLogging;
|
||||
using namespace amd::smi;
|
||||
|
||||
static const uint32_t kMaxOverdriveLevel = 20;
|
||||
static const float kEnergyCounterResolution = 15.3f;
|
||||
@@ -632,7 +633,7 @@ rsmi_status_t
|
||||
rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
rsmi_error_count_t *ec) {
|
||||
std::vector<std::string> val_vec;
|
||||
rsmi_status_t ret;
|
||||
rsmi_status_t ret(RSMI_STATUS_NOT_SUPPORTED);
|
||||
std::ostringstream ss;
|
||||
|
||||
TRY
|
||||
@@ -673,8 +674,8 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
|
||||
default:
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", default case -> reporting RSMI_STATUS_NOT_SUPPORTED"
|
||||
<< amd::smi::getRSMIStatusString(ret);
|
||||
<< ", default case -> reporting "
|
||||
<< amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED);
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
@@ -682,6 +683,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
DEVICE_MUTEX
|
||||
|
||||
ret = GetDevValueVec(type, dv_ind, &val_vec);
|
||||
if (val_vec.size() != 2 ) ret = RSMI_STATUS_FILE_ERROR;
|
||||
|
||||
if (ret == RSMI_STATUS_FILE_ERROR || val_vec.size() != 2) {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
@@ -698,8 +700,6 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
return ret;
|
||||
}
|
||||
|
||||
assert(val_vec.size() == 2);
|
||||
|
||||
std::string junk;
|
||||
std::istringstream fs1(val_vec[0]);
|
||||
|
||||
@@ -820,6 +820,21 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Retrieves the revision value of the device at index dv_ind into *revision
// by reading amd::smi::kDevDevRevID through get_id(). Trace messages are
// logged on entry and on exit; the exit message includes the status string.
// Returns the status produced by get_id().
rsmi_status_t
rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) {
  std::ostringstream trace_msg;
  rsmi_status_t ret;

  trace_msg << __PRETTY_FUNCTION__ << "| ======= start =======";
  LOG_TRACE(trace_msg);

  // NOTE(review): macro defined elsewhere; presumably validates the device
  // index / `revision` pointer and may return early -- confirm.
  CHK_SUPPORT_NAME_ONLY(revision)

  ret = get_id(dv_ind, amd::smi::kDevDevRevID, revision);

  trace_msg << __PRETTY_FUNCTION__ << " | ======= end =======";
  trace_msg << ", reporting " << amd::smi::getRSMIStatusString(ret);
  LOG_TRACE(trace_msg);
  return ret;
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) {
|
||||
TRY
|
||||
@@ -2503,7 +2518,16 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
}
|
||||
|
||||
if (temperature == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: temperature was a null ptr reference"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
|
||||
// The HBM temperature is retrieved from the gpu_metrics
|
||||
@@ -2512,12 +2536,32 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
|| sensor_type == RSMI_TEMP_TYPE_HBM_2
|
||||
|| sensor_type == RSMI_TEMP_TYPE_HBM_3) {
|
||||
if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: To retreive HBM temp, we only support metric = "
|
||||
<< "RSMI_TEMP_CURRENT"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
rsmi_gpu_metrics_t gpu_metrics;
|
||||
ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: rsmi_dev_gpu_metrics_info_get returned "
|
||||
<< getRSMIStatusString(ret)
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(ret) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -2537,11 +2581,28 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
default:
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
if (val_ui16 == UINT16_MAX)
|
||||
if (val_ui16 == UINT16_MAX) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: Reached UINT16 max value, overflow"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
else
|
||||
} else
|
||||
*temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
|
||||
<< " | Success "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Data: " << *temperature
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | ";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
} // end HBM temperature
|
||||
|
||||
@@ -2550,6 +2611,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
GET_DEV_FROM_INDX
|
||||
|
||||
if (dev->monitor() == nullptr) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Cause: monitor returned nullptr"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
std::shared_ptr<amd::smi::Monitor> m = dev->monitor();
|
||||
@@ -2563,6 +2633,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type,
|
||||
CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index)
|
||||
|
||||
ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature);
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
|
||||
<< " | Success "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Sensor_index: " << sensor_index
|
||||
<< " | Type: " << monitorTypesToString.at(mon_type)
|
||||
<< " | Data: " << *temperature
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(ret) << " | ";
|
||||
LOG_INFO(ss);
|
||||
|
||||
return ret;
|
||||
CATCH
|
||||
@@ -2995,6 +3074,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t reserved,
|
||||
|
||||
DEVICE_MUTEX
|
||||
rsmi_status_t ret = get_power_profiles(dv_ind, status, nullptr);
|
||||
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
@@ -3015,6 +3095,7 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy,
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
rsmi_status_t ret = set_power_profile(dv_ind, profile);
|
||||
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
@@ -3052,6 +3133,14 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
|
||||
DEVICE_MUTEX
|
||||
ret = get_dev_value_int(mem_type_file, dv_ind, total);
|
||||
|
||||
// Fallback to KFD reported memory if VRAM total is 0
|
||||
if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) {
|
||||
GET_DEV_AND_KFDNODE_FROM_INDX
|
||||
if (kfd_node->get_total_memory(total) == 0 && *total > 0) {
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
@@ -3088,6 +3177,17 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
|
||||
DEVICE_MUTEX
|
||||
ret = get_dev_value_int(mem_type_file, dv_ind, used);
|
||||
|
||||
// Fallback to KFD reported memory if no VRAM
|
||||
if (mem_type == RSMI_MEM_TYPE_VRAM && *used == 0) {
|
||||
GET_DEV_AND_KFDNODE_FROM_INDX
|
||||
uint64_t total = 0;
|
||||
ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total);
|
||||
if (total != 0) return ret; // do not need to fallback
|
||||
if ( kfd_node->get_used_memory(used) == 0 ) {
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
|
||||
@@ -59,6 +59,7 @@
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <cstring>
|
||||
#include <type_traits>
|
||||
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
@@ -85,6 +86,7 @@ static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
|
||||
static const char *kDevDevProdNameFName = "product_name";
|
||||
static const char *kDevDevProdNumFName = "product_number";
|
||||
static const char *kDevDevIDFName = "device";
|
||||
static const char *kDevDevRevIDFName = "revision";
|
||||
static const char *kDevVendorIDFName = "vendor";
|
||||
static const char *kDevSubSysDevIDFName = "subsystem_device";
|
||||
static const char *kDevSubSysVendorIDFName = "subsystem_vendor";
|
||||
@@ -238,6 +240,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
|
||||
{kDevDevProdName, kDevDevProdNameFName},
|
||||
{kDevDevProdNum, kDevDevProdNumFName},
|
||||
{kDevDevID, kDevDevIDFName},
|
||||
{kDevDevRevID, kDevDevRevIDFName},
|
||||
{kDevVendorID, kDevVendorIDFName},
|
||||
{kDevSubSysDevID, kDevSubSysDevIDFName},
|
||||
{kDevSubSysVendorID, kDevSubSysVendorIDFName},
|
||||
@@ -374,12 +377,13 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
|
||||
// Functions with only mandatory dependencies
|
||||
{"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}},
|
||||
{"rsmi_dev_id_get", {{kDevDevIDFName}, {}}},
|
||||
{"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}},
|
||||
{"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}},
|
||||
|
||||
{"rsmi_dev_name_get", {{kDevVendorIDFName,
|
||||
kDevDevIDFName}, {}}},
|
||||
{"rsmi_dev_sku_get", {{kDevDevProdNumFName}, {}}},
|
||||
{"rsmi_dev_brand_get", {{kDevVendorIDFName}, {}}},
|
||||
{"rsmi_dev_brand_get", {{kDevVendorIDFName,
|
||||
kDevVBiosVerFName}, {}}},
|
||||
{"rsmi_dev_vendor_name_get", {{kDevVendorIDFName}, {}}},
|
||||
{"rsmi_dev_serial_number_get", {{kDevSerialNumberFName}, {}}},
|
||||
{"rsmi_dev_subsystem_id_get", {{kDevSubSysDevIDFName}, {}}},
|
||||
@@ -823,7 +827,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
}
|
||||
ss << "Successfully read DevInfoBinary for DevInfoType ("
|
||||
<< RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS ("
|
||||
<< sysfs_path << "), returning binaryData = " << p_binary_data;
|
||||
<< sysfs_path << "), returning binaryData = " << p_binary_data
|
||||
<< "; byte_size = " << std::dec << static_cast<int>(b_size);
|
||||
|
||||
std::string metricDescription = "AMD SMI GPU METRICS (16-byte width), "
|
||||
+ sysfs_path;
|
||||
logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16);
|
||||
LOG_INFO(ss);
|
||||
return 0;
|
||||
}
|
||||
@@ -888,6 +897,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
|
||||
switch (type) {
|
||||
case kDevDevID:
|
||||
case kDevDevRevID:
|
||||
case kDevSubSysDevID:
|
||||
case kDevSubSysVendorID:
|
||||
case kDevVendorID:
|
||||
@@ -1025,6 +1035,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
|
||||
case kDevDevProdName:
|
||||
case kDevDevProdNum:
|
||||
case kDevDevID:
|
||||
case kDevDevRevID:
|
||||
case kDevSubSysDevID:
|
||||
case kDevSubSysVendorID:
|
||||
case kDevVendorID:
|
||||
@@ -1375,6 +1386,7 @@ std::string Device::readBootPartitionState<rsmi_nps_mode_type_t>(
|
||||
return boot_state;
|
||||
}
|
||||
|
||||
|
||||
#undef RET_IF_NONZERO
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -61,6 +61,10 @@
|
||||
#include "rocm_smi/rocm_smi_monitor.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
using namespace ROCmLogging;
|
||||
using namespace amd::smi;
|
||||
|
||||
#define TRY try {
|
||||
#define CATCH } catch (...) {return amd::smi::handleException();}
|
||||
@@ -140,6 +144,196 @@ typedef struct {
|
||||
|
||||
} rsmi_gpu_metrics_v_1_3;
|
||||
|
||||
|
||||
// log current gpu_metrics file content read
|
||||
// any metrics value can be a nullptr
|
||||
void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
|
||||
const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2,
|
||||
const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3,
|
||||
const rsmi_gpu_metrics_t *rsmi_gpu_metrics) {
|
||||
if (RocmSMI::getInstance().isLoggingOn() == false) {
|
||||
return;
|
||||
}
|
||||
std::ostringstream ss;
|
||||
if (gpu_metrics_table_header != nullptr) {
|
||||
ss
|
||||
/* Common Header */
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_table_header->structure_size,
|
||||
"gpu_metrics_table_header->structure_size")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_table_header->format_revision,
|
||||
"gpu_metrics_table_header->format_revision")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_table_header->content_revision,
|
||||
"gpu_metrics_table_header->content_revision");
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
if (rsmi_gpu_metrics == nullptr) {
|
||||
return;
|
||||
} else {
|
||||
// do nothing - continue
|
||||
}
|
||||
ss
|
||||
/* Common Header */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->common_header.structure_size,
|
||||
"rsmi_gpu_metrics->common_header.structure_size")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->common_header.format_revision,
|
||||
"rsmi_gpu_metrics->common_header.format_revision")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->common_header.content_revision,
|
||||
"rsmi_gpu_metrics->common_header.content_revision")
|
||||
/* Temperature */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_edge,
|
||||
"rsmi_gpu_metrics->temperature_edge")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_hotspot,
|
||||
"rsmi_gpu_metrics->temperature_hotspot")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_mem,
|
||||
"rsmi_gpu_metrics->temperature_mem")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_vrgfx,
|
||||
"rsmi_gpu_metrics->temperature_vrgfx")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_vrsoc,
|
||||
"rsmi_gpu_metrics->temperature_vrsoc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_vrmem,
|
||||
"rsmi_gpu_metrics->temperature_vrmem")
|
||||
/* Utilization */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_gfx_activity,
|
||||
"rsmi_gpu_metrics->average_gfx_activity")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_umc_activity,
|
||||
"rsmi_gpu_metrics->average_umc_activity")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_mm_activity,
|
||||
"rsmi_gpu_metrics->average_mm_activity")
|
||||
/* Power/Energy */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_socket_power,
|
||||
"rsmi_gpu_metrics->average_socket_power")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->energy_accumulator,
|
||||
"rsmi_gpu_metrics->energy_accumulator")
|
||||
/* Driver attached timestamp (in ns) */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->system_clock_counter,
|
||||
"rsmi_gpu_metrics->system_clock_counter")
|
||||
/* Average clocks */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_gfxclk_frequency,
|
||||
"rsmi_gpu_metrics->average_gfxclk_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_socclk_frequency,
|
||||
"rsmi_gpu_metrics->average_socclk_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_uclk_frequency,
|
||||
"rsmi_gpu_metrics->average_uclk_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_vclk0_frequency,
|
||||
"rsmi_gpu_metrics->average_vclk0_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_dclk0_frequency,
|
||||
"rsmi_gpu_metrics->average_dclk0_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_vclk1_frequency,
|
||||
"rsmi_gpu_metrics->average_vclk1_frequency")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->average_dclk1_frequency,
|
||||
"rsmi_gpu_metrics->average_dclk1_frequency")
|
||||
/* Current clocks */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_gfxclk,
|
||||
"rsmi_gpu_metrics->current_gfxclk")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_socclk,
|
||||
"rsmi_gpu_metrics->current_socclk")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_uclk,
|
||||
"rsmi_gpu_metrics->current_uclk")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_vclk0,
|
||||
"rsmi_gpu_metrics->current_vclk0")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_dclk0,
|
||||
"rsmi_gpu_metrics->current_dclk0")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_vclk1,
|
||||
"rsmi_gpu_metrics->current_vclk1")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_dclk1,
|
||||
"rsmi_gpu_metrics->current_dclk1")
|
||||
/* Throttle status */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->throttle_status,
|
||||
"rsmi_gpu_metrics->throttle_status")
|
||||
/* Fans */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->current_fan_speed,
|
||||
"rsmi_gpu_metrics->current_fan_speed")
|
||||
/* Link width/speed */
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->pcie_link_width,
|
||||
"rsmi_gpu_metrics->pcie_link_width")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->pcie_link_speed,
|
||||
"rsmi_gpu_metrics->pcie_link_speed")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->padding,
|
||||
"rsmi_gpu_metrics->padding")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->gfx_activity_acc,
|
||||
"rsmi_gpu_metrics->gfx_activity_acc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->mem_actvity_acc,
|
||||
"rsmi_gpu_metrics->mem_actvity_acc");
|
||||
for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) {
|
||||
ss << print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_hbm[i],
|
||||
"rsmi_gpu_metrics->temperature_hbm[" + std::to_string(i) + "]");
|
||||
}
|
||||
|
||||
if (rsmi_gpu_metrics_v_1_2 != nullptr) {
|
||||
/* PMFW attached timestamp (10ns resolution) */
|
||||
ss
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics_v_1_2->firmware_timestamp,
|
||||
"rsmi_gpu_metrics_v_1_2->firmware_timestamp");
|
||||
}
|
||||
|
||||
if (gpu_metrics_v_1_3 != nullptr) {
|
||||
/* PMFW attached timestamp (10ns resolution) */
|
||||
ss
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->firmware_timestamp,
|
||||
"gpu_metrics_v_1_3->firmware_timestamp")
|
||||
/* Voltage (mV) */
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->voltage_soc,
|
||||
"gpu_metrics_v_1_3->voltage_soc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->gfx_voltage,
|
||||
"gpu_metrics_v_1_3->voltage_gfx")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->mem_voltage,
|
||||
"gpu_metrics_v_1_3->voltage_mem")
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->padding1,
|
||||
"gpu_metrics_v_1_3->padding1")
|
||||
/* Throttle status (ASIC independent) */
|
||||
<< print_unsigned_hex_and_int(
|
||||
gpu_metrics_v_1_3->indep_throttle_status,
|
||||
"gpu_metrics_v_1_3->indep_throttle_status");
|
||||
}
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
|
||||
rsmi_gpu_metrics_t *data, uint8_t content_v) {
|
||||
assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 &&
|
||||
@@ -269,16 +463,28 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
|
||||
rsmi_gpu_metrics_v_1_3 smu_v_1_3;
|
||||
rsmi_status_t ret;
|
||||
|
||||
std::ostringstream ss;
|
||||
if (!dev->gpu_metrics_ver().structure_size) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver());
|
||||
log_gpu_metrics(&dev->gpu_metrics_ver(), nullptr, nullptr, nullptr);
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << "Returning = " << getRSMIStatusString(ret)
|
||||
<< ",\ndev->gpu_metrics_ver().structure_size = "
|
||||
<< print_unsigned_int(dev->gpu_metrics_ver().structure_size)
|
||||
<< ", could not read common header";
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
// only supports gpu_metrics_v1_x version
|
||||
if (dev->gpu_metrics_ver().format_revision != 1) {
|
||||
ss << "Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED)
|
||||
<< ",\ndev->gpu_metrics_ver().format_revision = "
|
||||
<< print_unsigned_int(dev->gpu_metrics_ver().format_revision)
|
||||
<< " was not equal to 1";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@@ -290,19 +496,31 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER_1) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(rsmi_gpu_metrics_t), smu);
|
||||
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_1";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, nullptr, nullptr, smu);
|
||||
} else if (dev->gpu_metrics_ver().content_revision ==
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER_2) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2);
|
||||
map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu);
|
||||
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_2";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, &smu_v_1_2, nullptr, smu);
|
||||
} else if (dev->gpu_metrics_ver().content_revision ==
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER_3) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3);
|
||||
map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu);
|
||||
ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_3";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, nullptr, &smu_v_1_3, smu);
|
||||
} else {
|
||||
ret = GetGPUMetricsFormat1(dv_ind, smu,
|
||||
dev->gpu_metrics_ver().content_revision);
|
||||
ss << __PRETTY_FUNCTION__ << " | GetGPUMetricsFormat1";
|
||||
LOG_DEBUG(ss);
|
||||
log_gpu_metrics(nullptr, nullptr, nullptr, smu);
|
||||
}
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
|
||||
@@ -43,6 +43,9 @@
|
||||
|
||||
#include <assert.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <dirent.h>
|
||||
|
||||
#include <algorithm>
|
||||
@@ -770,6 +773,95 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth,
|
||||
|
||||
return 0;
|
||||
}
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties
|
||||
// size_in_bytes 68702699520
|
||||
int KFDNode::get_total_memory(uint64_t* total) {
|
||||
if (total == nullptr) return EINVAL;
|
||||
*total = 0;
|
||||
|
||||
std::string f_path = kKFDNodesPathRoot;
|
||||
f_path += "/";
|
||||
f_path += std::to_string(node_indx_);
|
||||
f_path += "/mem_banks";
|
||||
|
||||
auto kfd_node_dir = opendir(f_path.c_str());
|
||||
if (kfd_node_dir == nullptr) {
|
||||
return errno;
|
||||
}
|
||||
auto dentry = readdir(kfd_node_dir);
|
||||
while (dentry != nullptr) {
|
||||
if (dentry->d_name[0] == '.') {
|
||||
dentry = readdir(kfd_node_dir);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!is_number(dentry->d_name)) {
|
||||
dentry = readdir(kfd_node_dir);
|
||||
continue;
|
||||
}
|
||||
|
||||
// read "size_in_bytes 68702699520" line
|
||||
const std::string size_in_bytes_property = "size_in_bytes ";
|
||||
std::string memory_bank_file = f_path + "/"
|
||||
+ dentry->d_name + "/properties";
|
||||
std::ifstream fs(memory_bank_file);
|
||||
if (!fs) {
|
||||
dentry = readdir(kfd_node_dir);
|
||||
continue;
|
||||
}
|
||||
std::string line;
|
||||
while (std::getline(fs, line)) {
|
||||
if (line.substr(0, size_in_bytes_property.length())
|
||||
== size_in_bytes_property) {
|
||||
auto bytes = line.substr(size_in_bytes_property.length());
|
||||
try {
|
||||
*total += std::stol(bytes);
|
||||
break;
|
||||
} catch(...) {
|
||||
dentry = readdir(kfd_node_dir);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} // end loop for lines in property file
|
||||
} // end loop for mem_bank directory
|
||||
|
||||
if (closedir(kfd_node_dir)) {
|
||||
std::string err_str = "Failed to close KFD node directory ";
|
||||
err_str += f_path;
|
||||
err_str += ".";
|
||||
perror(err_str.c_str());
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ioctl on kfd node device
|
||||
int KFDNode::get_used_memory(uint64_t* used) {
|
||||
if (used == nullptr) return EINVAL;
|
||||
static const char *kPathKFDIoctl = "/dev/kfd";
|
||||
|
||||
int kfd_fd = open(kPathKFDIoctl, O_RDWR | O_CLOEXEC);
|
||||
if (kfd_fd <= 0) {
|
||||
return 1;
|
||||
}
|
||||
struct kfd_ioctl_get_available_memory_args mem = {0, 0, 0};
|
||||
mem.gpu_id = gpu_id_;
|
||||
if (ioctl(kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY , &mem) != 0) {
|
||||
close(kfd_fd);
|
||||
return 1;
|
||||
}
|
||||
close(kfd_fd);
|
||||
|
||||
// used = total - available
|
||||
uint64_t total = 0;
|
||||
int ret = get_total_memory(&total);
|
||||
if (ret == 0 && total > 0 && mem.available < total) {
|
||||
*used = total - mem.available;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -177,6 +177,9 @@ void Logger::error(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if (m_LogType == CONSOLE) {
|
||||
logOnConsole(data);
|
||||
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,6 +211,9 @@ void Logger::alarm(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if (m_LogType == CONSOLE) {
|
||||
logOnConsole(data);
|
||||
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,6 +245,9 @@ void Logger::always(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if (m_LogType == CONSOLE) {
|
||||
logOnConsole(data);
|
||||
} else if (m_LogType == BOTH_FILE_AND_CONSOLE) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,6 +312,10 @@ void Logger::info(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_INFO)) {
|
||||
logOnConsole(data);
|
||||
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
|
||||
&& (m_LogLevel >= LOG_LEVEL_INFO)) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -333,6 +346,10 @@ void Logger::trace(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_TRACE)) {
|
||||
logOnConsole(data);
|
||||
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
|
||||
&& (m_LogLevel >= LOG_LEVEL_TRACE)) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -363,6 +380,10 @@ void Logger::debug(const char* text) throw() {
|
||||
logIntoFile(data);
|
||||
} else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_DEBUG)) {
|
||||
logOnConsole(data);
|
||||
} else if ((m_LogType == BOTH_FILE_AND_CONSOLE)
|
||||
&& (m_LogLevel >= LOG_LEVEL_DEBUG)) {
|
||||
logOnConsole(data);
|
||||
logIntoFile(data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -424,6 +445,9 @@ std::string Logger::getLogSettings() {
|
||||
case CONSOLE:
|
||||
logSettings += "LogType = CONSOLE";
|
||||
break;
|
||||
case BOTH_FILE_AND_CONSOLE:
|
||||
logSettings += "LogType = BOTH_FILE_AND_CONSOLE";
|
||||
break;
|
||||
default:
|
||||
logSettings += "LogType = <undefined>";
|
||||
}
|
||||
@@ -471,7 +495,26 @@ void Logger::initialize_resources() {
|
||||
}
|
||||
m_File.open(logFileName.c_str(), std::ios::out | std::ios::app);
|
||||
m_LogLevel = LOG_LEVEL_TRACE;
|
||||
m_LogType = FILE_LOG;
|
||||
// RSMI_LOGGING = 1, output to logs only
|
||||
// RSMI_LOGGING = 2, output to console only
|
||||
// RSMI_LOGGING = 3, output to logs and console
|
||||
switch (amd::smi::RocmSMI::getInstance().getLogSetting()) {
|
||||
case 0:
|
||||
m_LogType = NO_LOG;
|
||||
break;
|
||||
case 1:
|
||||
m_LogType = FILE_LOG;
|
||||
break;
|
||||
case 2:
|
||||
m_LogType = CONSOLE;
|
||||
break;
|
||||
case 3:
|
||||
m_LogType = BOTH_FILE_AND_CONSOLE;
|
||||
break;
|
||||
default:
|
||||
m_LogType = NO_LOG;
|
||||
break;
|
||||
}
|
||||
if (!m_File.is_open()) {
|
||||
std::cout << "WARNING: Issue opening log file (" << logFileName
|
||||
<< ") to write." << std::endl;
|
||||
|
||||
@@ -84,6 +84,7 @@ amd::smi::RocmSMI::devInfoTypesStrings = {
|
||||
{amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"},
|
||||
{amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"},
|
||||
{amd::smi::kDevDevID, amdSMI + "kDevDevID"},
|
||||
{amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"},
|
||||
{amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"},
|
||||
{amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"},
|
||||
{amd::smi::kDevVendorID, amdSMI + "kDevVendorID"},
|
||||
@@ -169,6 +170,7 @@ static uint32_t GetDeviceIndex(const std::string s) {
|
||||
// computed for cardX.
|
||||
// On success, return drm_minor which is >= 128 otherwise return 0
|
||||
static uint32_t GetDrmRenderMinor(const std::string s) {
|
||||
std::ostringstream ss;
|
||||
std::string drm_path = s;
|
||||
int drm_minor = 0;
|
||||
const std::string render_file_prefix = "renderD";
|
||||
@@ -194,6 +196,10 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
|
||||
if (closedir(drm_dir)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | Discovered drmRenderMinor = "
|
||||
<< std::to_string(drm_minor) << " | For drm_path = " << drm_path << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return static_cast<uint32_t>(drm_minor);
|
||||
}
|
||||
|
||||
@@ -376,11 +382,15 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
|
||||
// Remove any drm nodes that don't have a corresponding readable kfd node.
|
||||
// kfd nodes will not be added if their properties file is not readable.
|
||||
std::ostringstream ss;
|
||||
auto dev_iter = devices_.begin();
|
||||
while (dev_iter != devices_.end()) {
|
||||
uint64_t bdfid = (*dev_iter)->bdfid();
|
||||
if (tmp_map.find(bdfid) == tmp_map.end()) {
|
||||
ss << __PRETTY_FUNCTION__ << " | removing device = "
|
||||
<< (*dev_iter)->path();
|
||||
dev_iter = devices_.erase(dev_iter);
|
||||
LOG_DEBUG(ss);
|
||||
continue;
|
||||
}
|
||||
dev_iter++;
|
||||
@@ -410,6 +420,9 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
}
|
||||
// Leaving below to help debug temp file issues
|
||||
// displayAppTmpFilesContent();
|
||||
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
|
||||
ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList;
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -457,17 +470,21 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) {
|
||||
|
||||
// provides a way to get env variable detail in both debug & release
|
||||
// helps enable full logging
|
||||
static bool getRSMIEnvVar_LoggingEnabled(const char *ev_str) {
|
||||
bool isLoggingEnabled = false;
|
||||
// RSMI_LOGGING = 1, output to logs only
|
||||
// RSMI_LOGGING = 2, output to console only
|
||||
// RSMI_LOGGING = 3, output to logs and console
|
||||
static uint32_t getRSMIEnvVar_LoggingEnabled(const char *ev_str) {
|
||||
uint32_t ret = 0;
|
||||
ev_str = getenv(ev_str);
|
||||
|
||||
if (ev_str != nullptr) {
|
||||
isLoggingEnabled = true;
|
||||
int ev_ret = atoi(ev_str);
|
||||
ret = static_cast<uint32_t>(ev_ret);
|
||||
}
|
||||
return isLoggingEnabled;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static std::unordered_set<uint32_t> GetEnvVarUIntegerSets(const char *ev_str) {
|
||||
static inline std::unordered_set<uint32_t> GetEnvVarUIntegerSets(
|
||||
const char *ev_str) {
|
||||
std::unordered_set<uint32_t> returnSet;
|
||||
#ifndef DEBUG
|
||||
(void)ev_str;
|
||||
@@ -518,7 +535,16 @@ const RocmSMI_env_vars& RocmSMI::getEnv(void) {
|
||||
}
|
||||
|
||||
bool RocmSMI::isLoggingOn(void) {
|
||||
bool isLoggingOn = false;
|
||||
GetEnvVariables();
|
||||
if (this->env_vars_.logging_on > 0
|
||||
&& this->env_vars_.logging_on <= 3) {
|
||||
isLoggingOn = true;
|
||||
}
|
||||
return isLoggingOn;
|
||||
}
|
||||
|
||||
uint32_t RocmSMI::getLogSetting() {
|
||||
return this->env_vars_.logging_on;
|
||||
}
|
||||
|
||||
@@ -543,7 +569,9 @@ void RocmSMI::printEnvVarInfo(void) {
|
||||
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_inf_loop))
|
||||
<< std::endl;
|
||||
bool isLoggingOn = (env_vars_.logging_on) ? true : false;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
<< getLogSetting() << std::endl;
|
||||
bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
<< (isLoggingOn ? "true" : "false") << std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
|
||||
@@ -630,6 +658,9 @@ RocmSMI::FindMonitor(std::string monitor_path) {
|
||||
}
|
||||
void
|
||||
RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
auto dev_path = std::string(kPathDRMRoot);
|
||||
dev_path += "/";
|
||||
dev_path += dev_name;
|
||||
@@ -646,6 +677,10 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
|
||||
|
||||
devices_.push_back(dev);
|
||||
ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = "
|
||||
<< dev_name << " | path = " << dev_path
|
||||
<< " | card index = " << std::to_string(card_indx) << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return;
|
||||
}
|
||||
@@ -653,16 +688,26 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
static const uint32_t kAmdGpuId = 0x1002;
|
||||
|
||||
static bool isAMDGPU(std::string dev_path) {
|
||||
bool isAmdGpu = false;
|
||||
std::ostringstream ss;
|
||||
std::string vend_path = dev_path + "/device/vendor";
|
||||
if (!FileExists(vend_path.c_str())) {
|
||||
return false;
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
|
||||
std::ifstream fs;
|
||||
fs.open(vend_path);
|
||||
|
||||
if (!fs.is_open()) {
|
||||
return false;
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
|
||||
uint32_t vendor_id;
|
||||
@@ -672,9 +717,13 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
fs.close();
|
||||
|
||||
if (vendor_id == kAmdGpuId) {
|
||||
return true;
|
||||
isAmdGpu = true;
|
||||
}
|
||||
return false;
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
|
||||
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
|
||||
@@ -313,6 +313,7 @@ int Monitor::writeMonitor(MonitorTypes type, uint32_t sensor_id,
|
||||
// This string version should work for all valid monitor types
|
||||
int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
|
||||
std::string *val) {
|
||||
std::ostringstream ss;
|
||||
assert(val != nullptr);
|
||||
|
||||
std::string temp_str;
|
||||
@@ -320,11 +321,21 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id,
|
||||
|
||||
DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr)
|
||||
int ret = ReadSysfsStr(sysfs_path, val);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Success | Read hwmon file: " << sysfs_path
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Sensor id: " << std::to_string(sensor_id)
|
||||
<< " | Data: " << *val
|
||||
<< " | Returning: " << std::to_string(ret) << " |";
|
||||
LOG_INFO(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int32_t
|
||||
Monitor::setTempSensorLabelMap(void) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
std::string type_str;
|
||||
int ret;
|
||||
|
||||
|
||||
@@ -52,11 +52,14 @@
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <cstdint>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
#include <iomanip>
|
||||
#include <type_traits>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
@@ -103,7 +106,7 @@ bool FileExists(char const *filename) {
|
||||
return (stat(filename, &buf) == 0);
|
||||
}
|
||||
|
||||
static void debugFilesDiscovered(std::vector<std::string> files) {
|
||||
static inline void debugFilesDiscovered(std::vector<std::string> files) {
|
||||
std::ostringstream ss;
|
||||
int numberOfFilesFound = static_cast<int>(files.size());
|
||||
ss << "fileName.size() = " << numberOfFilesFound
|
||||
@@ -204,9 +207,10 @@ int ReadSysfsStr(std::string path, std::string *retStr) {
|
||||
if (!fs.is_open()) {
|
||||
ret = errno;
|
||||
errno = 0;
|
||||
oss << "Could not read SYSFS file (" << path << ")"
|
||||
<< ", returning " << std::to_string(ret) << " ("
|
||||
<< std::strerror(ret) << ")";
|
||||
oss << __PRETTY_FUNCTION__
|
||||
<< " | Fail | Cause: file does not exist or permissions issue"
|
||||
<< " | SYSFS file: " << path
|
||||
<< " | Returning: " << std::strerror(ret) << " |";
|
||||
LOG_ERROR(oss);
|
||||
return ret;
|
||||
}
|
||||
@@ -457,9 +461,13 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
|
||||
}
|
||||
|
||||
chmod(fileName, S_IRUSR|S_IRGRP|S_IROTH);
|
||||
write(fd, storageData.c_str(), storageData.size());
|
||||
ssize_t rc_write = write(fd, storageData.c_str(), storageData.size());
|
||||
close(fd);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
if (rc_write == -1) {
|
||||
return RSMI_STATUS_FILE_ERROR;
|
||||
} else {
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> getListOfAppTmpFiles() {
|
||||
@@ -531,19 +539,39 @@ void displayAppTmpFilesContent() {
|
||||
}
|
||||
|
||||
// Used to debug vector string list and their content
|
||||
void displayVectorContent(std::vector<std::string> v) {
|
||||
std::cout << "Vector = {";
|
||||
std::string debugVectorContent(std::vector<std::string> v) {
|
||||
std::ostringstream ss;
|
||||
ss << "Vector = {";
|
||||
if (v.size() > 0) {
|
||||
for (auto it=v.begin(); it < v.end(); it++) {
|
||||
std::cout << *it;
|
||||
ss << *it;
|
||||
auto temp_it = it;
|
||||
if(++temp_it != v.end()) {
|
||||
std::cout << ", ";
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
std::cout << "}" << std::endl;
|
||||
}
|
||||
ss << "}" << std::endl;
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Used to debug vector string list and their content
|
||||
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
|
||||
std::ostringstream ss;
|
||||
ss << "Vector = {";
|
||||
if (v.size() > 0) {
|
||||
for (auto it=v.begin(); it < v.end(); it++) {
|
||||
ss << (*it)->path();
|
||||
auto temp_it = it;
|
||||
if(++temp_it != v.end()) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
}
|
||||
ss << "}" << std::endl;
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Attempts to read application specific temporary file
|
||||
@@ -595,14 +623,20 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
|
||||
// string domainName = domain name of the the system's node on the network
|
||||
// string os_distribution = pretty name of os distribution
|
||||
// (typically found in /etc/*-release file)
|
||||
// string endianness = system's endianness.
|
||||
// Expressed as big endian or little endian.
|
||||
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
|
||||
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string> getSystemDetails(void) {
|
||||
std::string, std::string, std::string, std::string>
|
||||
getSystemDetails(void) {
|
||||
struct utsname buf;
|
||||
bool errorDetected = false;
|
||||
std::string temp_data;
|
||||
std::string sysname, nodename, release, version, machine;
|
||||
std::string domainName = "<undefined>";
|
||||
std::string os_distribution = "<undefined>";
|
||||
std::string endianness = "<undefined>";
|
||||
|
||||
if (uname(&buf) < 0) {
|
||||
errorDetected = true;
|
||||
@@ -630,8 +664,16 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
}
|
||||
}
|
||||
}
|
||||
if (isSystemBigEndian()) {
|
||||
endianness = "Big Endian, multi-bit symbols encoded as"
|
||||
" big endian (MSB first)";
|
||||
} else {
|
||||
endianness = "Little Endian, multi-bit symbols encoded as"
|
||||
" little endian (LSB first)";
|
||||
}
|
||||
return std::make_tuple(errorDetected, sysname, nodename, release,
|
||||
version, machine, domainName, os_distribution);
|
||||
version, machine, domainName, os_distribution,
|
||||
endianness);
|
||||
}
|
||||
|
||||
// If logging is enabled through RSMI_LOGGING environment variable.
|
||||
@@ -639,9 +681,10 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
void logSystemDetails(void) {
|
||||
std::ostringstream ss;
|
||||
bool errorDetected;
|
||||
std::string sysname, node, release, version, machine, domain, distName;
|
||||
std::string sysname, node, release, version, machine, domain, distName,
|
||||
endianness;
|
||||
std::tie(errorDetected, sysname, node, release, version, machine, domain,
|
||||
distName) = getSystemDetails();
|
||||
distName, endianness) = getSystemDetails();
|
||||
if (errorDetected == false) {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
<< "SYSTEM NAME: " << sysname << "\n"
|
||||
@@ -650,7 +693,8 @@ void logSystemDetails(void) {
|
||||
<< "RELEASE: " << release << "\n"
|
||||
<< "VERSION: " << version << "\n"
|
||||
<< "MACHINE TYPE: " << machine << "\n"
|
||||
<< "DOMAIN: " << domain << "\n";
|
||||
<< "DOMAIN: " << domain << "\n"
|
||||
<< "ENDIANNESS: " << endianness << "\n";
|
||||
LOG_INFO(ss);
|
||||
} else {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
@@ -659,5 +703,116 @@ void logSystemDetails(void) {
|
||||
}
|
||||
}
|
||||
|
||||
// Usage:
|
||||
// logHexDump(desc, addr, len, bytesPerLine);
|
||||
// desc: if non-NULL, printed as a description before hex dump.
|
||||
// addr: the address to start dumping from.
|
||||
// len: the number of bytes to dump.
|
||||
// bytesPerLine: number of bytes on each output line.
|
||||
void logHexDump(
|
||||
const char *desc, const void *addr, const size_t len, size_t bytesPerLine) {
|
||||
// UNCOMMENT: printf lines if you want to see directly to stdout
|
||||
std::ostringstream ss;
|
||||
// Silently ignore per-line values.
|
||||
if (bytesPerLine < 4 || bytesPerLine > 64) bytesPerLine = 16;
|
||||
|
||||
size_t i;
|
||||
unsigned char buff[bytesPerLine + 1];
|
||||
const unsigned char *pc // ptr to data (char, 1 byte sized data)
|
||||
= (const unsigned char *) addr;
|
||||
|
||||
// Output description if given.
|
||||
// if (desc != NULL) printf("%s:\n", desc);
|
||||
if (desc != NULL) ss << "\n" << desc << "\n";
|
||||
|
||||
// Length checks.
|
||||
if (len == 0) {
|
||||
// printf(" ZERO LENGTH\n");
|
||||
ss << " ZERO LENGTH\n";
|
||||
LOG_ERROR(ss);
|
||||
return;
|
||||
}
|
||||
std::string endianness = "<undefined>";
|
||||
if (isSystemBigEndian()) {
|
||||
endianness = "** System is Big Endian, multi-bit symbols encoded as"
|
||||
" big endian (MSB first) **";
|
||||
} else {
|
||||
endianness = "** System is Little Endian, multi-bit symbols encoded as"
|
||||
" little endian (LSB first) **";
|
||||
}
|
||||
ss << "\t" << endianness << "\n";
|
||||
|
||||
// Process every byte in the data.
|
||||
for (i = 0; i < len; i++) {
|
||||
// Multiple of bytesPerLine means new or first line (with line offset).
|
||||
if ((i % bytesPerLine) == 0) {
|
||||
// Only print previous-line ASCII buffer for lines beyond first.
|
||||
// if (i != 0) printf(" %s\n", buff);
|
||||
if (i != 0) ss << " " << buff << "\n";
|
||||
// Output the offset of current line.
|
||||
// printf(" %08lx ", i);
|
||||
ss << " " << std::setw(8) << std::setfill('0') << std::hex << i << " ";
|
||||
}
|
||||
|
||||
// Now the hex code for the specific character.
|
||||
// printf(" %02x", pc[i]);
|
||||
|
||||
ss << " " << std::setw(2) << std::setfill('0') << std::hex
|
||||
<< static_cast<unsigned>(pc[i]);
|
||||
|
||||
// And buffer a printable ASCII character for later.
|
||||
// x20 = 32 || x7e = 126 (ascii table range)
|
||||
if ((pc[i] < 0x20) || (pc[i] > 0x7e)) { // isprint() may be better.
|
||||
buff[i % bytesPerLine] = '.';
|
||||
} else {
|
||||
buff[i % bytesPerLine] = pc[i];
|
||||
}
|
||||
buff[(i % bytesPerLine) + 1] = '\0';
|
||||
}
|
||||
|
||||
// Pad out last line if not exactly bytesPerLine characters.
|
||||
while ((i % bytesPerLine) != 0) {
|
||||
// printf(" ");
|
||||
ss << " ";
|
||||
i++;
|
||||
}
|
||||
|
||||
// And print the final ASCII buffer.
|
||||
// printf(" %s\n", buff);
|
||||
ss << " " << buff << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
// Report the byte order of the host by inspecting the first byte in memory
// of an int holding the value 1: on a little-endian host that byte is 1.
// Returns true for big endian, false for little endian.
bool isSystemBigEndian() {
  const int probe = 1;
  const auto *lowest_address_byte = reinterpret_cast<const char *>(&probe);
  return *lowest_address_byte != 1;
}
|
||||
|
||||
/**
 * @brief Format the bus/device/function part of a packed BDF id as text.
 *
 * The fields are read from @p bdf_id as: bus = bits 15:8, device = bits 7:3,
 * function = bits 2:0. The result is written as lowercase hex in the form
 * "bb:dd.f" (bus and device zero-padded to two digits).
 *
 * @param bdf_id   Packed BDF identifier.
 * @param bfd_str  Receives the formatted string; cleared on entry and left
 *                 empty when no string is produced.
 * @return RSMI_STATUS_SUCCESS on success, RSMI_STATUS_NO_DATA when the bus
 *         field is 0.
 */
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
{
    const auto bus_id = static_cast<uint8_t>((bdf_id & 0x0000FF00) >> 8);
    const auto dev_id = static_cast<uint8_t>((bdf_id & 0x000000F8) >> 3);
    // The function number occupies three bits (0-7); a 0x3 mask would drop
    // its top bit and misreport functions 4-7.
    const auto func_id = static_cast<uint8_t>(bdf_id & 0x00000007);

    bfd_str = std::string();
    // NOTE(review): a bus field of 0 is treated as "no data" here; confirm
    // that callers never need to format a device that sits on bus 0.
    if (!(bus_id > 0)) {
        return rsmi_status_t::RSMI_STATUS_NO_DATA;
    }

    // Unary '+' promotes the uint8_t fields so they stream as numbers rather
    // than as characters.
    std::stringstream bdf_sstream;
    bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +bus_id << ":";
    bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +dev_id << ".";
    bdf_sstream << std::hex << std::setfill('0') << +func_id;
    bfd_str = bdf_sstream.str();
    return rsmi_status_t::RSMI_STATUS_SUCCESS;
}
|
||||
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -578,6 +578,11 @@ amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle,
|
||||
return rsmi_wrapper(rsmi_dev_id_get, processor_handle, id);
|
||||
}
|
||||
|
||||
// Retrieve the revision value for a processor by forwarding the handle and
// the output pointer to rsmi_dev_revision_get through rsmi_wrapper; the
// wrapper's status is returned unchanged.
amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle,
                                        uint16_t *revision) {
    const amdsmi_status_t status =
        rsmi_wrapper(rsmi_dev_revision_get, processor_handle, revision);
    return status;
}
|
||||
|
||||
// TODO(bliu) : add fw info from libdrm
|
||||
amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle,
|
||||
amdsmi_fw_info_t *info) {
|
||||
|
||||
@@ -0,0 +1,560 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2017-2023, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "rocm_smi/rocm_smi_properties.h"
|
||||
#include "rocm_smi/rocm_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
|
||||
|
||||
//
|
||||
// Property reinforcement check list
|
||||
//
|
||||
// NOTE: This is a *temporary solution* until we get a better approach, likely
|
||||
// a driver API that can give us the capabilities of a GPU in question.
|
||||
//
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
|
||||
// Display name for every operating mode a reinforcement rule can carry;
// looked up when the reinforcement table is dumped to the log.
const AMDGpuOpModeList_t amdgpu_opmode_check_list {
    { AMDGpuPropertyOpModeTypes_t::kBareMetal, "Bare Metal" },
    { AMDGpuPropertyOpModeTypes_t::kSrIov,     "SR-IOV" },
    { AMDGpuPropertyOpModeTypes_t::kBoth,      "Both" },
};
|
||||
|
||||
// Display name for every property type offset (the tag that is combined with
// a raw property id to form a unique property id).
const AMDGpuPropertyTypesOffsetList_t amdgpu_typeoffset_check_list {
    { AMDGpuPropertyTypesOffset_t::kNone,            "None" },
    { AMDGpuPropertyTypesOffset_t::kDevInfoTypes,    "Device Info Type" },
    { AMDGpuPropertyTypesOffset_t::kMonitorTypes,    "Monitor Type" },
    { AMDGpuPropertyTypesOffset_t::kPerfTypes,       "Performance Type" },
    { AMDGpuPropertyTypesOffset_t::kClkTypes,        "Clock Type" },
    { AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, "Volt Metric Type" },
};
|
||||
|
||||
|
||||
// Build a unique property id by OR-ing the type offset bits into the raw
// property id, so that equal raw ids from different enums stay distinct.
AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id) {
    const auto offset_bits = static_cast<AMDGpuPropertyOffsetType>(type_offset);
    return (offset_bits | (property_id));
}
|
||||
|
||||
// Inverse of make_unique_property_id(): clear every type offset bit from a
// unique property id and return what is left (the raw property id).
AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) {
    // Union of all type offset bits that make_unique_property_id() may set.
    const auto property_type_offset_mask =
        static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kDevInfoTypes) |
        static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kMonitorTypes) |
        static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kPerfTypes) |
        static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kClkTypes) |
        static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes);

    // Only the id part is needed; the offset part is not extracted because
    // nothing here consumes it.
    return (static_cast<AMDGpuPropertyOffsetType>(property_id) & ~(property_type_offset_mask));
}
|
||||
|
||||
// Bitwise OR of two property type offsets. Identical operands are returned
// as-is; otherwise the underlying values are OR-ed and converted back.
AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs)
{
    return (lhs == rhs)
        ? lhs
        : AMDGpuPropertyTypesOffset_t(static_cast<AMDGpuPropertyOffsetType>(lhs) |
                                      static_cast<AMDGpuPropertyOffsetType>(rhs));
}
|
||||
|
||||
// Bitwise AND of two property type offsets. Identical operands are returned
// as-is; otherwise the underlying values are AND-ed and converted back.
AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs)
{
    return (lhs == rhs)
        ? lhs
        : AMDGpuPropertyTypesOffset_t(static_cast<AMDGpuPropertyOffsetType>(lhs) &
                                      static_cast<AMDGpuPropertyOffsetType>(rhs));
}
|
||||
|
||||
// Bitwise OR of two operating modes. Identical operands are returned as-is;
// otherwise the underlying values are OR-ed and converted back.
AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs)
{
    return (lhs == rhs)
        ? lhs
        : AMDGpuPropertyOpModeTypes_t(static_cast<AMDGpuPropertyOpModeType>(lhs) |
                                      static_cast<AMDGpuPropertyOpModeType>(rhs));
}
|
||||
|
||||
// Bitwise AND of two operating modes. Identical operands are returned as-is;
// otherwise the underlying values are AND-ed and converted back.
AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs)
{
    return (lhs == rhs)
        ? lhs
        : AMDGpuPropertyOpModeTypes_t(static_cast<AMDGpuPropertyOpModeType>(lhs) &
                                      static_cast<AMDGpuPropertyOpModeType>(rhs));
}
|
||||
|
||||
|
||||
//
|
||||
// Note: Due to the fact that we have different enum elements with the same
|
||||
// number, keying a hash by the number is not an option; ie:
|
||||
// - DevInfoTypes::kDevVendorID = 7
|
||||
// - MonitorTypes::kMonPowerCapDefault = 7
|
||||
// So, we are keying it by a unique key, based on their info types
|
||||
//
|
||||
// API name for every verb a reinforcement rule can refer to. The names are
// only used as human-readable text in trace logs, so each one should name
// the API call the verb stands for.
const AMDGpuVerbList_t amdgpu_verb_check_list {
    { AMDGpuVerbTypes_t::kNone, "None" },
    { AMDGpuVerbTypes_t::kSetGpuPciBandwidth, "amdsmi_set_gpu_pci_bandwidth" },
    { AMDGpuVerbTypes_t::kSetPowerCap, "amdsmi_set_power_cap" },
    { AMDGpuVerbTypes_t::kSetGpuPowerProfile, "amdsmi_set_gpu_power_profile" },
    { AMDGpuVerbTypes_t::kSetGpuClkRange, "amdsmi_set_gpu_clk_range" },
    { AMDGpuVerbTypes_t::kSetGpuOdClkInfo, "amdsmi_set_gpu_od_clk_info" },
    { AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, "amdsmi_set_gpu_od_volt_info" },
    { AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, "amdsmi_set_gpu_perf_level_v1" },
    { AMDGpuVerbTypes_t::kSetGpuPerfLevel, "amdsmi_set_gpu_perf_level" },
    { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, "amdsmi_get_gpu_power_profile_presets" },
    { AMDGpuVerbTypes_t::kResetGpu, "amdsmi_reset_gpu" },
    { AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, "amdsmi_set_gpu_perf_determinism_mode" },
    { AMDGpuVerbTypes_t::kSetGpuFanSpeed, "amdsmi_set_gpu_fan_speed" },
    { AMDGpuVerbTypes_t::kResetGpuFan, "amdsmi_reset_gpu_fan" },
    { AMDGpuVerbTypes_t::kSetClkFreq, "amdsmi_set_clk_freq" },
    { AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, "amdsmi_set_gpu_overdrive_level_v1" },
    { AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, "amdsmi_set_gpu_overdrive_level" },
    { AMDGpuVerbTypes_t::kGetGpuFanRpms, "amdsmi_get_gpu_fan_rpms" },
    { AMDGpuVerbTypes_t::kGetGpuFanSpeed, "amdsmi_get_gpu_fan_speed" },
    { AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, "amdsmi_get_gpu_fan_speed_max" },
    // Voltage metric verb: labelled with the voltage API, not the
    // temperature one.
    { AMDGpuVerbTypes_t::kGetGpuVoltMetric, "amdsmi_get_gpu_volt_metric" },
    { AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, "amdsmi_get_gpu_overdrive_level" },
    { AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, "amdsmi_get_gpu_od_volt_info" },
    { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" }
};
|
||||
|
||||
const uint16_t kDevRevIDAll(0xFFFF);
|
||||
const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
|
||||
//
|
||||
// {"Asic ID", {"Asic Rev. ID", "Unique Property ID", "Property Op.Mode", "Availability Flag"}}
|
||||
// DevInfoTypes::kDevPCIEClk = rsmi_dev_pci_bandwidth_get; rsmi_dev_pci_bandwidth_set
|
||||
// MonitorTypes::kMonPowerCapDefault = rsmi_dev_power_cap_default_get;
|
||||
// DevInfoTypes::kDevPowerProfileMode =
|
||||
// rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set;
|
||||
//
|
||||
|
||||
// AMD Instinct MI210
|
||||
{0x740F, {0x02,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevPowerProfileMode),
|
||||
AMDGpuVerbTypes_t::kSetGpuPowerProfile,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
},
|
||||
|
||||
// AMD MIxxx
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevPCIEClk),
|
||||
AMDGpuVerbTypes_t::kSetGpuPciBandwidth,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
|
||||
MonitorTypes::kMonPowerCapDefault),
|
||||
AMDGpuVerbTypes_t::kSetPowerCap,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevPowerProfileMode),
|
||||
AMDGpuVerbTypes_t::kSetGpuPowerProfile,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
|
||||
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
|
||||
AMDGpuVerbTypes_t::kSetGpuClkRange,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
|
||||
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
|
||||
AMDGpuVerbTypes_t::kSetGpuOdClkInfo,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
|
||||
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
|
||||
AMDGpuVerbTypes_t::kSetGpuOdVoltInfo,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
|
||||
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_AUTO),
|
||||
AMDGpuVerbTypes_t::kSetGpuPerfLevelV1,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
|
||||
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL),
|
||||
AMDGpuVerbTypes_t::kSetGpuPerfLevel,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevPowerProfileMode),
|
||||
AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevGpuReset),
|
||||
AMDGpuVerbTypes_t::kResetGpu,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
|
||||
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM),
|
||||
AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
|
||||
MonitorTypes::kMonFanSpeed),
|
||||
AMDGpuVerbTypes_t::kSetGpuFanSpeed,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
|
||||
MonitorTypes::kMonFanCntrlEnable),
|
||||
AMDGpuVerbTypes_t::kResetGpuFan,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kClkTypes,
|
||||
rsmi_clk_type::RSMI_CLK_TYPE_FIRST),
|
||||
AMDGpuVerbTypes_t::kSetClkFreq,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevOverDriveLevel),
|
||||
AMDGpuVerbTypes_t::kSetGpuOverdriveLevel,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevOverDriveLevel),
|
||||
AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
|
||||
MonitorTypes::kMonFanRPMs),
|
||||
AMDGpuVerbTypes_t::kGetGpuFanRpms,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
|
||||
MonitorTypes::kMonFanSpeed),
|
||||
AMDGpuVerbTypes_t::kGetGpuFanSpeed,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
|
||||
MonitorTypes::kMonMaxFanSpeed),
|
||||
AMDGpuVerbTypes_t::kGetGpuFanSpeedMax,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes,
|
||||
rsmi_voltage_metric_t::RSMI_VOLT_CURRENT),
|
||||
AMDGpuVerbTypes_t::kGetGpuVoltMetric,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevOverDriveLevel),
|
||||
AMDGpuVerbTypes_t::kGetGpuOverDriveLevel,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevPowerODVoltage),
|
||||
AMDGpuVerbTypes_t::kGetGpuOdVoltInfo,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevPowerODVoltage),
|
||||
AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions,
|
||||
AMDGpuPropertyOpModeTypes_t::kBareMetal, false }
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
 * @brief Re-interpret a failed API call using the reinforcement table.
 *
 * When @p actual_error_code is RSMI_STATUS_SUCCESS it is returned untouched.
 * Otherwise the reinforcement table is queried for (@p dv_ind, @p verb_type)
 * and the query result is mapped as follows:
 *   - RSMI_STATUS_SUCCESS / RSMI_STATUS_NOT_SUPPORTED: returned as-is
 *     (a rule was found; the property should / should not exist).
 *   - anything else, including RSMI_STATUS_NO_DATA (device id or revision
 *     could not be read) and RSMI_STATUS_UNKNOWN_ERROR (no matching rule):
 *     RSMI_STATUS_NOT_FOUND.
 *
 * @param dv_ind             Device index.
 * @param verb_type          Verb (API call) whose failure is being checked.
 * @param actual_error_code  Status the API call actually produced.
 * @return The status described above.
 */
rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbTypes_t verb_type, rsmi_status_t actual_error_code)
{
    std::ostringstream osstream;
    osstream << __PRETTY_FUNCTION__ << "| ======= start =======";
    osstream << __PRETTY_FUNCTION__ << " actual error code: " << actual_error_code << "\n";
    LOG_TRACE(osstream);

    // Nothing to reinforce when the call succeeded.
    if (actual_error_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
        return actual_error_code;
    }

    // Collapse the raw query result into the three values callers see.
    auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) {
        switch (query_result) {
            case (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED):
            case (rsmi_status_t::RSMI_STATUS_SUCCESS):
                return query_result;

            // RSMI_STATUS_UNKNOWN_ERROR, RSMI_STATUS_NO_DATA and every other
            // status mean that no usable rule was found.
            default:
                return rsmi_status_t::RSMI_STATUS_NOT_FOUND;
        }
    };

    // Project macro; it provides the `dev` object used below.
    GET_DEV_FROM_INDX
    osstream << __PRETTY_FUNCTION__ << "| ======= about to run property query ======="
             << " [query filters: ]"
             << " device: " << dv_ind
             << " property/verb: " << static_cast<AMDGpuVerbId_t>(verb_type)
             << " " << amdgpu_verb_check_list.at(verb_type);
    auto reinforcement_query_result = dev->check_amdgpu_property_reinforcement_query(dv_ind, verb_type);
    osstream << __PRETTY_FUNCTION__ << "| ======= result from property query ======="
             << " query result: " << reinforcement_query_result;

    reinforcement_query_result = amdgpu_property_query_result_hdlr(reinforcement_query_result);
    osstream << __PRETTY_FUNCTION__ << "| ======= result from property query ======="
             << " query result: " << reinforcement_query_result;
    // Emit the messages gathered since the first trace; without this call
    // they would be discarded when the stream goes out of scope.
    LOG_TRACE(osstream);

    return reinforcement_query_result;
}
|
||||
|
||||
void dump_amdgpu_property_reinforcement_list()
|
||||
{
|
||||
std::ostringstream osstream;
|
||||
osstream << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
if (!amdgpu_property_reinforcement_list.empty()) {
|
||||
for (const auto& property : amdgpu_property_reinforcement_list) {
|
||||
osstream << __PRETTY_FUNCTION__
|
||||
<< " Asic ID: " << property.first
|
||||
<< " Asic Rev.ID: " << property.second.m_pci_rev_id
|
||||
<< " Property ID: " << property.second.m_property
|
||||
<< " Verb ID : " << static_cast<AMDGpuVerbId_t>(property.second.m_verb_id)
|
||||
<< " Verb Desc: " << amdgpu_verb_check_list.at(property.second.m_verb_id)
|
||||
<< " OpMode: " << static_cast<AMDGpuOpModeType_t>(property.second.m_opmode)
|
||||
<< " OpMode Desc: " << amdgpu_opmode_check_list.at(property.second.m_opmode)
|
||||
<< " Flag Avail.: " << property.second.m_should_be_available;
|
||||
}
|
||||
osstream << __PRETTY_FUNCTION__ << "| ======= end =======";
|
||||
return;
|
||||
}
|
||||
|
||||
osstream << __PRETTY_FUNCTION__ << "amdgpu_property_reinforcement_list is empty";
|
||||
LOG_TRACE(osstream);
|
||||
}
|
||||
|
||||
|
||||
/**
 * @brief Build a reinforcement query for a device/verb pair and run it.
 *
 * The query starts with only the device index and the verb set. The asic id
 * and the PCI revision id are then read for @p dev_idx through
 * rsmi_dev_id_get and rsmi_dev_revision_get.
 *
 * @param dev_idx    Device index.
 * @param verb_type  Verb to look up in the reinforcement table.
 * @return RSMI_STATUS_NO_DATA when either id could not be read; otherwise
 *         whatever run_amdgpu_property_reinforcement_query() returns.
 */
rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type)
{
    std::ostringstream osstream;

    // Seed the query: ids and property are unknown (0) at this point.
    AMDGpuPropertyQuery_t amdgpu_property_query{};
    amdgpu_property_query.m_asic_id = 0;
    amdgpu_property_query.m_pci_rev_id = 0;
    amdgpu_property_query.m_dev_idx = dev_idx;
    amdgpu_property_query.m_property = 0;
    amdgpu_property_query.m_verb_id = verb_type;

    osstream << __PRETTY_FUNCTION__ << "| ======= start =======";
    LOG_TRACE(osstream);

    // Fill in the missing filters from the device itself. The revision is
    // only read when the asic id was read successfully.
    rsmi_status_t id_lookup_status = rsmi_dev_id_get(dev_idx, &amdgpu_property_query.m_asic_id);
    if (id_lookup_status == rsmi_status_t::RSMI_STATUS_SUCCESS) {
        id_lookup_status = rsmi_dev_revision_get(dev_idx, &amdgpu_property_query.m_pci_rev_id);
    }
    const bool is_proper_query = (id_lookup_status == rsmi_status_t::RSMI_STATUS_SUCCESS);

    if (is_proper_query) {
        return run_amdgpu_property_reinforcement_query(amdgpu_property_query);
    }

    // The filters could not be completed: report it and give up.
    const rsmi_status_t rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA;
    osstream << __PRETTY_FUNCTION__ << "| ======= end ======="
             << ", Missing Query Filters were not successfully retrieved: "
             << " [query filters: ]"
             << " device: " << dev_idx
             << " asic id: " << amdgpu_property_query.m_asic_id
             << " revision id: " << amdgpu_property_query.m_pci_rev_id
             << " property: " << amdgpu_property_query.m_property
             << " verb: " << static_cast<AMDGpuVerbId_t>(amdgpu_property_query.m_verb_id)
             << " proper_query: " << is_proper_query
             << " error: " << rsmi_status;
    LOG_TRACE(osstream);
    return rsmi_status;
}
|
||||
|
||||
/**
 * @brief Look a query up in the reinforcement table.
 *
 * Every table entry stored under the query's asic id is visited in table
 * order. An entry matches when its revision id equals the query's revision
 * id or is kDevRevIDAll, and either its property equals a non-zero query
 * property or its verb equals a query verb other than kNone. The first
 * match decides the result.
 *
 * @param amdgpu_property_query  Fully populated query (asic id, revision id,
 *                               and a property and/or verb).
 * @return RSMI_STATUS_SUCCESS when the matching rule says the property
 *         should be available, RSMI_STATUS_NOT_SUPPORTED when it says it
 *         should not, RSMI_STATUS_UNKNOWN_ERROR when no rule matches.
 */
rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query)
{
    std::ostringstream osstream;
    auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR);

    osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n";
    LOG_TRACE(osstream);

    // One lookup yields every entry stored under this asic id (empty range
    // when the asic id is not in the table).
    const auto asic_entries = amdgpu_property_reinforcement_list.equal_range(amdgpu_property_query.m_asic_id);
    if (asic_entries.first != asic_entries.second) {
        osstream << __PRETTY_FUNCTION__ << " asic id found in table: " << amdgpu_property_query.m_asic_id << "\n";
    }

    for (auto entry = asic_entries.first; entry != asic_entries.second; ++entry) {
        const auto& rule = entry->second;
        osstream << __PRETTY_FUNCTION__ << " asic id found: " << entry->first << "\n";

        // The rule must target this revision, or all revisions.
        const bool revision_matches =
            (rule.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) ||
            (rule.m_pci_rev_id == kDevRevIDAll);
        if (!revision_matches) {
            continue;
        }
        osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << rule.m_pci_rev_id << "\n";

        // The rule must be about the property or the verb we were asked for.
        const bool property_matches =
            (amdgpu_property_query.m_property != 0) &&
            (rule.m_property == amdgpu_property_query.m_property);
        const bool verb_matches =
            (amdgpu_property_query.m_verb_id != AMDGpuVerbTypes_t::kNone) &&
            (rule.m_verb_id == amdgpu_property_query.m_verb_id);
        if (!(property_matches || verb_matches)) {
            continue;
        }

        osstream << __PRETTY_FUNCTION__
                 << " property found: " << rule.m_property
                 << " verb found: " << static_cast<AMDGpuVerbId_t>(rule.m_verb_id)
                 << " " << amdgpu_verb_check_list.at(amdgpu_property_query.m_verb_id)
                 << " should_be_available: " << rule.m_should_be_available << "\n";
        osstream << __PRETTY_FUNCTION__ << "| ======= validating ======="
                 << ", Property found in the table for this device and flagged as *Not Available* : "
                 << " [query filters: ]"
                 << " device: " << amdgpu_property_query.m_dev_idx
                 << " asic id: " << amdgpu_property_query.m_asic_id
                 << " revision id: " << amdgpu_property_query.m_pci_rev_id
                 << " reinf.tbl.rev. id: " << rule.m_pci_rev_id;

        // The rule's flag decides: "should not be available" maps to
        // NOT_SUPPORTED, "should be available" maps to SUCCESS.
        rsmi_status = rule.m_should_be_available
            ? rsmi_status_t::RSMI_STATUS_SUCCESS
            : rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED;
        osstream << __PRETTY_FUNCTION__
                 << " should_be_available: " << rule.m_should_be_available
                 << " result: " << rsmi_status << "\n";
        LOG_TRACE(osstream);
        return rsmi_status;
    }

    // No rule matched; rsmi_status still holds its initial value.
    osstream << __PRETTY_FUNCTION__ << "| ======= end ======="
             << "Done searching for the Property in reinforcement table for this device: "
             << " device: " << amdgpu_property_query.m_dev_idx
             << " asic id: " << amdgpu_property_query.m_asic_id
             << " revision id: " << amdgpu_property_query.m_pci_rev_id
             << " property id: " << amdgpu_property_query.m_property
             << " error: " << rsmi_status;
    LOG_TRACE(osstream);
    return rsmi_status;
}
|
||||
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
@@ -106,7 +106,7 @@ void TestSysInfoRead::Run(void) {
|
||||
err = amdsmi_get_gpu_vbios_info(processor_handles_[i], &info);
|
||||
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
if (err == AMDSMI_STATUS_FILE_ERROR) {
|
||||
if ((err == AMDSMI_STATUS_FILE_ERROR) || (err == AMDSMI_STATUS_NOT_SUPPORTED)) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**VBIOS read: Not supported on this machine"
|
||||
<< std::endl;
|
||||
|
||||
@@ -55,6 +55,20 @@ FILTER[sienna_cichlid]=\
|
||||
$BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadWrite.TestPerfLevelReadWrite"
|
||||
|
||||
# SWDEV-391407
|
||||
FILTER[90400]=\
|
||||
$BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadOnly.TestVoltCurvRead:"\
|
||||
"rsmitstReadOnly.TestFrequenciesRead:"\
|
||||
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
|
||||
"rsmitstReadWrite.TestPowerReadWrite"
|
||||
FILTER[90401]=\
|
||||
$BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadOnly.TestVoltCurvRead:"\
|
||||
"rsmitstReadOnly.TestFrequenciesRead:"\
|
||||
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
|
||||
"rsmitstReadWrite.TestPowerReadWrite"
|
||||
|
||||
# SWDEV-321166
|
||||
FILTER[virtualization]=\
|
||||
$BLACKLIST_ALL_ASICS\
|
||||
@@ -63,4 +77,4 @@ $BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadWrite.FanReadWrite:"\
|
||||
"rsmitstReadWrite.TestOverdriveReadWrite:"\
|
||||
"rsmitstReadWrite.TestPowerReadWrite:"\
|
||||
"rsmitstReadWrite.TestPowerCapReadWrite"
|
||||
"rsmitstReadWrite.TestPowerCapReadWrite"
|
||||
|
||||
@@ -173,6 +173,13 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) {
|
||||
std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl;
|
||||
}
|
||||
|
||||
err = amdsmi_get_gpu_revision(dv_ind, &val_ui16);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Revision ID: 0x" << std::hex <<
|
||||
val_ui16 << std::endl;
|
||||
}
|
||||
|
||||
amdsmi_board_info_t board_info;
|
||||
err = amdsmi_get_gpu_board_info(dv_ind, &board_info);
|
||||
CHK_ERR_ASRT(err)
|
||||
|
||||
@@ -283,6 +283,7 @@ void DumpMonitorInfo(const TestBase *test) {
|
||||
};
|
||||
|
||||
print_val_str(amd::smi::kDevDevID, "Device ID: ");
|
||||
print_val_str(amd::smi::kDevDevRevID, "Dev.Rev.ID: ");
|
||||
print_val_str(amd::smi::kDevPerfLevel, "Performance Level: ");
|
||||
print_val_str(amd::smi::kDevOverDriveLevel, "OverDrive Level: ");
|
||||
print_vector(amd::smi::kDevGPUMClk,
|
||||
|
||||
Spustitelný soubor
+325
@@ -0,0 +1,325 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2019, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi_test/functional/id_info_read.h"
|
||||
#include "rocm_smi_test/test_common.h"
|
||||
|
||||
// Constructs the test and registers the title and description strings
// with the TestBase setters.
TestIdInfoRead::TestIdInfoRead() : TestBase() {
  static const char kTitle[] = "RSMI ID Info Read Test";
  static const char kDescription[] =
      "This test verifies that ID information such as the "
      "device, subsystem and vendor IDs can be read properly.";

  set_title(kTitle);
  set_description(kDescription);
}
|
||||
|
||||
// Nothing to release here: the destructor body is empty.
TestIdInfoRead::~TestIdInfoRead() = default;
|
||||
|
||||
// Performs no test-specific setup; simply forwards to TestBase::SetUp().
void TestIdInfoRead::SetUp() {
  TestBase::SetUp();
}
|
||||
|
||||
// Forwards to TestBase::DisplayTestInfo(); this test adds nothing of its own.
void TestIdInfoRead::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
|
||||
|
||||
// Forwards to TestBase::DisplayResults(); this test adds nothing of its own.
void TestIdInfoRead::DisplayResults() const {
  TestBase::DisplayResults();
}
|
||||
|
||||
// Tears the test down by delegating to TestBase::Close().
void TestIdInfoRead::Close() {
  // TestBase::Close() closes the handles opened by the rsmitst utility calls
  // and calls rsmi_shut_down(), so any other hsa cleanup has to happen
  // before this point.
  TestBase::Close();
}
|
||||
|
||||
// Size, in bytes, of the scratch buffer handed to the string-returning
// rsmi_dev_*_get queries in TestIdInfoRead::Run().
static constexpr uint32_t kBufferLen = 80;
|
||||
|
||||
void TestIdInfoRead::Run(void) {
|
||||
rsmi_status_t err;
|
||||
uint16_t id;
|
||||
uint64_t val_ui64;
|
||||
uint32_t drm_render_minor;
|
||||
|
||||
char buffer[kBufferLen];
|
||||
|
||||
TestBase::Run();
|
||||
if (setup_failed_) {
|
||||
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t*************************" << std::endl;
|
||||
std::cout << "\t**Device index: " << i << std::endl;
|
||||
}
|
||||
|
||||
// Get the device ID, name, vendor ID and vendor name for the device
|
||||
err = rsmi_dev_id_get(i, &id);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
rsmi_status_t ret;
|
||||
// Verify api support checking functionality is working
|
||||
ret = rsmi_dev_id_get(i, nullptr);
|
||||
ASSERT_EQ(ret, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device ID: 0x" << std::hex << id << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
// Get device Revision
|
||||
err = rsmi_dev_revision_get(i, &id);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
rsmi_status_t ret;
|
||||
// Verify api support checking functionality is working
|
||||
ret = rsmi_dev_revision_get(i, nullptr);
|
||||
ASSERT_EQ(ret, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << id << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_revision_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
|
||||
err = rsmi_dev_name_get(i, buffer, kBufferLen);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Device Marketing name not found on this system." <<
|
||||
std::endl;
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Marketing name: " << buffer << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_brand_get(i, buffer, kBufferLen);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_brand_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Brand name: " << buffer << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_brand_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_vram_vendor_get(i, buffer, kBufferLen);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout <<
|
||||
"\t**Vram Vendor string not supported on this system." << std::endl;
|
||||
err = rsmi_dev_vram_vendor_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Vram Vendor name: " << buffer << std::endl;
|
||||
}
|
||||
err = rsmi_dev_vram_vendor_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_vendor_id_get(i, &id);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_vendor_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Vendor ID: 0x" << std::hex << id << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_vendor_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_drm_render_minor_get(i, &drm_render_minor);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_drm_render_minor_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**DRM Render Minor: " << drm_render_minor << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_drm_render_minor_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_vendor_name_get(i, buffer, kBufferLen);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Device Vendor name string not found on this system." <<
|
||||
std::endl;
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Vendor name: " << buffer << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
|
||||
// Get the device ID, name, vendor ID and vendor name for the sub-device
|
||||
err = rsmi_dev_subsystem_id_get(i, &id);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_subsystem_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Subsystem ID: 0x" << std::hex << id << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_subsystem_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_subsystem_name_get(i, buffer, kBufferLen);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Subsystem name string not found on this system." <<
|
||||
std::endl;
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_subsystem_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Subsystem name: " << buffer << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_subsystem_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_subsystem_vendor_id_get(i, &id);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_subsystem_vendor_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Sub-system Vendor ID: 0x" << std::hex <<
|
||||
id << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_subsystem_vendor_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
err = rsmi_dev_vendor_name_get(i, buffer, kBufferLen);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout <<
|
||||
"\t**Subsystem Vendor name string not found on this system." <<
|
||||
std::endl;
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Subsystem Vendor name: " << buffer << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_vendor_name_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
|
||||
err = rsmi_dev_pci_id_get(i, &val_ui64);
|
||||
// Don't check for RSMI_STATUS_NOT_SUPPORTED since this should always be
|
||||
// supported. It is not based on a sysfs file.
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**PCI ID (BDFID): 0x" << std::hex << val_ui64;
|
||||
std::cout << " (" << std::dec << val_ui64 << ")" << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_pci_id_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
|
||||
err = rsmi_dev_serial_number_get(i, buffer, kBufferLen);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_serial_number_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
|
||||
std::cout <<
|
||||
"\t**Serial Number string not supported on this system." << std::endl;
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Serial Number:" << buffer << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_serial_number_get(i, nullptr, kBufferLen);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
}
|
||||
}
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele