Merge remote-tracking branch 'rocmsmi/amd-staging' into HEAD
Change-Id: I65ed7f3a0d1b6e58bc8377932d7c39db21d1b422
[ROCm/amdsmi commit: 5c41319c83]
Bu işleme şunda yer alıyor:
@@ -1,21 +1,18 @@
|
||||
#!/bin/bash
|
||||
#set -x
|
||||
|
||||
packageName="amd-smi-lib"
|
||||
logPath=/var/log/amd_smi_lib
|
||||
logName=AMD-SMI-lib.log
|
||||
logFile="${logPath}/${logName}"
|
||||
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
|
||||
do_addLogFolder() {
|
||||
sudo mkdir -p "${logPath}"
|
||||
sudo touch "${logFile}"
|
||||
sudo chmod -R a+rw "${logPath}"
|
||||
sudo chmod a+rw "${logFile}"
|
||||
}
|
||||
|
||||
do_configureLogrotate() {
|
||||
logrotate --version &>/dev/null
|
||||
local IS_SYSTEMD=0
|
||||
local packageName="amd-smi-lib"
|
||||
local logPath=/var/log/amd_smi_lib
|
||||
local logFile="${logPath}/AMD-SMI-lib.log"
|
||||
local logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
|
||||
mkdir -p "${logPath}"
|
||||
touch "${logFile}"
|
||||
chmod -R a+rw "${logPath}"
|
||||
chmod a+rw "${logFile}"
|
||||
|
||||
command -v logrotate &>/dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "[WARNING] Detected logrotate is not installed."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
@@ -23,14 +20,14 @@ do_configureLogrotate() {
|
||||
fi
|
||||
|
||||
if [ ! -f $logrotateConfFile ]; then
|
||||
sudo touch "${logrotateConfFile}"
|
||||
sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read
|
||||
touch "${logrotateConfFile}"
|
||||
chmod 644 "${logrotateConfFile}" # root r/w, all others read
|
||||
# AMD SMI logging rotation, rotates files using root user/group
|
||||
# Hourly logrotation check
|
||||
# Only rotates if size grew larger than 1MB
|
||||
# Max of 4 rotation files, oldest will be removed
|
||||
# Rotated files use date extension of ex. AMD-SMI-lib.log.2023-05-09_16:51:42
|
||||
cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null
|
||||
cat << EOF > "${logrotateConfFile}"
|
||||
${logFile} {
|
||||
su root root
|
||||
hourly
|
||||
@@ -47,44 +44,29 @@ EOF
|
||||
# issue was RPM build thought we were using macros
|
||||
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
|
||||
# https://rpm-software-management.github.io/rpm/manual/spec.html
|
||||
sudo sed -i s/%%/%/g "${logrotateConfFile}"
|
||||
sed -i s/%%/%/g "${logrotateConfFile}"
|
||||
# workaround: remove extra 'OURCE' text
|
||||
# from amd_smi.conf. Unsure if CMAKE,
|
||||
# bash, or here document
|
||||
# issue (only seen on RHEL 8.7)
|
||||
sudo sed -i s/OURCE//g "${logrotateConfFile}"
|
||||
sed -i s/OURCE//g "${logrotateConfFile}"
|
||||
fi
|
||||
# check if logrotate uses system timers, Ubuntu/modern OS's do
|
||||
# Several older OS's like RHEL 8.7, do not. Instead defaults
|
||||
# to use daily cron jobs - see https://stackoverflow.com/a/69465677
|
||||
sudo systemctl list-timers|grep -iq logrotate
|
||||
if [ $? -ne 0 ]; then
|
||||
# confirm logrotate file exists in daily
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
else
|
||||
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
|
||||
" logrotate. $packageName logs (when turned on) will not rotate properly."
|
||||
return
|
||||
fi
|
||||
else
|
||||
# confirm that it's already been moved to hourly
|
||||
sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) may not rotate properly."
|
||||
fi
|
||||
if [ -d /run/systemd/system ]; then
|
||||
systemctl list-timers | grep -iq logrotate
|
||||
if [ $? -eq 0 ]; then
|
||||
IS_SYSTEMD=1
|
||||
fi
|
||||
return #done configuring for non-systemd timers
|
||||
else
|
||||
fi
|
||||
if [ "$IS_SYSTEMD" -eq 1 ]; then
|
||||
# Configure systemd timers - the typical setup for modern Linux logrotation setups
|
||||
if [ -f /lib/systemd/system/logrotate.timer ]; then
|
||||
if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then
|
||||
sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
|
||||
cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
|
||||
fi
|
||||
cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null
|
||||
cat << EOF > /lib/systemd/system/logrotate.timer
|
||||
[Unit]
|
||||
Description=Hourly rotation of log files
|
||||
Documentation=man:logrotate(8) man:logrotate.conf(5)
|
||||
@@ -98,12 +80,19 @@ Persistent=true
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
EOF
|
||||
sudo systemctl reenable --now logrotate.timer
|
||||
systemctl reenable --now logrotate.timer
|
||||
else
|
||||
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
fi
|
||||
return #done configuring for systemd timers
|
||||
else
|
||||
# $IS_SYSTEMD -eq 0
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -118,8 +107,7 @@ do_ldconfig() {
|
||||
case "$1" in
|
||||
( configure )
|
||||
do_ldconfig
|
||||
do_addLogFolder
|
||||
do_configureLogrotate
|
||||
do_configureLogrotate || return 0
|
||||
;;
|
||||
( abort-upgrade | abort-remove | abort-deconfigure )
|
||||
echo "$1"
|
||||
|
||||
@@ -1,29 +1,4 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
packageName="amd-smi-lib"
|
||||
logPath=/var/log/amd_smi_lib
|
||||
logName=AMD-SMI-lib.log
|
||||
logFile="${logPath}/${logName}"
|
||||
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
|
||||
rm_logFolder() {
|
||||
sudo rm -rf "$logPath"
|
||||
}
|
||||
|
||||
return_logrotateToOrigConfig() {
|
||||
if [ -f $logrotateConfFile ]; then
|
||||
sudo rm -rf "${logrotateConfFile}"
|
||||
fi
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
|
||||
fi
|
||||
if [ -f /lib/systemd/system/logrotate.timer.backup ]; then
|
||||
sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer
|
||||
sudo rm -rf /lib/systemd/system/logrotate.timer.backup
|
||||
sudo systemctl reenable --now logrotate.timer
|
||||
fi
|
||||
}
|
||||
|
||||
rm_ldconfig() {
|
||||
# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
|
||||
@@ -39,6 +14,25 @@ rm_pyc() {
|
||||
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__
|
||||
}
|
||||
|
||||
rm_logFolder() {
|
||||
rm -rf /var/log/amd_smi_lib
|
||||
}
|
||||
|
||||
return_logrotateToOrigConfig() {
|
||||
local logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
if [ -f $logrotateConfFile ]; then
|
||||
rm -rf "$logrotateConfFile"
|
||||
fi
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
|
||||
fi
|
||||
if [ -f /lib/systemd/system/logrotate.timer.backup ]; then
|
||||
cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer
|
||||
rm -rf /lib/systemd/system/logrotate.timer.backup
|
||||
systemctl reenable --now logrotate.timer
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
case "$1" in
|
||||
( remove | upgrade)
|
||||
|
||||
@@ -1,21 +1,18 @@
|
||||
#!/bin/bash
|
||||
#set -x
|
||||
|
||||
packageName="amd-smi-lib"
|
||||
logPath=/var/log/amd_smi_lib
|
||||
logName=AMD-SMI-lib.log
|
||||
logFile="${logPath}/${logName}"
|
||||
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
|
||||
do_addLogFolder() {
|
||||
sudo mkdir -p "${logPath}"
|
||||
sudo touch "${logFile}"
|
||||
sudo chmod -R a+rw "${logPath}"
|
||||
sudo chmod a+rw "${logFile}"
|
||||
}
|
||||
|
||||
do_configureLogrotate() {
|
||||
logrotate --version &>/dev/null
|
||||
local IS_SYSTEMD=0
|
||||
local packageName="amd-smi-lib"
|
||||
local logPath=/var/log/amd_smi_lib
|
||||
local logFile="${logPath}/AMD-SMI-lib.log"
|
||||
local logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
|
||||
mkdir -p "${logPath}"
|
||||
touch "${logFile}"
|
||||
chmod -R a+rw "${logPath}"
|
||||
chmod a+rw "${logFile}"
|
||||
|
||||
command -v logrotate &>/dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "[WARNING] Detected logrotate is not installed."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
@@ -23,14 +20,14 @@ do_configureLogrotate() {
|
||||
fi
|
||||
|
||||
if [ ! -f $logrotateConfFile ]; then
|
||||
sudo touch "${logrotateConfFile}"
|
||||
sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read
|
||||
touch "${logrotateConfFile}"
|
||||
chmod 644 "${logrotateConfFile}" # root r/w, all others read
|
||||
# AMD SMI logging rotation, rotates files using root user/group
|
||||
# Hourly logrotation check
|
||||
# Only rotates if size grew larger than 1MB
|
||||
# Max of 4 rotation files, oldest will be removed
|
||||
# Rotated files use date extension of ex. AMD-SMI-lib.log.2023-05-09_16:51:42
|
||||
cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null
|
||||
cat << EOF > "${logrotateConfFile}"
|
||||
${logFile} {
|
||||
su root root
|
||||
hourly
|
||||
@@ -47,44 +44,29 @@ EOF
|
||||
# issue was RPM build thought we were using macros
|
||||
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
|
||||
# https://rpm-software-management.github.io/rpm/manual/spec.html
|
||||
sudo sed -i s/%%/%/g "${logrotateConfFile}"
|
||||
sed -i s/%%/%/g "${logrotateConfFile}"
|
||||
# workaround: remove extra 'OURCE' text
|
||||
# from amd_smi.conf. Unsure if CMAKE,
|
||||
# bash, or here document
|
||||
# issue (only seen on RHEL 8.7)
|
||||
sudo sed -i s/OURCE//g "${logrotateConfFile}"
|
||||
sed -i s/OURCE//g "${logrotateConfFile}"
|
||||
fi
|
||||
# check if logrotate uses system timers, Ubuntu/modern OS's do
|
||||
# Several older OS's like RHEL 8.7, do not. Instead defaults
|
||||
# to use daily cron jobs - see https://stackoverflow.com/a/69465677
|
||||
sudo systemctl list-timers|grep -iq logrotate
|
||||
if [ $? -ne 0 ]; then
|
||||
# confirm logrotate file exists in daily
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
else
|
||||
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
|
||||
" logrotate. $packageName logs (when turned on) will not rotate properly."
|
||||
return
|
||||
fi
|
||||
else
|
||||
# confirm that it's already been moved to hourly
|
||||
sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) may not rotate properly."
|
||||
fi
|
||||
if [ -d /run/systemd/system ]; then
|
||||
systemctl list-timers | grep -iq logrotate
|
||||
if [ $? -eq 0 ]; then
|
||||
IS_SYSTEMD=1
|
||||
fi
|
||||
return #done configuring for non-systemd timers
|
||||
else
|
||||
fi
|
||||
if [ "$IS_SYSTEMD" -eq 1 ]; then
|
||||
# Configure systemd timers - the typical setup for modern Linux logrotation setups
|
||||
if [ -f /lib/systemd/system/logrotate.timer ]; then
|
||||
if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then
|
||||
sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
|
||||
cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
|
||||
fi
|
||||
cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null
|
||||
cat << EOF > /lib/systemd/system/logrotate.timer
|
||||
[Unit]
|
||||
Description=Hourly rotation of log files
|
||||
Documentation=man:logrotate(8) man:logrotate.conf(5)
|
||||
@@ -98,12 +80,19 @@ Persistent=true
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
EOF
|
||||
sudo systemctl reenable --now logrotate.timer
|
||||
systemctl reenable --now logrotate.timer
|
||||
else
|
||||
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
|
||||
"$packageName logs (when turned on) will not rotate properly."
|
||||
fi
|
||||
return #done configuring for systemd timers
|
||||
else
|
||||
# $IS_SYSTEMD -eq 0
|
||||
if [ -f /etc/cron.daily/logrotate ]; then
|
||||
# move logrotate daily to hourly
|
||||
if [ -d /etc/cron.hourly ]; then
|
||||
mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -115,14 +104,8 @@ do_ldconfig() {
|
||||
fi
|
||||
}
|
||||
|
||||
# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
|
||||
if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
|
||||
echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf
|
||||
ldconfig
|
||||
fi
|
||||
|
||||
# post install or upgrade, $1 is 1 or 2 -> do these actions
|
||||
if [ $1 -ge 1 ]; then
|
||||
do_addLogFolder
|
||||
do_configureLogrotate
|
||||
if [ "$1" -ge 1 ]; then
|
||||
do_ldconfig
|
||||
do_configureLogrotate || return 0
|
||||
fi
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
# second term originates from ENABLE_LDCONFIG = ON/OFF at package build
|
||||
if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then
|
||||
if [ "$1" -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then
|
||||
# perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
|
||||
rm -f /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf
|
||||
ldconfig
|
||||
|
||||
@@ -1,31 +1,4 @@
|
||||
#!/bin/bash
|
||||
#set -x
|
||||
|
||||
set -e
|
||||
|
||||
packageName="amd-smi-lib"
|
||||
logPath=/var/log/amd_smi_lib
|
||||
logName=AMD-SMI-lib.log
|
||||
logFile="${logPath}/${logName}"
|
||||
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
|
||||
rm_logFolder() {
|
||||
sudo rm -rf "$logPath"
|
||||
}
|
||||
|
||||
return_logrotateToOrigConfig() {
|
||||
if [ -f $logrotateConfFile ]; then
|
||||
sudo rm -rf "${logrotateConfFile}"
|
||||
fi
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
|
||||
fi
|
||||
if [ -f /lib/systemd/system/logrotate.timer.backup ]; then
|
||||
sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer
|
||||
sudo rm -rf /lib/systemd/system/logrotate.timer.backup
|
||||
sudo systemctl reenable --now logrotate.timer
|
||||
fi
|
||||
}
|
||||
|
||||
rm_pyc() {
|
||||
# remove pyc files generated by python
|
||||
@@ -33,7 +6,26 @@ rm_pyc() {
|
||||
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__
|
||||
}
|
||||
|
||||
if [ $1 -le 1 ]; then
|
||||
rm_logFolder() {
|
||||
rm -rf /var/log/amd_smi_lib
|
||||
}
|
||||
|
||||
return_logrotateToOrigConfig() {
|
||||
local logrotateConfFile=/etc/logrotate.d/amd_smi.conf
|
||||
if [ -f $logrotateConfFile ]; then
|
||||
rm -rf "$logrotateConfFile"
|
||||
fi
|
||||
if [ -f /etc/cron.hourly/logrotate ]; then
|
||||
mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
|
||||
fi
|
||||
if [ -f /lib/systemd/system/logrotate.timer.backup ]; then
|
||||
cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer
|
||||
rm -rf /lib/systemd/system/logrotate.timer.backup
|
||||
systemctl reenable --now logrotate.timer
|
||||
fi
|
||||
}
|
||||
|
||||
if [ "$1" -le 1 ]; then
|
||||
# perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
|
||||
rm_pyc
|
||||
rm_logFolder
|
||||
|
||||
@@ -7,3 +7,5 @@
|
||||
/_templates/
|
||||
/html/
|
||||
/latex/
|
||||
404.md
|
||||
data/AMD-404.png
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
docBin/
|
||||
@@ -844,7 +844,7 @@ int main() {
|
||||
}
|
||||
CHK_RSMI_NOT_SUPPORTED_RET(ret)
|
||||
|
||||
std::cout << "\t**Averge Power Usage: ";
|
||||
std::cout << "\t**Average Power Usage: ";
|
||||
ret = rsmi_dev_power_ave_get(i, 0, &val_ui64);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
std::cout << static_cast<float>(val_ui64)/1000 << " W" << std::endl;
|
||||
|
||||
@@ -480,6 +480,19 @@ typedef enum {
|
||||
RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
|
||||
} rsmi_temperature_type_t;
|
||||
|
||||
/**
|
||||
* @brief Activity (Utilization) Metrics. This enum is used to identify
|
||||
* various activity metrics.
|
||||
*
|
||||
*/
|
||||
typedef enum {
|
||||
/* Utilization */
|
||||
RSMI_ACTIVITY_GFX = (0x1 << 0),
|
||||
RSMI_ACTIVITY_UMC = (0x1 << 1), //!< memory controller
|
||||
RSMI_ACTIVITY_MM = (0x1 << 2) //!< UVD or VCN
|
||||
} rsmi_activity_metric_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Voltage Metrics. This enum is used to identify various
|
||||
* Voltage metrics. Corresponding values will be in millivolt.
|
||||
@@ -788,6 +801,17 @@ typedef struct {
|
||||
typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth;
|
||||
/// \endcond
|
||||
|
||||
/**
|
||||
* @brief This structure holds information about the possible activity
|
||||
* averages. Specifically, the utilization counters.
|
||||
*/
|
||||
typedef struct {
|
||||
/* Utilization */
|
||||
uint16_t average_gfx_activity;
|
||||
uint16_t average_umc_activity; //!< memory controller
|
||||
uint16_t average_mm_activity; //!< UVD or VCN
|
||||
} rsmi_activity_metric_counter_t;
|
||||
|
||||
/**
|
||||
* @brief This structure holds version information.
|
||||
*/
|
||||
@@ -898,14 +922,28 @@ struct metrics_table_header_t {
|
||||
#define RSMI_GPU_METRICS_API_FORMAT_VER 1
|
||||
// The content version increments when gpu_metrics is extended with new and/or
|
||||
// existing field sizes are changed.
|
||||
|
||||
/**
|
||||
* @brief The GPU metrics version 1
|
||||
*/
|
||||
#define RSMI_GPU_METRICS_API_CONTENT_VER_1 1
|
||||
/**
|
||||
* @brief The GPU metrics version 2
|
||||
*/
|
||||
#define RSMI_GPU_METRICS_API_CONTENT_VER_2 2
|
||||
/**
|
||||
* @brief The GPU metrics version 3
|
||||
*/
|
||||
#define RSMI_GPU_METRICS_API_CONTENT_VER_3 3
|
||||
|
||||
// This should match NUM_HBM_INSTANCES
|
||||
/**
|
||||
* @brief This should match NUM_HBM_INSTANCES
|
||||
*/
|
||||
#define RSMI_NUM_HBM_INSTANCES 4
|
||||
|
||||
// Unit conversion factor for HBM temperatures
|
||||
/**
|
||||
* @brief Unit conversion factor for HBM temperatures
|
||||
*/
|
||||
#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000
|
||||
|
||||
typedef struct {
|
||||
@@ -964,7 +1002,7 @@ typedef struct {
|
||||
uint16_t padding; // new in v1
|
||||
|
||||
uint32_t gfx_activity_acc; // new in v1
|
||||
uint32_t mem_actvity_acc; // new in v1
|
||||
uint32_t mem_activity_acc; // new in v1
|
||||
uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1
|
||||
/// \endcond
|
||||
} rsmi_gpu_metrics_t;
|
||||
@@ -2288,7 +2326,7 @@ rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent);
|
||||
* If the function returns RSMI_STATUS_SUCCESS, the counter will be set in the value field of
|
||||
* the rsmi_utilization_counter_t.
|
||||
*
|
||||
* @param[in] count The size of @utilization_counters array.
|
||||
* @param[in] count The size of utilization_counters array.
|
||||
*
|
||||
* @param[inout] timestamp The timestamp when the counter is retrieved. Resolution: 1 ns.
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
@@ -2303,6 +2341,57 @@ rsmi_utilization_count_get(uint32_t dv_ind,
|
||||
uint32_t count,
|
||||
uint64_t *timestamp);
|
||||
|
||||
/**
|
||||
* @brief Get activity metric average utilization counter of the specified device
|
||||
*
|
||||
* @details Given a device index @p dv_ind, the activity metric type,
|
||||
* this function returns the requested utilization counters
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[in] activity_metric_type a metric type
|
||||
*
|
||||
* @param[inout] activity_metric_counter Multiple utilization counters can be retrieved with a single
|
||||
* call. The caller must allocate enough space to the rsmi_activity_metric_counter_t structure.
|
||||
*
|
||||
* If the function returns RSMI_STATUS_SUCCESS, the value for the requested metric type
|
||||
* will be set in the corresponding field of
|
||||
* the rsmi_activity_metric_counter_t.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
rsmi_dev_activity_metric_get(uint32_t dv_ind,
|
||||
rsmi_activity_metric_t activity_metric_type,
|
||||
rsmi_activity_metric_counter_t* activity_metric_counter);
|
||||
|
||||
/**
|
||||
* @brief Get activity metric bandwidth average utilization counter of the specified device
|
||||
*
|
||||
* @details Given a device index @p dv_ind, the activity metric type,
|
||||
* this function returns the requested utilization counters
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[inout] avg_activity average bandwidth utilization counters can be retrieved
|
||||
*
|
||||
* If the function returns RSMI_STATUS_SUCCESS, the requested average activity value
|
||||
* will be written to the memory pointed to
|
||||
* by @p avg_activity.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity);
|
||||
|
||||
/**
|
||||
* @brief Get the performance level of the device with provided
|
||||
* device index.
|
||||
@@ -2450,7 +2539,7 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind,
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind);
|
||||
rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind);
|
||||
|
||||
/**
|
||||
* @brief This function retrieves the voltage/frequency curve information
|
||||
@@ -2684,7 +2773,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind,
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_lvl);
|
||||
rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl);
|
||||
|
||||
/**
|
||||
* @brief Set the PowerPlay performance level associated with the device with
|
||||
@@ -2750,7 +2839,7 @@ rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl);
|
||||
* @retval ::RSMI_STATUS_PERMISSION function requires root access
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od);
|
||||
rsmi_status_t rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od);
|
||||
|
||||
/**
|
||||
* @brief Set the overdrive percent associated with the device with provided
|
||||
@@ -3398,7 +3487,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
|
||||
* @brief Get the info of a process on a specific device.
|
||||
*
|
||||
* @details Given a process id @p pid, a @p dv_ind, this function will
|
||||
* write the process information for @p pid on the device, if available, to
|
||||
* write the process information for pid on the device, if available, to
|
||||
* the memory pointed to by @p proc.
|
||||
*
|
||||
* @param[in] pid The process id of the process for which the gpu
|
||||
@@ -3406,7 +3495,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
|
||||
*
|
||||
* @param[in] dv_ind a device index on which the process is running.
|
||||
*
|
||||
* @param[inout] procs a pointer to memory provided by the caller to which
|
||||
* @param[inout] proc a pointer to memory provided by the caller to which
|
||||
* process information will be written.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call
|
||||
@@ -3598,7 +3687,7 @@ rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst,
|
||||
*
|
||||
* @details Given a source device index @p dv_ind_src and
|
||||
* a destination device index @p dv_ind_dst, and a pointer to a
|
||||
* bool @accessible, this function will write the P2P connection status
|
||||
* bool @p accessible, this function will write the P2P connection status
|
||||
* between the device @p dv_ind_src and @p dv_ind_dst to the memory
|
||||
* pointed to by @p accessible.
|
||||
*
|
||||
|
||||
@@ -90,7 +90,7 @@
|
||||
/* This group of macros is used to facilitate checking of support for rsmi_dev*
|
||||
* "getter" functions. When the return buffer is set to nullptr, the macro will
|
||||
* check the previously gathered device support data to see if the function,
|
||||
* with possible variants (e.g., memory types, firware types,...) and
|
||||
* with possible variants (e.g., memory types, firmware types,...) and
|
||||
* subvariants (e.g. monitors/sensors) are supported.
|
||||
*/
|
||||
// This macro assumes dev already available
|
||||
|
||||
@@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_count);
|
||||
int
|
||||
ReadKFDDeviceProperties(uint32_t dev_id, std::vector<std::string> *retVec);
|
||||
|
||||
int read_node_properties(uint32_t node, std::string property_name,
|
||||
uint64_t *val);
|
||||
int get_gpu_id(uint32_t node, uint64_t *gpu_id);
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -113,7 +113,8 @@ class RocmSMI {
|
||||
uint64_t *weight);
|
||||
int get_node_index(uint32_t dv_ind, uint32_t *node_ind);
|
||||
const RocmSMI_env_vars& getEnv(void);
|
||||
void printEnvVarInfo(void);
|
||||
std::string getRSMIEnvVarInfo(void);
|
||||
void debugRSMIEnvVarInfo();
|
||||
bool isLoggingOn(void);
|
||||
uint32_t getLogSetting(void);
|
||||
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
|
||||
|
||||
@@ -99,7 +99,8 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type,
|
||||
rsmi_status_t ErrnoToRsmiStatus(int err);
|
||||
std::string getRSMIStatusString(rsmi_status_t ret);
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string, std::string>
|
||||
std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string>
|
||||
getSystemDetails(void);
|
||||
void logSystemDetails(void);
|
||||
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
|
||||
@@ -107,6 +108,9 @@ rsmi_status_t getBDFWithDomain(uint64_t bdf_id, std::string& bfd_str);
|
||||
void logHexDump(const char *desc, const void *addr, const size_t len,
|
||||
size_t perLine);
|
||||
bool isSystemBigEndian();
|
||||
std::string getBuildType();
|
||||
std::string getMyLibPath();
|
||||
int subDirectoryCountInPath(const std::string path);
|
||||
template <typename T>
|
||||
std::string print_int_as_hex(T i, bool showHexNotation=true) {
|
||||
std::stringstream ss;
|
||||
|
||||
@@ -173,10 +173,12 @@ def formatMatrixToJSON(deviceList, matrix, metricName):
|
||||
printSysLog(metricName.format(deviceList[row_indx], deviceList[col_ind]), valueStr)
|
||||
|
||||
|
||||
def getBus(device):
|
||||
def getBus(device, silent=False):
|
||||
""" Return the bus identifier of a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
bdfid = c_uint64(0)
|
||||
ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid))
|
||||
@@ -188,16 +190,18 @@ def getBus(device):
|
||||
function = bdfid.value & 0x7
|
||||
|
||||
pic_id = '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, device, function)
|
||||
if rsmi_ret_ok(ret, device, 'get_pci_id'):
|
||||
if rsmi_ret_ok(ret, device, 'get_pci_id', silent):
|
||||
return pic_id
|
||||
|
||||
|
||||
def getFanSpeed(device):
|
||||
def getFanSpeed(device, silent=True):
|
||||
""" Return a tuple with the fan speed (value,%) for a specified device,
|
||||
or (None,None) if either current fan speed or max fan speed cannot be
|
||||
obtained
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is on.
|
||||
"""
|
||||
fanLevel = c_int64()
|
||||
fanMax = c_int64()
|
||||
@@ -209,7 +213,7 @@ def getFanSpeed(device):
|
||||
/sys/class/drm/cardX/device/hwmon/hwmonX/pwmX
|
||||
"""
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel))
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_speed', True):
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_speed', silent):
|
||||
fl = fanLevel.value
|
||||
last_ret = ret
|
||||
|
||||
@@ -217,7 +221,7 @@ def getFanSpeed(device):
|
||||
/sys/class/drm/cardX/device/hwmon/hwmonX/pwmX
|
||||
"""
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax))
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True):
|
||||
if rsmi_ret_ok(ret, device, 'get_fan_max_speed', silent):
|
||||
fm = fanMax.value
|
||||
|
||||
""" In case we had an error before, we don't overwrite it with a
|
||||
@@ -232,59 +236,67 @@ def getFanSpeed(device):
|
||||
return (last_ret, fl, round((float(fl) / float(fm)) * 100, 2))
|
||||
|
||||
|
||||
def getGpuUse(device):
|
||||
def getGpuUse(device, silent=False):
|
||||
""" Return the current GPU usage as a percentage
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
percent = c_uint32()
|
||||
ret = rocmsmi.rsmi_dev_busy_percent_get(device, byref(percent))
|
||||
if rsmi_ret_ok(ret, device, 'GPU Utilization '):
|
||||
if rsmi_ret_ok(ret, device, 'GPU Utilization ', silent):
|
||||
return percent.value
|
||||
return -1
|
||||
|
||||
|
||||
def getId(device):
|
||||
def getId(device, silent=False):
|
||||
""" Return the hexadecimal value of a device's ID
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
dv_id = c_short()
|
||||
ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
|
||||
if rsmi_ret_ok(ret, device, 'get_device_id'):
|
||||
if rsmi_ret_ok(ret, device, 'get_device_id', silent):
|
||||
return hex(dv_id.value)
|
||||
|
||||
|
||||
def getRev(device):
|
||||
def getRev(device, silent=False):
|
||||
""" Return the hexadecimal value of a device's Revision
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
dv_rev = c_short()
|
||||
ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev))
|
||||
if rsmi_ret_ok(ret, device, 'get_device_rev'):
|
||||
if rsmi_ret_ok(ret, device, 'get_device_rev', silent):
|
||||
return hex(dv_rev.value)
|
||||
|
||||
|
||||
def getMaxPower(device):
|
||||
def getMaxPower(device, silent=False):
|
||||
""" Return the maximum power cap of a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
power_cap = c_uint64()
|
||||
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
|
||||
if rsmi_ret_ok(ret, device, 'get_power_cap'):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_cap', silent):
|
||||
return power_cap.value / 1000000
|
||||
return -1
|
||||
|
||||
|
||||
def getMemInfo(device, memType, quiet=False):
|
||||
def getMemInfo(device, memType, silent=False):
|
||||
""" Returns a tuple of (memory_used, memory_total) of
|
||||
the requested memory type usage for the device specified
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param type: [vram|vis_vram|gtt] Memory type to return
|
||||
@param quiet=Turn on to silience error output
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off,
|
||||
which exposes any issue accessing the different
|
||||
memory types.
|
||||
@@ -300,11 +312,11 @@ def getMemInfo(device, memType, quiet=False):
|
||||
memTotal = None
|
||||
|
||||
ret = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(memoryUse))
|
||||
if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), quiet):
|
||||
if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), silent):
|
||||
memUsed = memoryUse.value
|
||||
|
||||
ret = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(memoryTot))
|
||||
if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), quiet):
|
||||
if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), silent):
|
||||
memTotal = memoryTot.value
|
||||
return (memUsed, memTotal)
|
||||
|
||||
@@ -334,14 +346,16 @@ def getProcessName(pid):
|
||||
return pName
|
||||
|
||||
|
||||
def getPerfLevel(device):
|
||||
def getPerfLevel(device, silent=False):
|
||||
""" Return the current performance level of a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
perf = rsmi_dev_perf_level_t()
|
||||
ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf))
|
||||
if rsmi_ret_ok(ret, device, 'get_perf_level'):
|
||||
if rsmi_ret_ok(ret, device, 'get_perf_level', silent):
|
||||
return perf_level_string(perf.value)
|
||||
return 'N/A'
|
||||
|
||||
@@ -369,42 +383,48 @@ def getPidList():
|
||||
return
|
||||
|
||||
|
||||
def getPower(device):
|
||||
def getPower(device, silent=False):
|
||||
""" Return the current power level of a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
power = c_uint32()
|
||||
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power))
|
||||
if rsmi_ret_ok(ret, device, 'get_power_avg'):
|
||||
if rsmi_ret_ok(ret, device, 'get_power_avg', silent):
|
||||
return power.value / 1000000
|
||||
return 'N/A'
|
||||
|
||||
|
||||
def getRasEnablement(device, block):
|
||||
def getRasEnablement(device, block, silent=True):
|
||||
""" Return RAS enablement state for a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param block: RAS block identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is on.
|
||||
"""
|
||||
state = rsmi_ras_err_state_t()
|
||||
ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state))
|
||||
|
||||
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True):
|
||||
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), silent):
|
||||
return rsmi_ras_err_stale_machine[state.value].upper()
|
||||
return 'N/A'
|
||||
|
||||
|
||||
def getTemp(device, sensor):
|
||||
def getTemp(device, sensor, silent=True):
|
||||
""" Display the current temperature from a given device's sensor
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param sensor: Temperature sensor identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is on.
|
||||
"""
|
||||
temp = c_int64(0)
|
||||
metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
|
||||
ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(temp))
|
||||
if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), True):
|
||||
if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), silent):
|
||||
return temp.value / 1000
|
||||
return 'N/A'
|
||||
|
||||
@@ -428,52 +448,60 @@ def findFirstAvailableTemp(device):
|
||||
continue
|
||||
return (ret_temp_type, ret_temp)
|
||||
|
||||
def getVbiosVersion(device):
|
||||
def getVbiosVersion(device, silent=False):
|
||||
""" Returns the VBIOS version for a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
vbios = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
|
||||
if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
return "Unsupported"
|
||||
elif rsmi_ret_ok(ret, device):
|
||||
elif rsmi_ret_ok(ret, device, silent=silent):
|
||||
return vbios.value.decode()
|
||||
|
||||
|
||||
def getVersion(deviceList, component):
|
||||
def getVersion(deviceList, component, silent=False):
|
||||
""" Return the software version for the specified component
|
||||
|
||||
@param deviceList: List of DRM devices (can be a single-item list)
|
||||
@param component: Component (currently only driver)
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
ver_str = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_version_str_get(component, ver_str, 256)
|
||||
if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component)):
|
||||
if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component), silent):
|
||||
return ver_str.value.decode()
|
||||
return None
|
||||
|
||||
|
||||
def getComputePartition(device):
|
||||
def getComputePartition(device, silent=True):
|
||||
""" Return the current compute partition of a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is on.
|
||||
"""
|
||||
currentComputePartition = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent) and currentComputePartition.value.decode():
|
||||
return str(currentComputePartition.value.decode())
|
||||
return "N/A"
|
||||
|
||||
|
||||
def getMemoryPartition(device):
|
||||
def getMemoryPartition(device, silent=True):
|
||||
""" Return the current memory partition of a given device
|
||||
|
||||
@param device: DRM device identifier
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is on.
|
||||
"""
|
||||
currentNPSMode = create_string_buffer(256)
|
||||
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
|
||||
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
|
||||
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent) and currentNPSMode.value.decode():
|
||||
return str(currentNPSMode.value.decode())
|
||||
return "N/A"
|
||||
|
||||
@@ -610,10 +638,21 @@ def printLog(device, metricName, value=None, extraSpace=False, useItalics=False)
|
||||
lock.acquire()
|
||||
if useItalics:
|
||||
logstr = italics + logstr + end
|
||||
if extraSpace:
|
||||
print('\n' + logstr + '\n', end='', flush=True)
|
||||
else:
|
||||
print(logstr + '\n', end='', flush=True)
|
||||
try:
|
||||
if extraSpace:
|
||||
print('\n', end='')
|
||||
print(logstr + '\n', end='')
|
||||
sys.stdout.flush()
|
||||
# when piped into programs like 'head' - print throws an error.
|
||||
# silently ignore instead
|
||||
except(BrokenPipeError, IOError):
|
||||
# https://docs.python.org/3/library/signal.html#note-on-sigpipe
|
||||
# Python flushes standard streams on exit; redirect remaining output
|
||||
# to devnull to avoid another BrokenPipeError at shutdown
|
||||
devnull = os.open(os.devnull, os.O_WRONLY)
|
||||
os.dup2(devnull, sys.stdout.fileno())
|
||||
sys.exit(1) # Python exits with error code 1 on EPIPE
|
||||
|
||||
lock.release()
|
||||
|
||||
|
||||
@@ -785,12 +824,10 @@ def resetFans(deviceList):
|
||||
for device in deviceList:
|
||||
sensor_ind = c_uint32(0)
|
||||
ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind)
|
||||
if (ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED) or (ret == rsmi_status_t.RSMI_STATUS_PERMISSION):
|
||||
if not rsmi_ret_ok(rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED, device, 'reset_fan'):
|
||||
continue
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
printLog(device, 'Successfully reset fan speed to driver control', None)
|
||||
else:
|
||||
if rsmi_ret_ok(ret, device, 'reset_fan'):
|
||||
printLog(device, 'Successfully reset fan speed to driver control', None)
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -1311,8 +1348,10 @@ def setFanSpeed(deviceList, fan):
|
||||
else:
|
||||
fanLevel = int(str(fan))
|
||||
ret = rocmsmi.rsmi_dev_fan_speed_set(device, 0, int(fanLevel))
|
||||
if rsmi_ret_ok(ret, device, 'set_fan_speed'):
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
printLog(device, 'Successfully set fan speed to level %s' % (str(int(fanLevel))), None)
|
||||
else:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -1595,10 +1634,13 @@ def showAllConcise(deviceList):
|
||||
MAX_ALL_CONCISE_WIDTH = 100
|
||||
appWidth_temp = appWidth
|
||||
appWidth = MAX_ALL_CONCISE_WIDTH
|
||||
silent = True
|
||||
|
||||
printLogSpacer(' Concise Info ')
|
||||
deviceList.sort()
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
temp_type = '(' + temp_type_lst[0] + ')'
|
||||
if len(deviceList) >= 1:
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
available_temp_type = temp_type.lower()
|
||||
available_temp_type = available_temp_type.replace('(', '')
|
||||
available_temp_type = available_temp_type.replace(')', '')
|
||||
@@ -1620,9 +1662,9 @@ def showAllConcise(deviceList):
|
||||
values = {}
|
||||
degree_sign = u'\N{DEGREE SIGN}'
|
||||
for device in deviceList:
|
||||
gpu_dev_product_info = getDevProductInfo(device)
|
||||
gpu_dev_product_info = getDevProductInfo(device, silent)
|
||||
gpu_dev_product_info_names = list(gpu_dev_product_info[device])
|
||||
temp_val = str(getTemp(device, available_temp_type))
|
||||
temp_val = str(getTemp(device, available_temp_type, silent))
|
||||
if temp_val != 'N/A':
|
||||
temp_val += degree_sign + 'C'
|
||||
avgPwr = str(getPower(device))
|
||||
@@ -1630,26 +1672,25 @@ def showAllConcise(deviceList):
|
||||
avgPwr += 'W'
|
||||
else:
|
||||
avgPwr = 'N/A'
|
||||
combined_partition = (getMemoryPartition(device) + ", "
|
||||
+ getComputePartition(device))
|
||||
concise = True
|
||||
sclk = showCurrentClocks([device], 'sclk', concise)
|
||||
mclk = showCurrentClocks([device], 'mclk', concise)
|
||||
(retCode, fanLevel, fanSpeed) = getFanSpeed(device)
|
||||
combined_partition = (getMemoryPartition(device, silent) + ", "
|
||||
+ getComputePartition(device, silent))
|
||||
sclk = showCurrentClocks([device], 'sclk', concise=silent)
|
||||
mclk = showCurrentClocks([device], 'mclk', concise=silent)
|
||||
(retCode, fanLevel, fanSpeed) = getFanSpeed(device, silent)
|
||||
fan = str(fanSpeed) + '%'
|
||||
if getPerfLevel(device) != -1:
|
||||
perf = getPerfLevel(device)
|
||||
if getPerfLevel(device, silent) != -1:
|
||||
perf = getPerfLevel(device, silent)
|
||||
else:
|
||||
perf = 'Unsupported'
|
||||
if getMaxPower(device) != -1:
|
||||
pwrCap = str(getMaxPower(device)) + 'W'
|
||||
if getMaxPower(device, silent) != -1:
|
||||
pwrCap = str(getMaxPower(device, silent)) + 'W'
|
||||
else:
|
||||
pwrCap = 'Unsupported'
|
||||
if getGpuUse(device) != -1:
|
||||
gpu_busy = str(getGpuUse(device)) + '%'
|
||||
if getGpuUse(device, silent) != -1:
|
||||
gpu_busy = str(getGpuUse(device, silent)) + '%'
|
||||
else:
|
||||
gpu_busy = 'Unsupported'
|
||||
vram_used, vram_total = getMemInfo(device, 'vram', True)
|
||||
vram_used, vram_total = getMemInfo(device, 'vram', silent)
|
||||
mem_use_pct = 0
|
||||
if vram_used is None:
|
||||
mem_use_pct='Unsupported'
|
||||
@@ -1683,7 +1724,7 @@ def showAllConcise(deviceList):
|
||||
for device in deviceList:
|
||||
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
|
||||
zip(range(len(max_widths)), values['card%s' % (str(device))])), None)
|
||||
gpu_dev_product_info = getDevProductInfo(device)
|
||||
gpu_dev_product_info = getDevProductInfo(device, silent)
|
||||
gpu_dev_product_info_names = list(gpu_dev_product_info[device])
|
||||
if (len(gpu_dev_product_info_names) > 1):
|
||||
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
|
||||
@@ -1707,19 +1748,20 @@ def showAllConciseHw(deviceList):
|
||||
header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS']
|
||||
head_widths = [len(head) + 2 for head in header]
|
||||
values = {}
|
||||
silent = True
|
||||
for device in deviceList:
|
||||
gpuid = getId(device)
|
||||
gpuid = getId(device, silent)
|
||||
if str(gpuid).startswith('0x'):
|
||||
gpuid = str(gpuid)[2:]
|
||||
gpurev = getRev(device)
|
||||
gpurev = getRev(device, silent)
|
||||
if str(gpurev).startswith('0x'):
|
||||
gpurev = str(gpurev)[2:]
|
||||
|
||||
gfxRas = getRasEnablement(device, 'GFX')
|
||||
sdmaRas = getRasEnablement(device, 'SDMA')
|
||||
umcRas = getRasEnablement(device, 'UMC')
|
||||
vbios = getVbiosVersion(device)
|
||||
bus = getBus(device)
|
||||
gfxRas = getRasEnablement(device, 'GFX', silent)
|
||||
sdmaRas = getRasEnablement(device, 'SDMA', silent)
|
||||
umcRas = getRasEnablement(device, 'UMC', silent)
|
||||
vbios = getVbiosVersion(device, silent)
|
||||
bus = getBus(device, silent)
|
||||
values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus]
|
||||
val_widths = {}
|
||||
for device in deviceList:
|
||||
@@ -1760,15 +1802,19 @@ def showClocks(deviceList):
|
||||
for clk_type in sorted(rsmi_clk_names_dict):
|
||||
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
|
||||
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
|
||||
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
|
||||
printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
|
||||
for x in range(freq.num_supported):
|
||||
fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
|
||||
if x == freq.current:
|
||||
printLog(device, str(x), str(fr) + ' *')
|
||||
else:
|
||||
printLog(device, str(x), str(fr))
|
||||
printLog(device, '', None)
|
||||
if ret == rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA:
|
||||
printLog(device, 'Clock [%s] on device [%s] exists but EMPTY! Likely driver error!' % (clk_type, str(device)))
|
||||
continue
|
||||
if not rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
|
||||
continue
|
||||
printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
|
||||
for x in range(freq.num_supported):
|
||||
fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
|
||||
if x == freq.current:
|
||||
printLog(device, str(x), str(fr) + ' *')
|
||||
else:
|
||||
printLog(device, str(x), str(fr))
|
||||
printLog(device, '', None)
|
||||
else:
|
||||
logging.debug('{} frequency is unsupported on device[{}]'.format(clk_type, device))
|
||||
printLog(device, '', None)
|
||||
@@ -1814,8 +1860,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
if concise: # in case function is used for concise output, no need to print.
|
||||
return '{:.0f}Mhz'.format(fr)
|
||||
printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(levl, fr))
|
||||
else:
|
||||
printErrLog(device, '%s clock is unsupported' % (clk_defined))
|
||||
elif not concise:
|
||||
logging.debug('{} clock is unsupported on device[{}]'.format(clk_defined, device))
|
||||
|
||||
else: # if clk is not defined, will display all current clk
|
||||
for clk_type in sorted(rsmi_clk_names_dict):
|
||||
@@ -1832,7 +1878,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
printLog(device, '%s clock level:' % (clk_type), levl)
|
||||
else:
|
||||
printLog(device, '%s clock level: %s' % (clk_type, levl), '(%sMhz)' % (str(fr)[:-2]))
|
||||
else:
|
||||
elif not concise:
|
||||
logging.debug('{} clock is unsupported on device[{}]'.format(clk_type, device))
|
||||
# pcie clocks
|
||||
if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
|
||||
@@ -1845,9 +1891,10 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000,
|
||||
bw.lanes[current_f])
|
||||
printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr))
|
||||
else:
|
||||
logging.debug('PCIe clock is unsupported on device[{}]'.format(device))
|
||||
printLogSpacer()
|
||||
elif not concise:
|
||||
logging.debug('{} clock is unsupported on device[{}]'.format('PCIe', device))
|
||||
if not concise:
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
def showCurrentFans(deviceList):
|
||||
@@ -2113,6 +2160,7 @@ def showMemUse(deviceList):
|
||||
@param deviceList: List of DRM devices (can be a single-item list)
|
||||
"""
|
||||
memoryUse = c_uint64()
|
||||
avgMemBandwidth = c_uint16()
|
||||
printLogSpacer(' Current Memory Use ')
|
||||
for device in deviceList:
|
||||
ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
|
||||
@@ -2124,6 +2172,12 @@ def showMemUse(deviceList):
|
||||
printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val)
|
||||
else:
|
||||
printLog(device, 'Memory Activity', 'N/A')
|
||||
|
||||
ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgMemBandwidth))
|
||||
if rsmi_ret_ok(ret, device, silent=True):
|
||||
printLog(device, 'Avg. Memory Bandwidth', avgMemBandwidth.value)
|
||||
else:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
@@ -2404,47 +2458,51 @@ def showProductName(deviceList):
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
def getDevProductInfo(device):
|
||||
def getDevProductInfo(device, silent=False):
|
||||
""" Show the requested product name for the device requested
|
||||
|
||||
@param device: Device we want to get the info for
|
||||
@param silent=Turn on to silence error output
|
||||
(you plan to handle manually). Default is off.
|
||||
"""
|
||||
|
||||
# Retrieve card vendor
|
||||
MAX_BUFF_SIZE = 256
|
||||
MAX_DESC_SIZE = 20
|
||||
device_info = "N/A"
|
||||
device_series = "N/A"
|
||||
device_model = "N/A"
|
||||
gpu_revision = "N/A"
|
||||
device_list = {}
|
||||
vendor = create_string_buffer(MAX_BUFF_SIZE)
|
||||
ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, MAX_BUFF_SIZE)
|
||||
# Only continue if GPU vendor is AMD
|
||||
if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device):
|
||||
if rsmi_ret_ok(ret, device, 'get_vendor_name', silent) and isAmdDevice(device):
|
||||
# Retrieve the device series
|
||||
series = create_string_buffer(MAX_BUFF_SIZE)
|
||||
ret = rocmsmi.rsmi_dev_name_get(device, series, MAX_BUFF_SIZE)
|
||||
if rsmi_ret_ok(ret, device, 'get_name'):
|
||||
if rsmi_ret_ok(ret, device, 'get_name', silent):
|
||||
try:
|
||||
device_series = series.value.decode()
|
||||
except UnicodeDecodeError:
|
||||
device_series = "N/A"
|
||||
printErrLog(device, "Unable to read card series")
|
||||
if not silent:
|
||||
printErrLog(device, "Unable to read card series")
|
||||
|
||||
# Retrieve the device model
|
||||
model = create_string_buffer(MAX_BUFF_SIZE)
|
||||
ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, MAX_BUFF_SIZE)
|
||||
if rsmi_ret_ok(ret, device, 'get_subsystem_name'):
|
||||
if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent):
|
||||
try:
|
||||
device_model = model.value.decode()
|
||||
device_model = padHexValue(device_model, 4)
|
||||
except UnicodeDecodeError:
|
||||
device_model = "N/A"
|
||||
printErrLog(device, "Unable to read device model")
|
||||
if not silent:
|
||||
printErrLog(device, "Unable to read device model")
|
||||
|
||||
try:
|
||||
gpu_revision = padHexValue(getRev(device), 2)
|
||||
except Exception as exc:
|
||||
gpu_revision = "N/A"
|
||||
printErrLog(device, "Unable to read card revision %s" % (exc))
|
||||
if not silent:
|
||||
printErrLog(device, "Unable to read card revision %s" % (exc))
|
||||
|
||||
device_series_str = str(device_series[:MAX_DESC_SIZE])
|
||||
device_series_str = device_series_str.ljust(MAX_DESC_SIZE, ' ')
|
||||
@@ -2790,7 +2848,9 @@ def getGraphColor(percentage):
|
||||
|
||||
def showTempGraph(deviceList):
|
||||
deviceList.sort()
|
||||
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
|
||||
temp_type = '(' + temp_type_lst[0] + ')'
|
||||
if len(deviceList) >= 1:
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
|
||||
temp_type = temp_type.lower()
|
||||
temp_type = temp_type.replace('(', '')
|
||||
@@ -3381,7 +3441,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
|
||||
@param my_ret: Return of RSMI call (rocm_smi_lib API)
|
||||
@param metric: Parameter of GPU currently being analyzed
|
||||
@param silent: Echo verbose error reponse.
|
||||
True siliences err output, False does not silience err output (default).
|
||||
True silences err output, False does not silence err output (default).
|
||||
"""
|
||||
global RETCODE
|
||||
global PRINT_JSON
|
||||
@@ -3398,8 +3458,8 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
|
||||
if err_str.value is not None:
|
||||
returnString += '%s\t' % (err_str.value.decode())
|
||||
if not PRINT_JSON:
|
||||
logging.debug('%s', returnString)
|
||||
if not silent:
|
||||
logging.debug('%s', returnString)
|
||||
if my_ret in rsmi_status_verbose_err_out:
|
||||
printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None)
|
||||
RETCODE = my_ret
|
||||
@@ -3465,8 +3525,7 @@ def save(deviceList, savefilepath):
|
||||
# The code below is for when this script is run as an executable instead of when imported as a module
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description='AMD ROCm System Management Interface | ROCM-SMI version: %s | Kernel version: %s' % (
|
||||
__version__, getVersion(None, rsmi_sw_component_t.RSMI_SW_COMP_DRIVER)),
|
||||
description=f'AMD ROCm System Management Interface | ROCM-SMI version: {__version__}',
|
||||
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=90, width=120))
|
||||
groupDev = parser.add_argument_group()
|
||||
groupDisplayOpt = parser.add_argument_group('Display Options')
|
||||
@@ -3626,6 +3685,11 @@ if __name__ == '__main__':
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Must set PRINT_JSON early so the prints can be silenced
|
||||
if args.json or args.csv:
|
||||
PRINT_JSON = True
|
||||
# Initialize rsmiBindings
|
||||
rocmsmi = initRsmiBindings(silent=PRINT_JSON)
|
||||
# Initialize the rocm SMI library
|
||||
initializeRsmi()
|
||||
|
||||
@@ -3661,8 +3725,7 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
|
||||
# If we want JSON/CSV output, initialize the keys (devices)
|
||||
if args.json or args.csv:
|
||||
PRINT_JSON = True
|
||||
if PRINT_JSON:
|
||||
for device in deviceList:
|
||||
JSON_DATA['card' + str(device)] = {}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
"""ROCm_SMI_LIB CLI Tool Python Bindings"""
|
||||
# NOTE: You MUST call rsmiBindings.initRsmiBindings() when using this library!
|
||||
# TODO: Get most (or all) of these from rocm_smi.h to avoid mismatches and redundancy
|
||||
|
||||
from __future__ import print_function
|
||||
@@ -14,36 +15,42 @@ import os
|
||||
# relative path changed accordingly.
|
||||
# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location
|
||||
#
|
||||
# Library load is wrapped in a function so prints can be hidden for PRINT_JSON mode.
|
||||
path_librocm = str()
|
||||
rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
|
||||
if (rocm_smi_lib_path != None):
|
||||
path_librocm = rocm_smi_lib_path
|
||||
else:
|
||||
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
|
||||
def initRsmiBindings(silent=False):
|
||||
def print_silent(*args):
|
||||
if not silent:
|
||||
print(args)
|
||||
|
||||
if not os.path.isfile(path_librocm):
|
||||
print('Unable to find %s . Trying /opt/rocm*' % path_librocm)
|
||||
for root, dirs, files in os.walk('/opt', followlinks=True):
|
||||
if 'librocm_smi64.so.@VERSION_MAJOR@' in files:
|
||||
path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@')
|
||||
if os.path.isfile(path_librocm):
|
||||
print('Using lib from %s' % path_librocm)
|
||||
rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
|
||||
if (rocm_smi_lib_path != None):
|
||||
path_librocm = rocm_smi_lib_path
|
||||
else:
|
||||
print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')
|
||||
else:
|
||||
print('Library loaded from: %s ' % path_librocm)
|
||||
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
|
||||
|
||||
# ----------> TODO: Support static libs as well as SO
|
||||
try:
|
||||
cdll.LoadLibrary(path_librocm)
|
||||
rocmsmi = CDLL(path_librocm)
|
||||
except OSError:
|
||||
print('Unable to load the rocm_smi library.\n'\
|
||||
'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\
|
||||
'{0}Please refer to https://github.com/'\
|
||||
'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\
|
||||
.format('\33[33m', '\033[0m'))
|
||||
exit()
|
||||
if not os.path.isfile(path_librocm):
|
||||
print_silent('Unable to find %s . Trying /opt/rocm*' % path_librocm)
|
||||
for root, dirs, files in os.walk('/opt', followlinks=True):
|
||||
if 'librocm_smi64.so.@VERSION_MAJOR@' in files:
|
||||
path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@')
|
||||
if os.path.isfile(path_librocm):
|
||||
print_silent('Using lib from %s' % path_librocm)
|
||||
else:
|
||||
print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')
|
||||
else:
|
||||
print_silent('Library loaded from: %s ' % path_librocm)
|
||||
|
||||
# ----------> TODO: Support static libs as well as SO
|
||||
try:
|
||||
cdll.LoadLibrary(path_librocm)
|
||||
return CDLL(path_librocm)
|
||||
except OSError:
|
||||
print('Unable to load the rocm_smi library.\n'\
|
||||
'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\
|
||||
'{0}Please refer to https://github.com/'\
|
||||
'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\
|
||||
.format('\33[33m', '\033[0m'))
|
||||
exit()
|
||||
|
||||
# Device ID
|
||||
dv_id = c_uint64()
|
||||
|
||||
Dosya farkı çok büyük olduğundan ihmal edildi
Fark Yükle
@@ -41,20 +41,20 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <unistd.h>
|
||||
#include <asm/unistd.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
@@ -164,8 +164,7 @@ GetSupportedEventGroups(uint32_t dev_num, dev_evt_grp_set_t *supported_grps) {
|
||||
}
|
||||
// /sys/bus/event_source/devices/<hw block>_<instance>/type
|
||||
Event::Event(rsmi_event_type_t event, uint32_t dev_ind) :
|
||||
event_type_(event), prev_cntr_val_(0) {
|
||||
fd_ = -1;
|
||||
event_type_(event), fd_(-1), prev_cntr_val_(0) {
|
||||
rsmi_event_group_t grp = EvtGrpFromEvtID(event);
|
||||
assert(grp != RSMI_EVNT_GRP_INVALID); // This should have failed before now
|
||||
|
||||
@@ -398,10 +397,11 @@ readn(int fd, void *buf, size_t n) {
|
||||
return static_cast<ssize_t>(n - left);
|
||||
}
|
||||
if (bytes < 0) {
|
||||
if (errno == EINTR) /* read got interrupted */
|
||||
if (errno == EINTR) {
|
||||
/* read got interrupted */
|
||||
continue;
|
||||
else
|
||||
return -errno;
|
||||
}
|
||||
return -errno;
|
||||
}
|
||||
|
||||
left -= static_cast<size_t>(bytes);
|
||||
|
||||
@@ -43,30 +43,28 @@
|
||||
|
||||
#include <pthread.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <assert.h>
|
||||
#include <sys/stat.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_kfd.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
#include "shared_mutex.h" // NOLINT
|
||||
|
||||
@@ -611,7 +609,6 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
bool reg_file;
|
||||
|
||||
int ret = isRegularFile(sysfs_path, &reg_file);
|
||||
|
||||
if (ret != 0) {
|
||||
ss << "File did not exist - SYSFS file (" << sysfs_path
|
||||
<< ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type)
|
||||
@@ -708,7 +705,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) {
|
||||
int ret;
|
||||
std::ostringstream ss;
|
||||
|
||||
fs.rdbuf()->pubsetbuf(0,0);
|
||||
fs.rdbuf()->pubsetbuf(nullptr,0);
|
||||
ret = openSysfsFileStream(type, &fs, valStr.c_str());
|
||||
if (ret != 0) {
|
||||
ss << "Could not write device info string (" << valStr
|
||||
@@ -822,7 +819,8 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
FILE *ptr;
|
||||
sysfs_path += "/device/";
|
||||
sysfs_path += kDevAttribNameMap.at(type);
|
||||
ptr = fopen(sysfs_path.c_str(), "rb");
|
||||
|
||||
ptr = fopen(sysfs_path.c_str(), "rb");
|
||||
if (!ptr) {
|
||||
ss << "Could not read DevInfoBinary for DevInfoType ("
|
||||
<< RocmSMI::devInfoTypesStrings.at(type) << ")"
|
||||
@@ -874,21 +872,21 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
|
||||
retVec->push_back(line);
|
||||
}
|
||||
|
||||
if (retVec->size() == 0) {
|
||||
if (retVec->empty()) {
|
||||
ss << "Read devInfoMultiLineStr for DevInfoType ("
|
||||
<< RocmSMI::devInfoTypesStrings.at(type) << ")"
|
||||
<< ", but contained no string lines";
|
||||
LOG_INFO(ss);
|
||||
return 0;
|
||||
LOG_ERROR(ss);
|
||||
return ENXIO;
|
||||
}
|
||||
// Remove any *trailing* empty (whitespace) lines
|
||||
while (retVec->size() != 0 &&
|
||||
while (!retVec->empty() &&
|
||||
retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) {
|
||||
retVec->pop_back();
|
||||
}
|
||||
|
||||
// allow logging output of multiline strings
|
||||
for (auto l: *retVec) {
|
||||
for (const auto& l: *retVec) {
|
||||
allLines += "\n" + l;
|
||||
}
|
||||
|
||||
@@ -902,6 +900,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
|
||||
<< RocmSMI::devInfoTypesStrings.at(type) << ")"
|
||||
<< ", but lines were empty";
|
||||
LOG_INFO(ss);
|
||||
return ENXIO;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -924,10 +923,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
|
||||
if (tempStr == "") {
|
||||
if (tempStr.empty()) {
|
||||
return EINVAL;
|
||||
}
|
||||
tmp_val = std::stoi(tempStr, 0, 16);
|
||||
tmp_val = std::stoi(tempStr, nullptr, 16);
|
||||
if (tmp_val < 0) {
|
||||
return EINVAL;
|
||||
}
|
||||
@@ -949,10 +948,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
case kDevXGMIError:
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
if (tempStr == "") {
|
||||
if (tempStr.empty()) {
|
||||
return EINVAL;
|
||||
}
|
||||
*val = std::stoul(tempStr, 0);
|
||||
*val = std::stoul(tempStr, nullptr);
|
||||
break;
|
||||
|
||||
case kDevUniqueId:
|
||||
@@ -979,10 +978,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
case kDevFwVersionVcn:
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
if (tempStr == "") {
|
||||
if (tempStr.empty()) {
|
||||
return EINVAL;
|
||||
}
|
||||
*val = std::stoul(tempStr, 0, 16);
|
||||
*val = std::stoul(tempStr, nullptr, 16);
|
||||
break;
|
||||
|
||||
case kDevGpuReset:
|
||||
@@ -1120,13 +1119,9 @@ void Device::DumpSupportedFunctions(void) {
|
||||
}
|
||||
|
||||
void Device::fillSupportedFuncs(void) {
|
||||
if (supported_funcs_.size() != 0) {
|
||||
if (!supported_funcs_.empty()) {
|
||||
return;
|
||||
}
|
||||
if (monitor() == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::map<const char *, dev_depends_t>::const_iterator it =
|
||||
kDevFuncDependsMap.begin();
|
||||
std::string dev_rt = path_ + "/device";
|
||||
@@ -1160,7 +1155,7 @@ void Device::fillSupportedFuncs(void) {
|
||||
std::vector<DevInfoTypes>::const_iterator var =
|
||||
it->second.variants.begin();
|
||||
|
||||
if (it->second.variants.size() == 0) {
|
||||
if (it->second.variants.empty()) {
|
||||
supported_funcs_[it->first] = nullptr;
|
||||
it++;
|
||||
continue;
|
||||
@@ -1176,13 +1171,15 @@ void Device::fillSupportedFuncs(void) {
|
||||
(*supported_variants)[kDevInfoVarTypeToRSMIVariant.at(*var)] = nullptr;
|
||||
}
|
||||
|
||||
if ((*supported_variants).size() > 0) {
|
||||
if (!(*supported_variants).empty()) {
|
||||
supported_funcs_[it->first] = supported_variants;
|
||||
}
|
||||
|
||||
it++;
|
||||
}
|
||||
monitor()->fillSupportedFuncs(&supported_funcs_);
|
||||
if (monitor() != nullptr) {
|
||||
monitor()->fillSupportedFuncs(&supported_funcs_);
|
||||
}
|
||||
// DumpSupportedFunctions();
|
||||
}
|
||||
|
||||
@@ -1222,35 +1219,32 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant,
|
||||
|
||||
if (sub_variant == RSMI_DEFAULT_VARIANT) {
|
||||
return true;
|
||||
} else { // sub_variant != RSMI_DEFAULT_VARIANT
|
||||
// if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr
|
||||
assert(var_it->second != nullptr);
|
||||
}
|
||||
// sub_variant != RSMI_DEFAULT_VARIANT
|
||||
// if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr
|
||||
assert(var_it->second != nullptr);
|
||||
|
||||
return subvariant_match(&(var_it->second), sub_variant);
|
||||
}
|
||||
} else { // variant == RSMI_DEFAULT_VARIANT
|
||||
if (func_it->second != nullptr) {
|
||||
var_it = func_it->second->find(variant);
|
||||
}
|
||||
if (sub_variant == RSMI_DEFAULT_VARIANT) {
|
||||
return true;
|
||||
} else { // sub_variant != RSMI_DEFAULT_VARIANT
|
||||
if (func_it->second == nullptr) {
|
||||
return false;
|
||||
}
|
||||
return subvariant_match(&(var_it->second), sub_variant);
|
||||
}
|
||||
return subvariant_match(&(var_it->second), sub_variant);
|
||||
}
|
||||
assert(false); // We should not reach here
|
||||
|
||||
return false;
|
||||
// variant == RSMI_DEFAULT_VARIANT
|
||||
if (func_it->second != nullptr) {
|
||||
var_it = func_it->second->find(variant);
|
||||
}
|
||||
if (sub_variant == RSMI_DEFAULT_VARIANT) {
|
||||
return true;
|
||||
}
|
||||
// sub_variant != RSMI_DEFAULT_VARIANT
|
||||
if (func_it->second == nullptr) {
|
||||
return false;
|
||||
}
|
||||
return subvariant_match(&(var_it->second), sub_variant);
|
||||
}
|
||||
|
||||
rsmi_status_t Device::restartAMDGpuDriver(void) {
|
||||
REQUIRE_ROOT_ACCESS
|
||||
bool restartSuccessful = true;
|
||||
bool success = false;
|
||||
std::string out = "";
|
||||
std::string out;
|
||||
bool wasGdmServiceActive = false;
|
||||
|
||||
// sudo systemctl is-active gdm
|
||||
|
||||
@@ -41,24 +41,22 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <dirent.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <regex> // NOLINT
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_monitor.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
@@ -151,7 +149,7 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
|
||||
const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2,
|
||||
const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3,
|
||||
const rsmi_gpu_metrics_t *rsmi_gpu_metrics) {
|
||||
if (RocmSMI::getInstance().isLoggingOn() == false) {
|
||||
if (!RocmSMI::getInstance().isLoggingOn()) {
|
||||
return;
|
||||
}
|
||||
std::ostringstream ss;
|
||||
@@ -171,9 +169,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
|
||||
}
|
||||
if (rsmi_gpu_metrics == nullptr) {
|
||||
return;
|
||||
} else {
|
||||
// do nothing - continue
|
||||
}
|
||||
|
||||
ss
|
||||
/* Common Header */
|
||||
<< print_unsigned_hex_and_int(
|
||||
@@ -291,8 +288,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
|
||||
rsmi_gpu_metrics->gfx_activity_acc,
|
||||
"rsmi_gpu_metrics->gfx_activity_acc")
|
||||
<< print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->mem_actvity_acc,
|
||||
"rsmi_gpu_metrics->mem_actvity_acc");
|
||||
rsmi_gpu_metrics->mem_activity_acc,
|
||||
"rsmi_gpu_metrics->mem_activity_acc");
|
||||
for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) {
|
||||
ss << print_unsigned_hex_and_int(
|
||||
rsmi_gpu_metrics->temperature_hbm[i],
|
||||
@@ -366,7 +363,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
|
||||
}
|
||||
|
||||
#define ASSIGN_DATA_FIELD(FIELD, SRC) \
|
||||
data->FIELD = SRC->FIELD;
|
||||
data->FIELD = (SRC)->FIELD;
|
||||
|
||||
#define ASSIGN_COMMON_FORMATS(SRC) \
|
||||
ASSIGN_DATA_FIELD(common_header, (SRC)) \
|
||||
@@ -417,7 +414,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
|
||||
|
||||
// These fields didn't exist in v0
|
||||
data->gfx_activity_acc = 0;
|
||||
data->mem_actvity_acc = 0;
|
||||
data->mem_activity_acc = 0;
|
||||
(void)memset(data->temperature_hbm, 0,
|
||||
RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t));
|
||||
} // else handle other conversions to format 1
|
||||
|
||||
@@ -41,20 +41,19 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <sys/stat.h>
|
||||
#include <dirent.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <fstream>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_io_link.h"
|
||||
|
||||
@@ -161,7 +160,7 @@ static int ReadLinkProperties(uint32_t node_indx, uint32_t link_indx,
|
||||
retVec->push_back(line);
|
||||
}
|
||||
|
||||
if (retVec->size() == 0) {
|
||||
if (retVec->empty()) {
|
||||
fs.close();
|
||||
return 0;
|
||||
}
|
||||
@@ -182,7 +181,7 @@ static int DiscoverLinks(std::map<std::pair<uint32_t, uint32_t>,
|
||||
if (links == nullptr) {
|
||||
return EINVAL;
|
||||
}
|
||||
assert(links->size() == 0);
|
||||
assert(links->empty());
|
||||
|
||||
links->clear();
|
||||
|
||||
@@ -229,8 +228,8 @@ static int DiscoverLinks(std::map<std::pair<uint32_t, uint32_t>,
|
||||
}
|
||||
|
||||
link_indx = static_cast<uint32_t>(std::stoi(dentry_io_link->d_name));
|
||||
link = std::shared_ptr<IOLink>(new IOLink(node_indx, link_indx,
|
||||
directory));
|
||||
link = std::make_shared<IOLink>(node_indx, link_indx,
|
||||
directory);
|
||||
|
||||
link->Initialize();
|
||||
|
||||
@@ -273,7 +272,7 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map<uint32_t,
|
||||
if (links == nullptr) {
|
||||
return EINVAL;
|
||||
}
|
||||
assert(links->size() == 0);
|
||||
assert(links->empty());
|
||||
|
||||
links->clear();
|
||||
|
||||
@@ -297,8 +296,8 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map<uint32_t,
|
||||
}
|
||||
|
||||
link_indx = static_cast<uint32_t>(std::stoi(dentry->d_name));
|
||||
link = std::shared_ptr<IOLink>(new IOLink(node_indx, link_indx,
|
||||
directory));
|
||||
link = std::make_shared<IOLink>(node_indx, link_indx,
|
||||
directory);
|
||||
|
||||
link->Initialize();
|
||||
|
||||
@@ -323,16 +322,15 @@ int DiscoverP2PLinksPerNode(uint32_t node_indx, std::map<uint32_t,
|
||||
return DiscoverLinksPerNode(node_indx, links, P2P_LINK_DIRECTORY);
|
||||
}
|
||||
|
||||
IOLink::~IOLink() {
|
||||
}
|
||||
IOLink::~IOLink() = default;
|
||||
|
||||
int IOLink::ReadProperties(void) {
|
||||
int ret;
|
||||
|
||||
std::vector<std::string> propVec;
|
||||
|
||||
assert(properties_.size() == 0);
|
||||
if (properties_.size() > 0) {
|
||||
assert(properties_.empty());
|
||||
if (!properties_.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -347,8 +345,8 @@ int IOLink::ReadProperties(void) {
|
||||
uint64_t val_int; // Assume all properties are unsigned integers for now
|
||||
std::istringstream fs;
|
||||
|
||||
for (uint32_t i = 0; i < propVec.size(); ++i) {
|
||||
fs.str(propVec[i]);
|
||||
for (const auto & i : propVec) {
|
||||
fs.str(i);
|
||||
fs >> key_str;
|
||||
fs >> val_int;
|
||||
|
||||
|
||||
@@ -41,28 +41,29 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <dirent.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <fstream>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "rocm_smi/rocm_smi_io_link.h"
|
||||
#include "rocm_smi/rocm_smi_kfd.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
@@ -195,7 +196,7 @@ int ReadKFDDeviceProperties(uint32_t kfd_node_id,
|
||||
retVec->push_back(line);
|
||||
}
|
||||
|
||||
if (retVec->size() == 0) {
|
||||
if (retVec->empty()) {
|
||||
fs.close();
|
||||
return ENOENT;
|
||||
}
|
||||
@@ -517,7 +518,7 @@ int DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes) {
|
||||
if (nodes == nullptr) {
|
||||
return EINVAL;
|
||||
}
|
||||
assert(nodes->size() == 0);
|
||||
assert(nodes->empty());
|
||||
|
||||
nodes->clear();
|
||||
|
||||
@@ -548,7 +549,7 @@ int DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes) {
|
||||
continue;
|
||||
}
|
||||
|
||||
node = std::shared_ptr<KFDNode>(new KFDNode(node_indx));
|
||||
node = std::make_shared<KFDNode>(node_indx);
|
||||
|
||||
node->Initialize();
|
||||
|
||||
@@ -596,16 +597,15 @@ int DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
KFDNode::~KFDNode() {
|
||||
}
|
||||
KFDNode::~KFDNode() = default;
|
||||
|
||||
int KFDNode::ReadProperties(void) {
|
||||
int ret;
|
||||
|
||||
std::vector<std::string> propVec;
|
||||
|
||||
assert(properties_.size() == 0);
|
||||
if (properties_.size() > 0) {
|
||||
assert(properties_.empty());
|
||||
if (!properties_.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -620,8 +620,8 @@ int KFDNode::ReadProperties(void) {
|
||||
uint64_t val_int; // Assume all properties are unsigned integers for now
|
||||
std::istringstream fs;
|
||||
|
||||
for (uint32_t i = 0; i < propVec.size(); ++i) {
|
||||
fs.str(propVec[i]);
|
||||
for (const auto & i : propVec) {
|
||||
fs.str(i);
|
||||
fs >> key_str;
|
||||
fs >> val_int;
|
||||
|
||||
@@ -776,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth,
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties
|
||||
// size_in_bytes 68702699520
|
||||
int KFDNode::get_total_memory(uint64_t* total) {
|
||||
if (total == nullptr) return EINVAL;
|
||||
std::ostringstream ss;
|
||||
if (total == nullptr) {
|
||||
return EINVAL;
|
||||
}
|
||||
*total = 0;
|
||||
|
||||
std::string f_path = kKFDNodesPathRoot;
|
||||
f_path += "/";
|
||||
f_path += std::to_string(node_indx_);
|
||||
f_path += "/mem_banks";
|
||||
int subDirCount = subDirectoryCountInPath(f_path);
|
||||
ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path
|
||||
<< " has subdirectory count = " << std::to_string(subDirCount);
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
auto kfd_node_dir = opendir(f_path.c_str());
|
||||
if (kfd_node_dir == nullptr) {
|
||||
return errno;
|
||||
}
|
||||
auto dentry = readdir(kfd_node_dir);
|
||||
while (dentry != nullptr) {
|
||||
while (dentry != nullptr && subDirCount > 0) {
|
||||
ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path
|
||||
<< " has subdirectory count = " << std::to_string(subDirCount);
|
||||
LOG_DEBUG(ss);
|
||||
if (dentry->d_name[0] == '.') {
|
||||
dentry = readdir(kfd_node_dir);
|
||||
continue;
|
||||
@@ -823,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) {
|
||||
}
|
||||
}
|
||||
} // end loop for lines in property file
|
||||
subDirCount--;
|
||||
} // end loop for mem_bank directory
|
||||
|
||||
if (closedir(kfd_node_dir)) {
|
||||
@@ -863,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/properties
|
||||
int read_node_properties(uint32_t node, std::string property_name,
|
||||
uint64_t *val) {
|
||||
std::ostringstream ss;
|
||||
int retVal = EINVAL;
|
||||
if (property_name.empty() || val == nullptr) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", property_name is empty or *val is nullptr "
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return retVal;
|
||||
}
|
||||
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
|
||||
myNode->Initialize();
|
||||
if (KFDNodeSupported(node)) {
|
||||
retVal = myNode->get_property_value(property_name, val);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Successfully read node #" << std::to_string(node)
|
||||
<< " for property_name = " << property_name
|
||||
<< " | Data (" << property_name << ") * val = "
|
||||
<< std::to_string(*val)
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
} else {
|
||||
retVal = 1;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", KFD node was an unsupported node."
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_ERROR(ss);
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/gpu_id
|
||||
int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
|
||||
std::ostringstream ss;
|
||||
int retVal = EINVAL;
|
||||
if (gpu_id == nullptr) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", gpu_id is a nullptr "
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return retVal;
|
||||
}
|
||||
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
|
||||
myNode->Initialize();
|
||||
if (KFDNodeSupported(node)) {
|
||||
retVal = ReadKFDGpuId(node, gpu_id);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Successfully read node #" << std::to_string(node)
|
||||
<< " for gpu_id"
|
||||
<< " | Data (gpu_id) *gpu_id = "
|
||||
<< std::to_string(*gpu_id)
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
} else {
|
||||
retVal = 1;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", KFD node was an unsupported node."
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_ERROR(ss);
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -55,7 +55,7 @@
|
||||
* be printed, unless RSMI_LOGGING is enabled.
|
||||
*
|
||||
* BUFFER log type should be used while logging raw buffer or raw messages
|
||||
* Having direct interface as well as C++ Singleton inface. Can use
|
||||
* Having direct interface as well as C++ Singleton interface. Can use
|
||||
* whatever interface fits your needs.
|
||||
*/
|
||||
|
||||
@@ -70,7 +70,6 @@
|
||||
// Code Specific Header Files(s)
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
|
||||
using namespace ROCmLogging;
|
||||
|
||||
@@ -117,7 +116,7 @@ void Logger::logIntoFile(std::string& data) {
|
||||
if(!m_File.is_open()) {
|
||||
initialize_resources();
|
||||
if (!m_File.is_open()) {
|
||||
std::cout << "WARNING: re-initializing resources was unsuccessfull."
|
||||
std::cout << "WARNING: re-initializing resources was unsuccessful."
|
||||
<<" Unable to print the following message." << std::endl;
|
||||
logOnConsole(data);
|
||||
unlock();
|
||||
@@ -164,7 +163,7 @@ void Logger::error(const char* text) throw() {
|
||||
// By default, logging is disabled
|
||||
// The check below allows us to toggle logging through RSMI_LOGGING
|
||||
// set or unset
|
||||
if (m_loggingIsOn == false) {
|
||||
if (!m_loggingIsOn) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -198,7 +197,7 @@ void Logger::alarm(const char* text) throw() {
|
||||
// By default, logging is disabled (ie. no RSMI_LOGGING)
|
||||
// The check below allows us to toggle logging through RSMI_LOGGING
|
||||
// set or unset
|
||||
if (m_loggingIsOn == false) {
|
||||
if (!m_loggingIsOn) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -232,7 +231,7 @@ void Logger::always(const char* text) throw() {
|
||||
// By default, logging is disabled (ie. no RSMI_LOGGING)
|
||||
// The check below allows us to toggle logging through RSMI_LOGGING
|
||||
// set or unset
|
||||
if (m_loggingIsOn == false) {
|
||||
if (!m_loggingIsOn) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -270,7 +269,7 @@ void Logger::buffer(const char* text) throw() {
|
||||
if(!m_File.is_open()) {
|
||||
initialize_resources();
|
||||
if (!m_File.is_open()) {
|
||||
std::cout << "WARNING: re-initializing resources was unsuccessfull."
|
||||
std::cout << "WARNING: re-initializing resources was unsuccessful."
|
||||
<<" Unable to print the following message." << std::endl;
|
||||
std::string txtStr(text);
|
||||
std::cout << txtStr << std::endl;
|
||||
@@ -300,7 +299,7 @@ void Logger::info(const char* text) throw() {
|
||||
// By default, logging is disabled (ie. no RSMI_LOGGING)
|
||||
// The check below allows us to toggle logging through RSMI_LOGGING
|
||||
// set or unset
|
||||
if (m_loggingIsOn == false) {
|
||||
if (!m_loggingIsOn) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -334,7 +333,7 @@ void Logger::trace(const char* text) throw() {
|
||||
// By default, logging is disabled (ie. no RSMI_LOGGING)
|
||||
// The check below allows us to toggle logging through RSMI_LOGGING
|
||||
// set or unset
|
||||
if (m_loggingIsOn == false) {
|
||||
if (!m_loggingIsOn) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -368,7 +367,7 @@ void Logger::debug(const char* text) throw() {
|
||||
// By default, logging is disabled (ie. no RSMI_LOGGING)
|
||||
// The check below allows us to toggle logging through RSMI_LOGGING
|
||||
// set or unset
|
||||
if (m_loggingIsOn == false) {
|
||||
if (!m_loggingIsOn) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -426,7 +425,7 @@ void Logger::enableFileLogging() {
|
||||
|
||||
// Returns a string of details on current log settings
|
||||
std::string Logger::getLogSettings() {
|
||||
std::string logSettings = "";
|
||||
std::string logSettings;
|
||||
|
||||
if (m_File.is_open()) {
|
||||
logSettings += "OpenStatus = File (" + logFileName + ") is open";
|
||||
@@ -490,7 +489,7 @@ void Logger::initialize_resources() {
|
||||
// The check below allows us to toggle logging through RSMI_LOGGING
|
||||
// set or unset
|
||||
m_loggingIsOn = amd::smi::RocmSMI::getInstance().isLoggingOn();
|
||||
if (m_loggingIsOn == false) {
|
||||
if (!m_loggingIsOn) {
|
||||
return;
|
||||
}
|
||||
m_File.open(logFileName.c_str(), std::ios::out | std::ios::app);
|
||||
|
||||
@@ -39,25 +39,26 @@
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#include <dirent.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <utility>
|
||||
#include <functional>
|
||||
#include <dirent.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cerrno>
|
||||
#include <unordered_map>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
@@ -285,7 +286,8 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
|
||||
|
||||
// We are looking for the last element in the path that has the form
|
||||
// XXXX:XX:XX.X, where X is a hex integer (lower case is expected)
|
||||
std::size_t slash_i, end_i;
|
||||
std::size_t slash_i;
|
||||
std::size_t end_i;
|
||||
std::string tmp;
|
||||
|
||||
std::string tpath_str(tpath);
|
||||
@@ -332,9 +334,9 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
|
||||
GetEnvVariables();
|
||||
// To help debug env variable issues
|
||||
// printEnvVarInfo();
|
||||
// debugRSMIEnvVarInfo();
|
||||
|
||||
while (std::string(kAMDMonitorTypes[i]) != "") {
|
||||
while (!std::string(kAMDMonitorTypes[i]).empty()) {
|
||||
amd_monitor_types_.insert(kAMDMonitorTypes[i]);
|
||||
++i;
|
||||
}
|
||||
@@ -348,12 +350,12 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
}
|
||||
|
||||
uint64_t bdfid;
|
||||
for (uint32_t i = 0; i < devices_.size(); ++i) {
|
||||
if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) {
|
||||
for (auto & device : devices_) {
|
||||
if (ConstructBDFID(device->path(), &bdfid) != 0) {
|
||||
std::cerr << "Failed to construct BDFID." << std::endl;
|
||||
ret = 1;
|
||||
} else {
|
||||
devices_[i]->set_bdfid(bdfid);
|
||||
device->set_bdfid(bdfid);
|
||||
}
|
||||
}
|
||||
if (ret != 0) {
|
||||
@@ -389,7 +391,7 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
uint64_t bdfid = (*dev_iter)->bdfid();
|
||||
if (tmp_map.find(bdfid) == tmp_map.end()) {
|
||||
ss << __PRETTY_FUNCTION__ << " | removing device = "
|
||||
<< (*dev_iter)->path();
|
||||
<< (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid);
|
||||
dev_iter = devices_.erase(dev_iter);
|
||||
LOG_DEBUG(ss);
|
||||
continue;
|
||||
@@ -444,8 +446,7 @@ RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags),
|
||||
kfd_notif_evt_fh_(-1), kfd_notif_evt_fh_refcnt_(0) {
|
||||
}
|
||||
|
||||
RocmSMI::~RocmSMI() {
|
||||
}
|
||||
RocmSMI::~RocmSMI() = default;
|
||||
|
||||
RocmSMI& RocmSMI::getInstance(uint64_t flags) {
|
||||
// Assume c++11 or greater. static objects will be created by only 1 thread
|
||||
@@ -494,7 +495,7 @@ static inline std::unordered_set<uint32_t> GetEnvVarUIntegerSets(
|
||||
if(ev_str == nullptr) { return returnSet; }
|
||||
std::string stringEnv = ev_str;
|
||||
|
||||
if (stringEnv.empty() == false) {
|
||||
if (!stringEnv.empty()) {
|
||||
// parse out values by commas
|
||||
std::string parsedVal;
|
||||
std::istringstream ev_str_ss(stringEnv);
|
||||
@@ -549,48 +550,54 @@ uint32_t RocmSMI::getLogSetting() {
|
||||
return this->env_vars_.logging_on;
|
||||
}
|
||||
|
||||
void RocmSMI::printEnvVarInfo(void) {
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = "
|
||||
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_output_bitfield))
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = "
|
||||
<< ((env_vars_.path_DRM_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_DRM_root_override)
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = "
|
||||
<< ((env_vars_.path_HWMon_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_HWMon_root_override)
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = "
|
||||
<< ((env_vars_.path_power_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_power_root_override)
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = "
|
||||
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_inf_loop))
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
// Debug aid: writes this function's signature, followed by the text
// returned by getRSMIEnvVarInfo() on the singleton, to stdout.
void RocmSMI::debugRSMIEnvVarInfo(void) {
  std::ostream &out = std::cout;
  out << __PRETTY_FUNCTION__
      << RocmSMI::getInstance().getRSMIEnvVarInfo();
}
|
||||
|
||||
std::string RocmSMI::getRSMIEnvVarInfo(void) {
|
||||
std::ostringstream ss;
|
||||
ss << "\n\tRSMI_DEBUG_BITFIELD = "
|
||||
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_output_bitfield))
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = "
|
||||
<< ((env_vars_.path_DRM_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_DRM_root_override)
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = "
|
||||
<< ((env_vars_.path_HWMon_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_HWMon_root_override)
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = "
|
||||
<< ((env_vars_.path_power_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_power_root_override)
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_INFINITE_LOOP = "
|
||||
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_inf_loop))
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_LOGGING = "
|
||||
<< getLogSetting() << std::endl;
|
||||
bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
<< (isLoggingOn ? "true" : "false") << std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
|
||||
ss << "\tRSMI_LOGGING (are logs on) = "
|
||||
<< (isLoggingOn ? "TRUE" : "FALSE") << std::endl;
|
||||
ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {";
|
||||
if (env_vars_.enum_overrides.empty()) {
|
||||
std::cout << "}" << std::endl;
|
||||
return;
|
||||
ss << "}" << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
for (auto it=env_vars_.enum_overrides.begin();
|
||||
it != env_vars_.enum_overrides.end(); ++it) {
|
||||
DevInfoTypes type = static_cast<DevInfoTypes>(*it);
|
||||
std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type)
|
||||
+ ")");
|
||||
ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")");
|
||||
auto temp_it = it;
|
||||
if(++temp_it != env_vars_.enum_overrides.end()) {
|
||||
std::cout << ", ";
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
ss << "}" << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::shared_ptr<Monitor>
|
||||
@@ -638,7 +645,7 @@ RocmSMI::FindMonitor(std::string monitor_path) {
|
||||
fs.close();
|
||||
|
||||
if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) {
|
||||
m = std::shared_ptr<Monitor>(new Monitor(mon_name, &env_vars_));
|
||||
m = std::make_shared<Monitor>(mon_name, &env_vars_);
|
||||
m->setTempSensorLabelMap();
|
||||
m->setVoltSensorLabelMap();
|
||||
break;
|
||||
@@ -666,12 +673,12 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
dev_path += "/";
|
||||
dev_path += dev_name;
|
||||
|
||||
auto dev = std::shared_ptr<Device>(new Device(dev_path, &env_vars_));
|
||||
auto dev = std::make_shared<Device>(dev_path, &env_vars_);
|
||||
|
||||
std::shared_ptr<Monitor> m = FindMonitor(dev_path + "/device/hwmon");
|
||||
dev->set_monitor(m);
|
||||
|
||||
std::string d_name = dev_name;
|
||||
const std::string& d_name = dev_name;
|
||||
uint32_t card_indx = GetDeviceIndex(d_name);
|
||||
dev->set_drm_render_minor(GetDrmRenderMinor(dev_path));
|
||||
dev->set_card_index(card_indx);
|
||||
@@ -682,8 +689,6 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
<< dev_name << " | path = " << dev_path
|
||||
<< " | card index = " << std::to_string(card_indx) << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static const uint32_t kAmdGpuId = 0x1002;
|
||||
@@ -694,8 +699,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
std::string vend_path = dev_path + "/device/vendor";
|
||||
if (!FileExists(vend_path.c_str())) {
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
@@ -705,8 +709,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
|
||||
if (!fs.is_open()) {
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
@@ -721,8 +724,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
isAmdGpu = true;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
@@ -730,6 +732,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
std::string err_msg;
|
||||
uint32_t count = 0;
|
||||
std::ostringstream ss;
|
||||
|
||||
// If this gets called more than once, clear previous findings.
|
||||
devices_.clear();
|
||||
@@ -756,17 +759,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
}
|
||||
dentry = readdir(drm_dir);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
|
||||
<< std::to_string(count) << " cards" << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
struct systemNode {
|
||||
uint32_t s_node_id = 0;
|
||||
uint64_t s_gpu_id = 0;
|
||||
uint64_t s_unique_id = 0;
|
||||
};
|
||||
// allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id}
|
||||
std::multimap<uint64_t, systemNode> allSystemNodes;
|
||||
uint32_t node_id = 0;
|
||||
while (true) {
|
||||
uint64_t gpu_id = 0, unique_id = 0;
|
||||
int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
|
||||
int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
|
||||
if (ret_gpu_id == 0 || ret_unique_id == 0) {
|
||||
systemNode myNode;
|
||||
myNode.s_node_id = node_id;
|
||||
myNode.s_gpu_id = gpu_id;
|
||||
myNode.s_unique_id = unique_id;
|
||||
if(gpu_id != 0) { // only add gpu nodes, 0 = CPU
|
||||
allSystemNodes.emplace(unique_id, myNode);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
node_id++;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {";
|
||||
for(auto i: allSystemNodes) {
|
||||
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
|
||||
<< "], "
|
||||
;
|
||||
}
|
||||
ss << "}";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
// Discover all root cards & gpu partitions associated with each
|
||||
for (uint32_t node_id = 0; node_id < count; node_id++) {
|
||||
std::string path = kPathDRMRoot;
|
||||
path += "/card";
|
||||
path += std::to_string(node_id);
|
||||
uint64_t primary_unique_id = 0;
|
||||
|
||||
// each identified gpu card node is a primary node for
|
||||
// potential matching unique ids
|
||||
if (isAMDGPU(path) ||
|
||||
(init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) {
|
||||
std::string d_name = "card";
|
||||
d_name += std::to_string(node_id);
|
||||
AddToDeviceList(d_name);
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Ordered system nodes seen in lookup = {";
|
||||
for (auto i : allSystemNodes) {
|
||||
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
|
||||
<< "], ";
|
||||
}
|
||||
ss << "}";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
uint64_t temp_primary_unique_id = 0;
|
||||
if (allSystemNodes.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get lowest key 1st to keep order of nodes matching card
|
||||
uint32_t lowest_NodeId = 0;
|
||||
uint32_t curr_NodeId = 0;
|
||||
|
||||
for (auto it = allSystemNodes.begin(), end = allSystemNodes.end();
|
||||
it != end; it = allSystemNodes.upper_bound(it->first)) {
|
||||
curr_NodeId = it->second.s_node_id;
|
||||
if (it == allSystemNodes.begin()) {
|
||||
lowest_NodeId = it->second.s_node_id;
|
||||
}
|
||||
if (curr_NodeId <= lowest_NodeId) {
|
||||
lowest_NodeId = curr_NodeId;
|
||||
temp_primary_unique_id = it->second.s_unique_id;
|
||||
}
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | lowest_NodeId = " << std::to_string(lowest_NodeId)
|
||||
<< " | curr_NodeId = " << std::to_string(curr_NodeId)
|
||||
<< " | temp_primary_unique_id = "
|
||||
<< std::to_string(temp_primary_unique_id);
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
if (temp_primary_unique_id != 0) {
|
||||
primary_unique_id = temp_primary_unique_id;
|
||||
} else {
|
||||
allSystemNodes.erase(primary_unique_id);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto numb_nodes = allSystemNodes.count(primary_unique_id);
|
||||
ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = "
|
||||
<< std::to_string(primary_unique_id) << " has "
|
||||
<< std::to_string(numb_nodes) << " known gpu nodes";
|
||||
LOG_DEBUG(ss);
|
||||
while (numb_nodes > 1) {
|
||||
std::string secNode = "card";
|
||||
secNode += std::to_string(node_id); // add the primary node id
|
||||
AddToDeviceList(secNode);
|
||||
numb_nodes--;
|
||||
}
|
||||
// remove already added nodes associated with current card
|
||||
auto erasedNodes = allSystemNodes.erase(primary_unique_id);
|
||||
ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = "
|
||||
<< std::to_string(primary_unique_id) << " erased "
|
||||
<< std::to_string(erasedNodes) << " nodes";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
}
|
||||
|
||||
if (closedir(drm_dir)) {
|
||||
@@ -790,7 +901,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
|
||||
power_mons_.clear();
|
||||
}
|
||||
|
||||
if (power_mons_.size() != 0) {
|
||||
if (!power_mons_.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -818,7 +929,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
|
||||
|
||||
if (FileExists(tmp.c_str())) {
|
||||
std::shared_ptr<PowerMon> mon =
|
||||
std::shared_ptr<PowerMon>(new PowerMon(mon_name, &env_vars_));
|
||||
std::make_shared<PowerMon>(mon_name, &env_vars_);
|
||||
power_mons_.push_back(mon);
|
||||
mon->set_dev_index(GetDeviceIndex(dentry->d_name));
|
||||
}
|
||||
@@ -831,8 +942,8 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
|
||||
return errno;
|
||||
}
|
||||
|
||||
for (auto m : power_mons_) {
|
||||
for (auto d : devices_) {
|
||||
for (const auto& m : power_mons_) {
|
||||
for (const auto& d : devices_) {
|
||||
if (m->dev_index() == d->index()) {
|
||||
d->set_power_monitor(m);
|
||||
break;
|
||||
|
||||
@@ -41,19 +41,18 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <dirent.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex> // NOLINT
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_monitor.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
@@ -286,8 +285,7 @@ static const std::map<const char *, monitor_depends_t> kMonFuncDependsMap = {
|
||||
env_ = nullptr;
|
||||
#endif
|
||||
}
|
||||
Monitor::~Monitor(void) {
|
||||
}
|
||||
Monitor::~Monitor(void) = default;
|
||||
|
||||
std::string
|
||||
Monitor::MakeMonitorPath(MonitorTypes type, uint32_t sensor_id) {
|
||||
@@ -339,7 +337,7 @@ Monitor::setTempSensorLabelMap(void) {
|
||||
std::string type_str;
|
||||
int ret;
|
||||
|
||||
if (temp_type_index_map_.size() > 0) {
|
||||
if (!temp_type_index_map_.empty()) {
|
||||
return 0; // We've already filled in the map
|
||||
}
|
||||
auto add_temp_sensor_entry = [&](uint32_t file_index) {
|
||||
@@ -377,7 +375,7 @@ Monitor::setVoltSensorLabelMap(void) {
|
||||
std::string type_str;
|
||||
int ret;
|
||||
|
||||
if (volt_type_index_map_.size() > 0) {
|
||||
if (!volt_type_index_map_.empty()) {
|
||||
return 0; // We've already filled in the map
|
||||
}
|
||||
auto add_volt_sensor_entry = [&](uint32_t file_index) {
|
||||
@@ -513,10 +511,10 @@ typedef enum {
|
||||
static monitor_types getFuncType(std::string f_name) {
|
||||
monitor_types ret = eDefaultMonitor;
|
||||
|
||||
if (f_name.compare("rsmi_dev_temp_metric_get") == 0) {
|
||||
if (f_name == "rsmi_dev_temp_metric_get") {
|
||||
ret = eTempMonitor;
|
||||
}
|
||||
if (f_name.compare("rsmi_dev_volt_metric_get") == 0) {
|
||||
if (f_name == "rsmi_dev_volt_metric_get") {
|
||||
ret = eVoltMonitor;
|
||||
}
|
||||
return ret;
|
||||
@@ -617,22 +615,22 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) {
|
||||
} else {
|
||||
supported_monitors = intersect;
|
||||
}
|
||||
if (supported_monitors.size() > 0) {
|
||||
for (uint32_t i = 0; i < supported_monitors.size(); ++i) {
|
||||
if (!supported_monitors.empty()) {
|
||||
for (unsigned long & supported_monitor : supported_monitors) {
|
||||
if (m_type == eDefaultMonitor) {
|
||||
assert(supported_monitors[i] > 0);
|
||||
supported_monitors[i] |=
|
||||
(supported_monitors[i] - 1) << MONITOR_TYPE_BIT_POSITION;
|
||||
assert(supported_monitor > 0);
|
||||
supported_monitor |=
|
||||
(supported_monitor - 1) << MONITOR_TYPE_BIT_POSITION;
|
||||
} else if (m_type == eTempMonitor) {
|
||||
// Temp sensor file names are 1-based
|
||||
assert(supported_monitors[i] > 0);
|
||||
supported_monitors[i] |=
|
||||
static_cast<uint64_t>(getTempSensorEnum(supported_monitors[i]))
|
||||
assert(supported_monitor > 0);
|
||||
supported_monitor |=
|
||||
static_cast<uint64_t>(getTempSensorEnum(supported_monitor))
|
||||
<< MONITOR_TYPE_BIT_POSITION;
|
||||
} else if (m_type == eVoltMonitor) {
|
||||
// Voltage sensor file names are 0-based
|
||||
supported_monitors[i] |=
|
||||
static_cast<uint64_t>(getVoltSensorEnum(supported_monitors[i]))
|
||||
supported_monitor |=
|
||||
static_cast<uint64_t>(getVoltSensorEnum(supported_monitor))
|
||||
<< MONITOR_TYPE_BIT_POSITION;
|
||||
} else {
|
||||
assert(false); // Unexpected monitor type
|
||||
@@ -643,10 +641,10 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) {
|
||||
}
|
||||
}
|
||||
|
||||
if (it->second.variants.size() == 0) {
|
||||
if (it->second.variants.empty()) {
|
||||
(*supported_funcs)[it->first] = nullptr;
|
||||
supported_variants = nullptr; // Invoke destructor
|
||||
} else if ((*supported_variants).size() > 0) {
|
||||
} else if (!(*supported_variants).empty()) {
|
||||
(*supported_funcs)[it->first] = supported_variants;
|
||||
}
|
||||
|
||||
|
||||
@@ -41,17 +41,14 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_monitor.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
@@ -70,8 +67,7 @@ static const std::map<PowerMonTypes, const char *> kMonitorNameMap = {
|
||||
PowerMon::PowerMon(std::string path, RocmSMI_env_vars const *e) :
|
||||
path_(path), env_(e) {
|
||||
}
|
||||
PowerMon::~PowerMon(void) {
|
||||
}
|
||||
PowerMon::~PowerMon(void) = default;
|
||||
|
||||
static int parse_power_str(std::string s, PowerMonTypes type, uint64_t *val) {
|
||||
std::stringstream ss(s);
|
||||
|
||||
@@ -40,27 +40,28 @@
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see
|
||||
// _GNU_SOURCE functions which check
|
||||
#include <dirent.h>
|
||||
#include <dlfcn.h>
|
||||
#include <glob.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/utsname.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cerrno>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
@@ -138,7 +139,7 @@ std::vector<std::string> globFilesExist(const std::string& filePattern) {
|
||||
glob_t result_glob;
|
||||
memset(&result_glob, 0, sizeof(result_glob));
|
||||
|
||||
if (glob(filePattern.c_str(), GLOB_TILDE, NULL, &result_glob) != 0) {
|
||||
if (glob(filePattern.c_str(), GLOB_TILDE, nullptr, &result_glob) != 0) {
|
||||
globfree(&result_glob);
|
||||
// Leaving below to help debug issues discovering future glob file searches
|
||||
// debugFilesDiscovered(fileNames);
|
||||
@@ -146,7 +147,7 @@ std::vector<std::string> globFilesExist(const std::string& filePattern) {
|
||||
}
|
||||
|
||||
for(size_t i = 0; i < result_glob.gl_pathc; ++i) {
|
||||
fileNames.push_back(std::string(result_glob.gl_pathv[i]));
|
||||
fileNames.emplace_back(result_glob.gl_pathv[i]);
|
||||
}
|
||||
globfree(&result_glob);
|
||||
|
||||
@@ -159,17 +160,26 @@ int isRegularFile(std::string fname, bool *is_reg) {
|
||||
struct stat file_stat;
|
||||
int ret;
|
||||
|
||||
assert(is_reg != nullptr);
|
||||
|
||||
ret = stat(fname.c_str(), &file_stat);
|
||||
if (ret) {
|
||||
return errno;
|
||||
}
|
||||
*is_reg = S_ISREG(file_stat.st_mode);
|
||||
|
||||
if (is_reg != nullptr) {
|
||||
*is_reg = S_ISREG(file_stat.st_mode);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int WriteSysfsStr(std::string path, std::string val) {
|
||||
// On success, zero is returned. On error, -1 is returned, and
|
||||
// errno is set to indicate the error.
|
||||
auto is_regular_file_result = isRegularFile(path, nullptr);
|
||||
if (is_regular_file_result != 0) {
|
||||
return ENOENT;
|
||||
}
|
||||
|
||||
std::ofstream fs;
|
||||
int ret = 0;
|
||||
std::ostringstream ss;
|
||||
@@ -196,6 +206,13 @@ int WriteSysfsStr(std::string path, std::string val) {
|
||||
}
|
||||
|
||||
int ReadSysfsStr(std::string path, std::string *retStr) {
|
||||
// On success, zero is returned. On error, -1 is returned, and
|
||||
// errno is set to indicate the error.
|
||||
auto is_regular_file_result = isRegularFile(path, nullptr);
|
||||
if (is_regular_file_result != 0) {
|
||||
return ENOENT;
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
int ret = 0;
|
||||
std::ostringstream oss;
|
||||
@@ -381,7 +398,7 @@ std::string removeString(const std::string origStr,
|
||||
// defaults to trim stdOut
|
||||
std::pair<bool, std::string> executeCommand(std::string command, bool stdOut) {
|
||||
char buffer[128];
|
||||
std::string stdoutAndErr = "";
|
||||
std::string stdoutAndErr;
|
||||
bool successfulRun = true;
|
||||
command = "stdbuf -i0 -o0 -e0 " + command; // remove stdOut and err buffering
|
||||
|
||||
@@ -411,14 +428,10 @@ std::pair<bool, std::string> executeCommand(std::string command, bool stdOut) {
|
||||
return std::make_pair(successfulRun, stdoutAndErr);
|
||||
}
|
||||
|
||||
// originalstring - string to search for substring
|
||||
// originalString - string to search for substring
|
||||
// substring - string looking to find
|
||||
// Returns true when `substring` occurs anywhere inside `originalString`
// (an empty `substring` always matches); returns false otherwise.
bool containsString(std::string originalString, std::string substring) {
  // std::string::find reports npos only when there is no match, so a single
  // comparison replaces the former if/else plus the unreachable trailing
  // return.
  return originalString.find(substring) != std::string::npos;
}
|
||||
|
||||
// Creates and stores supplied data into a temporary file (within /tmp/).
|
||||
@@ -429,9 +442,9 @@ bool containsString(std::string originalString, std::string substring) {
|
||||
// https://man7.org/linux/man-pages/man3/mkstemp.3.html
|
||||
//
|
||||
// Temporary file name format:
|
||||
// <app prefix>_<state name>_<paramenter name>_<device id>
|
||||
// <app prefix>_<state name>_<parameter name>_<device id>
|
||||
// <app prefix> - prefix for our application's identifier (see kTmpFilePrefix)
|
||||
// <paramenter name> - name of parameter being stored
|
||||
// <parameter name> - name of parameter being stored
|
||||
// <state name> - state at which the stored value captures
|
||||
// <device index> - device identifier
|
||||
//
|
||||
@@ -466,9 +479,8 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
|
||||
close(fd);
|
||||
if (rc_write == -1) {
|
||||
return RSMI_STATUS_FILE_ERROR;
|
||||
} else {
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
std::vector<std::string> getListOfAppTmpFiles() {
|
||||
@@ -477,16 +489,18 @@ std::vector<std::string> getListOfAppTmpFiles() {
|
||||
struct dirent *ent;
|
||||
std::vector<std::string> tmpFiles;
|
||||
|
||||
if ((dir = opendir(path.c_str())) != nullptr) {
|
||||
// captures all files & directories under specified path
|
||||
while ((ent = readdir(dir)) != nullptr) {
|
||||
std::string fileDirName = ent->d_name;
|
||||
// we only want our app specific files
|
||||
if (containsString(fileDirName, kTmpFilePrefix)) {
|
||||
tmpFiles.emplace_back(path + "/" + fileDirName);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
dir = opendir(path.c_str());
|
||||
if (dir == nullptr) {
|
||||
return tmpFiles;
|
||||
}
|
||||
// captures all files & directories under specified path
|
||||
while ((ent = readdir(dir)) != nullptr) {
|
||||
std::string fileDirName = ent->d_name;
|
||||
// we only want our app specific files
|
||||
if (containsString(fileDirName, kTmpFilePrefix)) {
|
||||
tmpFiles.emplace_back(path + "/" + fileDirName);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return tmpFiles;
|
||||
@@ -515,7 +529,7 @@ std::vector<std::string> readEntireFile(std::string path) {
|
||||
std::string line;
|
||||
while (std::getline(inFileStream, line)) {
|
||||
std::istringstream ss(line);
|
||||
if(line.size() > 0) {
|
||||
if (!line.empty()) {
|
||||
fileContent.push_back(line);
|
||||
}
|
||||
}
|
||||
@@ -527,7 +541,7 @@ std::vector<std::string> readEntireFile(std::string path) {
|
||||
// and their content
|
||||
void displayAppTmpFilesContent() {
|
||||
std::vector<std::string> tmpFiles = getListOfAppTmpFiles();
|
||||
if (tmpFiles.empty() == false) {
|
||||
if (!tmpFiles.empty()) {
|
||||
for (auto &x: tmpFiles) {
|
||||
std::string out = readFile(x);
|
||||
std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x
|
||||
@@ -543,7 +557,7 @@ void displayAppTmpFilesContent() {
|
||||
std::string debugVectorContent(std::vector<std::string> v) {
|
||||
std::ostringstream ss;
|
||||
ss << "Vector = {";
|
||||
if (v.size() > 0) {
|
||||
if (!v.empty()) {
|
||||
for (auto it=v.begin(); it < v.end(); it++) {
|
||||
ss << *it;
|
||||
auto temp_it = it;
|
||||
@@ -561,7 +575,7 @@ std::string debugVectorContent(std::vector<std::string> v) {
|
||||
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
|
||||
std::ostringstream ss;
|
||||
ss << "Vector = {";
|
||||
if (v.size() > 0) {
|
||||
if (!v.empty()) {
|
||||
for (auto it=v.begin(); it < v.end(); it++) {
|
||||
ss << (*it)->path();
|
||||
auto temp_it = it;
|
||||
@@ -576,7 +590,7 @@ std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
|
||||
}
|
||||
|
||||
// Attempts to read application specific temporary file
|
||||
// This method is to be used for reading (or determing if it exists),
|
||||
// This method is to be used for reading (or determining if it exists),
|
||||
// in order to keep file naming scheme consistent.
|
||||
//
|
||||
// dv_ind - device index
|
||||
@@ -594,7 +608,7 @@ std::tuple<bool, std::string> readTmpFile(uint32_t dv_ind,
|
||||
"_" + std::to_string(dv_ind);
|
||||
std::string fileContent;
|
||||
std::vector<std::string> tmpFiles = getListOfAppTmpFiles();
|
||||
if (tmpFiles.empty() == false) {
|
||||
if (!tmpFiles.empty()) {
|
||||
for (auto &x: tmpFiles) {
|
||||
if (containsString(x, tmpFileName)) {
|
||||
fileContent = readFile(x);
|
||||
@@ -629,15 +643,23 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
|
||||
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
|
||||
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string, std::string>
|
||||
std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string>
|
||||
getSystemDetails(void) {
|
||||
struct utsname buf;
|
||||
bool errorDetected = false;
|
||||
std::string temp_data;
|
||||
std::string sysname, nodename, release, version, machine;
|
||||
std::string sysname;
|
||||
std::string nodename;
|
||||
std::string release;
|
||||
std::string version;
|
||||
std::string machine;
|
||||
std::string domainName = "<undefined>";
|
||||
std::string os_distribution = "<undefined>";
|
||||
std::string endianness = "<undefined>";
|
||||
std::string rocm_lib_path = "<undefined>";
|
||||
std::string rocm_build_type = "<undefined>";
|
||||
std::string rocm_env_variables = "<undefined>";
|
||||
|
||||
if (uname(&buf) < 0) {
|
||||
errorDetected = true;
|
||||
@@ -654,7 +676,7 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
|
||||
std::string filePath = "/etc/os-release";
|
||||
bool fileExists = FileExists(filePath.c_str());
|
||||
if (fileExists == true) {
|
||||
if (fileExists) {
|
||||
std::vector<std::string> fileContent = readEntireFile(filePath);
|
||||
for (auto &line: fileContent) {
|
||||
if (line.find("PRETTY_NAME=") != std::string::npos) {
|
||||
@@ -672,9 +694,13 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
endianness = "Little Endian, multi-bit symbols encoded as"
|
||||
" little endian (LSB first)";
|
||||
}
|
||||
rocm_build_type = getBuildType();
|
||||
rocm_lib_path = getMyLibPath();
|
||||
rocm_env_variables = RocmSMI::getInstance().getRSMIEnvVarInfo();
|
||||
return std::make_tuple(errorDetected, sysname, nodename, release,
|
||||
version, machine, domainName, os_distribution,
|
||||
endianness);
|
||||
endianness, rocm_build_type, rocm_lib_path,
|
||||
rocm_env_variables);
|
||||
}
|
||||
|
||||
// If logging is enabled through RSMI_LOGGING environment variable.
|
||||
@@ -683,9 +709,10 @@ void logSystemDetails(void) {
|
||||
std::ostringstream ss;
|
||||
bool errorDetected;
|
||||
std::string sysname, node, release, version, machine, domain, distName,
|
||||
endianness;
|
||||
endianness, rocm_build_type, lib_path, rocm_env_vars;
|
||||
std::tie(errorDetected, sysname, node, release, version, machine, domain,
|
||||
distName, endianness) = getSystemDetails();
|
||||
distName, endianness, rocm_build_type, lib_path,
|
||||
rocm_env_vars) = getSystemDetails();
|
||||
if (errorDetected == false) {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
<< "SYSTEM NAME: " << sysname << "\n"
|
||||
@@ -695,7 +722,10 @@ void logSystemDetails(void) {
|
||||
<< "VERSION: " << version << "\n"
|
||||
<< "MACHINE TYPE: " << machine << "\n"
|
||||
<< "DOMAIN: " << domain << "\n"
|
||||
<< "ENDIANNESS: " << endianness << "\n";
|
||||
<< "ENDIANNESS: " << endianness << "\n"
|
||||
<< "ROCM BUILD TYPE: " << rocm_build_type << "\n"
|
||||
<< "ROCM-SMI-LIB PATH: " << lib_path << "\n"
|
||||
<< "ROCM ENV VARIABLES: " << rocm_env_vars << "\n";
|
||||
LOG_INFO(ss);
|
||||
} else {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
@@ -724,7 +754,7 @@ void logHexDump(
|
||||
|
||||
// Output description if given.
|
||||
// if (desc != NULL) printf("%s:\n", desc);
|
||||
if (desc != NULL) ss << "\n" << desc << "\n";
|
||||
if (desc != nullptr) ss << "\n" << desc << "\n";
|
||||
|
||||
// Length checks.
|
||||
if (len == 0) {
|
||||
@@ -816,6 +846,36 @@ rsmi_status_t getBDFWithDomain(uint64_t bdf_id, std::string& bfd_str)
|
||||
return result;
|
||||
}
|
||||
|
||||
// Reports the flavor this translation unit was compiled as:
// "debug" when the DEBUG macro is defined, "release" otherwise.
std::string getBuildType() {
#ifdef DEBUG
  return std::string("debug");
#else
  return std::string("release");
#endif
}
|
||||
|
||||
// Returns the path of the binary (shared object or executable) that
// contains this function, as reported by dladdr().
//
// The returned pointer is never null. An empty string is returned when
// dladdr() is not available (_GNU_SOURCE undefined), when the lookup fails,
// or when no file name is reported for the address.
const char *my_fname(void) {
#ifdef _GNU_SOURCE
  Dl_info dl_info{};
  // dladdr() returns 0 on failure; dli_fname must not be read in that case.
  if (dladdr(reinterpret_cast<void *>(&my_fname), &dl_info) != 0 &&
      dl_info.dli_fname != nullptr) {
    return dl_info.dli_fname;
  }
#endif
  // A string literal has static storage duration, so the pointer stays valid
  // after return (unlike the c_str() of a function-local std::string).
  return "";
}
|
||||
|
||||
std::string getMyLibPath(void) {
|
||||
std::string libName = "rocm-smi-lib";
|
||||
std::string path = std::string(my_fname());
|
||||
if (path.empty()) {
|
||||
path = "Could not find library path for " + libName;
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
|
||||
{
|
||||
auto result = rsmi_status_t::RSMI_STATUS_SUCCESS;
|
||||
@@ -837,6 +897,35 @@ rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
|
||||
return result;
|
||||
}
|
||||
|
||||
// Counts the immediate sub-directories of `path`.
//
// Returns the number of entries (excluding "." and "..") for which
// fstatat() reports a directory, or -1 when `path` cannot be opened.
// Failures are reported on stderr via perror(); entries that cannot be
// stat'ed are skipped.
int subDirectoryCountInPath(const std::string path) {
  DIR *dir_handle = opendir(path.c_str());
  if (dir_handle == nullptr) {
    perror("opendir");
    return -1;
  }

  int num_subdirs = 0;
  for (struct dirent *entry = readdir(dir_handle); entry != nullptr;
       entry = readdir(dir_handle)) {
    const char *entry_name = entry->d_name;

    // The self and parent links are not real sub-directories.
    const bool is_dot_entry =
        (strcmp(entry_name, ".") == 0) || (strcmp(entry_name, "..") == 0);
    if (is_dot_entry) {
      continue;
    }

    // Stat relative to the open directory so no path string has to be built.
    struct stat entry_stat;
    if (fstatat(dirfd(dir_handle), entry_name, &entry_stat, 0) < 0) {
      perror(entry_name);
      continue;
    }

    if (S_ISDIR(entry_stat.st_mode)) {
      ++num_subdirs;
    }
  }

  closedir(dir_handle);
  return num_subdirs;
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -90,7 +90,6 @@ AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) {
|
||||
static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kClkTypes) |
|
||||
static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes);
|
||||
|
||||
auto property_type_offset = (static_cast<AMDGpuPropertyOffsetType>(property_type_offset_mask) & (property_id));
|
||||
auto property_type_id = (static_cast<AMDGpuPropertyOffsetType>(property_id) & ~(property_type_offset_mask));
|
||||
|
||||
return property_type_id;
|
||||
@@ -167,6 +166,7 @@ const AMDGpuVerbList_t amdgpu_verb_check_list {
|
||||
{ AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" }
|
||||
};
|
||||
|
||||
const uint16_t kDevIDAll(0xFFFF);
|
||||
const uint16_t kDevRevIDAll(0xFFFF);
|
||||
const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
|
||||
//
|
||||
@@ -177,6 +177,14 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
|
||||
// rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set;
|
||||
//
|
||||
|
||||
// AMD All Families
|
||||
{kDevIDAll, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
|
||||
MonitorTypes::kMonFanCntrlEnable),
|
||||
AMDGpuVerbTypes_t::kResetGpuFan,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
|
||||
// AMD Instinct MI210
|
||||
{0x740F, {0x02,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
@@ -240,12 +248,6 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
|
||||
AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets,
|
||||
AMDGpuPropertyOpModeTypes_t::kBoth, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
|
||||
DevInfoTypes::kDevGpuReset),
|
||||
AMDGpuVerbTypes_t::kResetGpu,
|
||||
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
|
||||
},
|
||||
{0x74A1, {kDevRevIDAll,
|
||||
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
|
||||
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM),
|
||||
@@ -351,7 +353,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT
|
||||
// likely the reinforcement table does not contain any entries/rules for the
|
||||
// dev_id in question.
|
||||
//
|
||||
auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) {
|
||||
auto amdgpu_property_query_result_hdlr = [&](const rsmi_status_t query_result) {
|
||||
switch (query_result) {
|
||||
case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR):
|
||||
case (rsmi_status_t::RSMI_STATUS_NO_DATA):
|
||||
@@ -364,7 +366,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT
|
||||
break;
|
||||
|
||||
default:
|
||||
return rsmi_status_t::RSMI_STATUS_NOT_FOUND;
|
||||
return actual_error_code;
|
||||
break;
|
||||
}
|
||||
};
|
||||
@@ -416,7 +418,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx
|
||||
std::ostringstream osstream;
|
||||
auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR);
|
||||
|
||||
AMDGpuPropertyQuery_t amdgpu_property_query = [&]() {
|
||||
auto amdgpu_property_query = [&]() {
|
||||
AMDGpuPropertyQuery_t amdgpu_property_query_init{};
|
||||
amdgpu_property_query_init.m_asic_id = 0;
|
||||
amdgpu_property_query_init.m_pci_rev_id = 0;
|
||||
@@ -435,7 +437,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx
|
||||
id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id);
|
||||
}
|
||||
}
|
||||
is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? true : false;
|
||||
is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS);
|
||||
return tmp_amdgpu_query;
|
||||
};
|
||||
|
||||
@@ -446,6 +448,18 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx
|
||||
LOG_TRACE(osstream);
|
||||
|
||||
bool is_proper_query(false);
|
||||
|
||||
// Generic filter for checking properties for all asics and revisions.
|
||||
auto amdgpu_property_query_all_asics = amdgpu_property_query;
|
||||
amdgpu_property_query_all_asics.m_asic_id = kDevIDAll;
|
||||
amdgpu_property_query_all_asics.m_pci_rev_id = kDevRevIDAll;
|
||||
auto amdgpu_property_query_result = run_amdgpu_property_reinforcement_query(amdgpu_property_query_all_asics);
|
||||
// We found a generic entry for all asics and revisions
|
||||
if (amdgpu_property_query_result != rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR) {
|
||||
return amdgpu_property_query_result;
|
||||
}
|
||||
|
||||
// If no generic entry, then we query for specific asic and revision ids.
|
||||
amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query);
|
||||
if (!is_proper_query) {
|
||||
rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA;
|
||||
@@ -475,13 +489,6 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper
|
||||
return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end());
|
||||
};
|
||||
|
||||
auto ends_with = [](const std::string& value, const std::string& ending) {
|
||||
if (value.size() < ending.size()) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
|
||||
};
|
||||
|
||||
// Traverse through all values for a given key
|
||||
osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n";
|
||||
LOG_TRACE(osstream);
|
||||
@@ -495,7 +502,7 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper
|
||||
osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n";
|
||||
// Pci_rev_id matches the filter or ALL Revisions
|
||||
if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) ||
|
||||
(itr_begin->second.m_pci_rev_id == kDevRevIDAll)) {
|
||||
(itr_begin->second.m_pci_rev_id == kDevRevIDAll)) {
|
||||
osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n";
|
||||
// Do we have the property we are looking for?
|
||||
if (((amdgpu_property_query.m_property != 0) &&
|
||||
|
||||
@@ -126,16 +126,20 @@ void TestFrequenciesRead::Run(void) {
|
||||
} else if (err == AMDSMI_STATUS_NOT_YET_IMPLEMENTED) {
|
||||
std::cout << "\t**Get " << name <<
|
||||
": Not implemented on this machine" << std::endl;
|
||||
// special driver issue, shouldn't normally occur
|
||||
} else if (err == AMDSMI_STATUS_UNEXPECTED_DATA) {
|
||||
std::cerr << "WARN: Clock file [" << FreqEnumToStr(t) << "] exists on device [" << i << "] but empty!" << std::endl;
|
||||
std::cerr << " Likely a driver issue!" << std::endl;
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Supported " << name << " clock frequencies: ";
|
||||
std::cout << f.num_supported << std::endl;
|
||||
print_frequencies(&f);
|
||||
// Verify api support checking functionality is working
|
||||
err = amdsmi_get_clk_freq(processor_handles_[i], t, nullptr);
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_INVAL);
|
||||
}
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Supported " << name << " clock frequencies: ";
|
||||
std::cout << f.num_supported << std::endl;
|
||||
print_frequencies(&f);
|
||||
// Verify api support checking functionality is working
|
||||
err = amdsmi_get_clk_freq(processor_handles_[i], t, nullptr);
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_INVAL);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -104,8 +104,7 @@ void TestFrequenciesReadWrite::Run(void) {
|
||||
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
|
||||
PrintDeviceHeader(processor_handles_[dv_ind]);
|
||||
|
||||
for (uint32_t clk = (uint32_t)CLK_TYPE_FIRST;
|
||||
clk <= CLK_TYPE__MAX; ++clk) {
|
||||
for (uint32_t clk = CLK_TYPE_FIRST; clk <= CLK_TYPE__MAX; ++clk) {
|
||||
amdsmi_clk = (amdsmi_clk_type_t)clk;
|
||||
|
||||
auto freq_read = [&]() -> bool {
|
||||
@@ -121,14 +120,20 @@ void TestFrequenciesReadWrite::Run(void) {
|
||||
std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) <<
|
||||
": Not supported on this machine" << std::endl;
|
||||
return false;
|
||||
} else {
|
||||
// CHK_ERR_ASRT(ret)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Initial frequency for clock " <<
|
||||
FreqEnumToStr(amdsmi_clk) << " is " << f.current << std::endl;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// special driver issue, shouldn't normally occur
|
||||
if (ret == AMDSMI_STATUS_UNEXPECTED_DATA) {
|
||||
std::cerr << "WARN: Clock file [" << FreqEnumToStr(amdsmi_clk) << "] exists on device [" << dv_ind << "] but empty!" << std::endl;
|
||||
std::cerr << " Likely a driver issue!" << std::endl;
|
||||
}
|
||||
|
||||
// CHK_ERR_ASRT(ret)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Initial frequency for clock " <<
|
||||
FreqEnumToStr(amdsmi_clk) << " is " << f.current << std::endl;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
auto freq_write = [&]() {
|
||||
@@ -152,19 +157,18 @@ void TestFrequenciesReadWrite::Run(void) {
|
||||
std::endl;
|
||||
}
|
||||
ret = amdsmi_set_clk_freq(processor_handles_[dv_ind], amdsmi_clk, freq_bitmask);
|
||||
//Certain ASICs does not allow to set particular clocks. If set function for a clock returns
|
||||
//permission error despite root access, manually set ret value to success and return
|
||||
if (ret == AMDSMI_STATUS_NO_PERM && geteuid() == 0) {
|
||||
std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) <<
|
||||
": Not supported on this machine. Skipping..." << std::endl;
|
||||
ret = AMDSMI_STATUS_SUCCESS;
|
||||
return;
|
||||
} else if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
// Certain ASICs do not allow setting particular clocks. If the set function for a clock returns
|
||||
// a permission error despite root access, manually set the ret value to success and return
|
||||
//
|
||||
// Sometimes setting clock frequencies is completely not supported
|
||||
if ((ret == AMDSMI_STATUS_NO_PERM && geteuid() == 0) ||
|
||||
(ret == AMDSMI_STATUS_NOT_SUPPORTED)) {
|
||||
std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) <<
|
||||
": Not supported on this machine. Skipping..." << std::endl;
|
||||
ret = AMDSMI_STATUS_SUCCESS;
|
||||
return;
|
||||
}
|
||||
|
||||
CHK_ERR_ASRT(ret)
|
||||
ret = amdsmi_get_clk_freq(processor_handles_[dv_ind], amdsmi_clk, &f);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
@@ -187,7 +191,9 @@ void TestFrequenciesReadWrite::Run(void) {
|
||||
}
|
||||
|
||||
ret = amdsmi_set_gpu_perf_level(processor_handles_[dv_ind], AMDSMI_DEV_PERF_LEVEL_AUTO);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Setting performance level is not supported on this machine. Skipping..." << std::endl;
|
||||
ret = AMDSMI_STATUS_SUCCESS;
|
||||
return;
|
||||
}
|
||||
};
|
||||
@@ -199,44 +205,6 @@ void TestFrequenciesReadWrite::Run(void) {
|
||||
}
|
||||
freq_write();
|
||||
CHK_ERR_ASRT(ret)
|
||||
#if 0
|
||||
ret = amdsmi_get_clk_freq(dv_ind, amdsmi_clk, &f);
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Initial frequency for clock " << amdsmi_clk << " is " <<
|
||||
f.current << std::endl;
|
||||
}
|
||||
// Set clocks to something other than the usual default of the lowest
|
||||
// frequency.
|
||||
freq_bitmask = 0b01100; // Try the 3rd and 4th clocks
|
||||
|
||||
std::string freq_bm_str =
|
||||
std::bitset<AMDSMI_MAX_NUM_FREQUENCIES>(freq_bitmask).to_string();
|
||||
|
||||
freq_bm_str.erase(0, std::min(freq_bm_str.find_first_not_of('0'),
|
||||
freq_bm_str.size()-1));
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Setting frequency mask for clock " << amdsmi_clk <<
|
||||
" to 0b" << freq_bm_str << " ..." << std::endl;
|
||||
}
|
||||
ret = amdsmi_set_clk_freq(dv_ind, amdsmi_clk, freq_bitmask);
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
ret = amdsmi_get_clk_freq(dv_ind, amdsmi_clk, &f);
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "Frequency is now index " << f.current << std::endl;
|
||||
std::cout << "Resetting mask to all frequencies." << std::endl;
|
||||
}
|
||||
ret = amdsmi_set_clk_freq(dv_ind, amdsmi_clk, 0xFFFFFFFF);
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
ret = amdsmi_set_gpu_perf_level(dv_ind, AMDSMI_DEV_PERF_LEVEL_AUTO);
|
||||
CHK_ERR_ASRT(ret)
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -177,8 +177,8 @@ void TestGpuMetricsRead::Run(void) {
|
||||
<< std::to_string(smu.pcie_link_speed) << '\n';
|
||||
std::cout << "gfx_activity_acc="
|
||||
<< std::dec << smu.gfx_activity_acc << '\n';
|
||||
std::cout << "mem_actvity_acc="
|
||||
<< std::dec << smu.mem_actvity_acc << '\n';
|
||||
std::cout << "mem_activity_acc="
|
||||
<< std::dec << smu.mem_activity_acc << '\n';
|
||||
|
||||
for (int i = 0; i < AMDSMI_NUM_HBM_INSTANCES; ++i) {
|
||||
std::cout << "temperature_hbm[" << i << "]=" << std::dec <<
|
||||
|
||||
@@ -112,6 +112,13 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
max = info.max_power_cap;
|
||||
orig = info.default_power_cap;
|
||||
|
||||
// Check if power cap is within the range
|
||||
// skip the test otherwise
|
||||
if (orig < min || orig > max) {
|
||||
std::cout << "Power cap is not within the range. Skipping test for " << dv_ind << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
new_cap = (max + min)/2;
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
|
||||
@@ -126,6 +126,10 @@ void TestPowerReadWrite::Run(void) {
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "The power profile presets settings is not supported. "
|
||||
<< std::endl;
|
||||
|
||||
// Verify api support checking functionality is working
|
||||
ret = amdsmi_get_gpu_power_profile_presets(processor_handles_[dv_ind], 0, nullptr);
|
||||
ASSERT_EQ(ret, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
continue;
|
||||
}
|
||||
CHK_ERR_ASRT(ret)
|
||||
|
||||
@@ -106,14 +106,12 @@ static void RunCustomTestProlog(TestBase *test) {
|
||||
}
|
||||
test->SetUp();
|
||||
test->Run();
|
||||
return;
|
||||
}
|
||||
// Closing half of a custom test run: prints the test's results when the
// configured verbosity is at least VERBOSE_STANDARD, then closes the test.
static void RunCustomTestEpilog(TestBase *tst) {
  const bool show_results =
      sRSMIGlvalues->verbosity >= TestBase::VERBOSE_STANDARD;
  if (show_results) {
    tst->DisplayResults();
  }

  // Close() runs unconditionally, whether or not results were displayed.
  tst->Close();
}
|
||||
|
||||
// If the test case is one big test, you should use RunGenericTest()
|
||||
@@ -125,7 +123,6 @@ static void RunCustomTestEpilog(TestBase *tst) {
|
||||
// Runs a test case as a single unit: the custom-test prolog immediately
// followed by the custom-test epilog, both on the same test object.
static void RunGenericTest(TestBase *test) {
  RunCustomTestProlog(test);
  RunCustomTestEpilog(test);
}
|
||||
|
||||
|
||||
|
||||
@@ -63,10 +63,7 @@ $BLACKLIST_ALL_ASICS\
|
||||
# /sys/class/kfd/kfd/topology/nodes/*/properties
|
||||
FILTER[90400]=\
|
||||
$BLACKLIST_ALL_ASICS\
|
||||
"rsmitstReadOnly.TestVoltCurvRead:"\
|
||||
"rsmitstReadOnly.TestFrequenciesRead:"\
|
||||
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
|
||||
"rsmitstReadWrite.TestPowerReadWrite"
|
||||
"rsmitstReadOnly.TestVoltCurvRead"
|
||||
FILTER[90401]=${FILTER[90400]}
|
||||
FILTER[90402]=${FILTER[90400]}
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <cassert>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "test_base.h"
|
||||
@@ -61,10 +61,9 @@ static const char kResultsLabel[] = "TEST RESULTS";
|
||||
// This one is used outside this file
|
||||
const char kSetupLabel[] = "TEST SETUP";
|
||||
|
||||
TestBase::TestBase() : setup_failed_(false), description_("") {
|
||||
}
|
||||
TestBase::~TestBase() {
|
||||
TestBase::TestBase() : setup_failed_(false) {
|
||||
}
|
||||
TestBase::~TestBase() = default;
|
||||
|
||||
void TestBase::MakeHeaderStr(const char *inStr,
|
||||
std::string *outStr) const {
|
||||
@@ -155,8 +154,6 @@ void TestBase::SetUp(uint64_t init_flags) {
|
||||
std::cout << "No AMD SMI tests can be run." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) {
|
||||
@@ -254,7 +251,7 @@ void TestBase::set_description(std::string d) {
|
||||
size_t endlptr;
|
||||
|
||||
for (size_t i = le; i < description_.size(); i += le) {
|
||||
endlptr = description_.find_last_of(" ", i);
|
||||
endlptr = description_.find_last_of(' ', i);
|
||||
description_.replace(endlptr, 1, "\n");
|
||||
i = endlptr;
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@
|
||||
#ifndef TESTS_AMD_SMI_TEST_TEST_BASE_H_
|
||||
#define TESTS_AMD_SMI_TEST_TEST_BASE_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
@@ -150,9 +151,8 @@ class TestBase {
|
||||
"\t===> Abort is over-ridden due to dont_fail command line option." \
|
||||
<< std::endl; \
|
||||
return; \
|
||||
} else { \
|
||||
ASSERT_EQ(AMDSMI_STATUS_SUCCESS, (RET)); \
|
||||
} \
|
||||
ASSERT_EQ(AMDSMI_STATUS_SUCCESS, (RET)); \
|
||||
}
|
||||
|
||||
void MakeHeaderStr(const char *inStr, std::string *outStr);
|
||||
|
||||
@@ -43,13 +43,13 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <getopt.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "test_base.h"
|
||||
#include "test_common.h"
|
||||
|
||||
@@ -74,7 +74,7 @@ void DumpMonitorInfo(const TestBase *test);
|
||||
#endif
|
||||
|
||||
#define DISPLAY_AMDSMI_ERR(RET) { \
|
||||
if (RET != AMDSMI_STATUS_SUCCESS) { \
|
||||
if ((RET) != AMDSMI_STATUS_SUCCESS) { \
|
||||
const char *err_str; \
|
||||
std::cout << "\t===> ERROR: AMDSMI call returned " << (RET) << std::endl; \
|
||||
amdsmi_status_code_to_string((RET), &err_str); \
|
||||
@@ -91,7 +91,7 @@ void DumpMonitorInfo(const TestBase *test);
|
||||
} \
|
||||
}
|
||||
#define CHK_AMDSMI_PERM_ERR(RET) { \
|
||||
if (RET == AMDSMI_STATUS_NO_PERM) { \
|
||||
if ((RET) == AMDSMI_STATUS_NO_PERM) { \
|
||||
std::cout << "This command requires root access." << std::endl; \
|
||||
} else { \
|
||||
DISPLAY_AMDSMI_ERR(RET) \
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle