Merge remote-tracking branch 'rocmsmi/amd-staging' into HEAD

Change-Id: I65ed7f3a0d1b6e58bc8377932d7c39db21d1b422
Этот коммит содержится в:
Galantsev, Dmitrii
2023-09-21 19:10:41 -05:00
родитель def17accbe e0483f2ee2
Коммит 5c41319c83
38 изменённых файлов: 1475 добавлений и 1055 удалений
+35 -47
Просмотреть файл
@@ -1,21 +1,18 @@
#!/bin/bash
#set -x
packageName="amd-smi-lib"
logPath=/var/log/amd_smi_lib
logName=AMD-SMI-lib.log
logFile="${logPath}/${logName}"
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
do_addLogFolder() {
sudo mkdir -p "${logPath}"
sudo touch "${logFile}"
sudo chmod -R a+rw "${logPath}"
sudo chmod a+rw "${logFile}"
}
do_configureLogrotate() {
logrotate --version &>/dev/null
local IS_SYSTEMD=0
local packageName="amd-smi-lib"
local logPath=/var/log/amd_smi_lib
local logFile="${logPath}/AMD-SMI-lib.log"
local logrotateConfFile=/etc/logrotate.d/amd_smi.conf
mkdir -p "${logPath}"
touch "${logFile}"
chmod -R a+rw "${logPath}"
chmod a+rw "${logFile}"
command -v logrotate &>/dev/null
if [ $? -ne 0 ]; then
echo "[WARNING] Detected logrotate is not installed."\
"$packageName logs (when turned on) will not rotate properly."
@@ -23,14 +20,14 @@ do_configureLogrotate() {
fi
if [ ! -f $logrotateConfFile ]; then
sudo touch "${logrotateConfFile}"
sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read
touch "${logrotateConfFile}"
chmod 644 "${logrotateConfFile}" # root r/w, all others read
# AMD SMI logging rotation, rotates files using root user/group
# Hourly logrotation check
# Only rotates if size grew larger than 1MB
# Max of 4 rotation files, oldest will be removed
# Rotated files use a date extension, e.g. AMD-SMI-lib.log.2023-05-09_16:51:42
cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null
cat << EOF > "${logrotateConfFile}"
${logFile} {
su root root
hourly
@@ -47,44 +44,29 @@ EOF
# issue was RPM build thought we were using macros
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
# https://rpm-software-management.github.io/rpm/manual/spec.html
sudo sed -i s/%%/%/g "${logrotateConfFile}"
sed -i s/%%/%/g "${logrotateConfFile}"
# workaround: remove extra 'OURCE' text
# from amd_smi.conf. Unsure if CMAKE,
# bash, or here document
# issue (only seen on RHEL 8.7)
sudo sed -i s/OURCE//g "${logrotateConfFile}"
sed -i s/OURCE//g "${logrotateConfFile}"
fi
# check if logrotate uses system timers, Ubuntu/modern OS's do
# Several older OS's like RHEL 8.7, do not. Instead defaults
# to use daily cron jobs - see https://stackoverflow.com/a/69465677
sudo systemctl list-timers|grep -iq logrotate
if [ $? -ne 0 ]; then
# confirm logrotate file exists in daily
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -d /etc/cron.hourly ]; then
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
else
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
" logrotate. $packageName logs (when turned on) will not rotate properly."
return
fi
else
# confirm that it's already been moved to hourly
sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly
if [ $? -ne 0 ]; then
echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\
"$packageName logs (when turned on) may not rotate properly."
fi
if [ -d /run/systemd/system ]; then
systemctl list-timers | grep -iq logrotate
if [ $? -eq 0 ]; then
IS_SYSTEMD=1
fi
return #done configuring for non-systemd timers
else
fi
if [ "$IS_SYSTEMD" -eq 1 ]; then
# Configure systemd timers - the typical setup for modern Linux logrotation setups
if [ -f /lib/systemd/system/logrotate.timer ]; then
if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then
sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
fi
cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null
cat << EOF > /lib/systemd/system/logrotate.timer
[Unit]
Description=Hourly rotation of log files
Documentation=man:logrotate(8) man:logrotate.conf(5)
@@ -98,12 +80,19 @@ Persistent=true
[Install]
WantedBy=timers.target
EOF
sudo systemctl reenable --now logrotate.timer
systemctl reenable --now logrotate.timer
else
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
"$packageName logs (when turned on) will not rotate properly."
fi
return #done configuring for systemd timers
else
# $IS_SYSTEMD -eq 0
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -d /etc/cron.hourly ]; then
mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
fi
fi
fi
}
@@ -118,8 +107,7 @@ do_ldconfig() {
case "$1" in
( configure )
do_ldconfig
do_addLogFolder
do_configureLogrotate
do_configureLogrotate || return 0
;;
( abort-upgrade | abort-remove | abort-deconfigure )
echo "$1"
+19 -25
Просмотреть файл
@@ -1,29 +1,4 @@
#!/bin/bash
set -e
packageName="amd-smi-lib"
logPath=/var/log/amd_smi_lib
logName=AMD-SMI-lib.log
logFile="${logPath}/${logName}"
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
rm_logFolder() {
sudo rm -rf "$logPath"
}
return_logrotateToOrigConfig() {
if [ -f $logrotateConfFile ]; then
sudo rm -rf "${logrotateConfFile}"
fi
if [ -f /etc/cron.hourly/logrotate ]; then
sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
fi
if [ -f /lib/systemd/system/logrotate.timer.backup ]; then
sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer
sudo rm -rf /lib/systemd/system/logrotate.timer.backup
sudo systemctl reenable --now logrotate.timer
fi
}
rm_ldconfig() {
# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
@@ -39,6 +14,25 @@ rm_pyc() {
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__
}
# Delete the AMD SMI log directory together with everything inside it.
# `rm -rf` succeeds even when the directory is already gone.
rm_logFolder() {
    local logDir=/var/log/amd_smi_lib
    rm -rf "${logDir}"
}
# Put the system's logrotate setup back the way it was:
#   1. drop the AMD SMI logrotate configuration file, if present;
#   2. move an hourly logrotate cron script back to cron.daily, if present;
#   3. restore logrotate.timer from its .backup copy, delete the backup and
#      re-enable the timer, if a backup exists.
# Plain `if` statements are used on purpose: a false condition leaves the
# function's exit status at 0.
return_logrotateToOrigConfig() {
    local amdSmiConf=/etc/logrotate.d/amd_smi.conf
    local hourlyCron=/etc/cron.hourly/logrotate
    local dailyCron=/etc/cron.daily/logrotate
    local timerUnit=/lib/systemd/system/logrotate.timer
    local timerBackup="${timerUnit}.backup"

    if [[ -f "${amdSmiConf}" ]]; then
        rm -rf "${amdSmiConf}"
    fi

    if [[ -f "${hourlyCron}" ]]; then
        mv "${hourlyCron}" "${dailyCron}"
    fi

    if [[ -f "${timerBackup}" ]]; then
        cp "${timerBackup}" "${timerUnit}"
        rm -rf "${timerBackup}"
        systemctl reenable --now logrotate.timer
    fi
}
case "$1" in
( remove | upgrade)
+37 -54
Просмотреть файл
@@ -1,21 +1,18 @@
#!/bin/bash
#set -x
packageName="amd-smi-lib"
logPath=/var/log/amd_smi_lib
logName=AMD-SMI-lib.log
logFile="${logPath}/${logName}"
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
do_addLogFolder() {
sudo mkdir -p "${logPath}"
sudo touch "${logFile}"
sudo chmod -R a+rw "${logPath}"
sudo chmod a+rw "${logFile}"
}
do_configureLogrotate() {
logrotate --version &>/dev/null
local IS_SYSTEMD=0
local packageName="amd-smi-lib"
local logPath=/var/log/amd_smi_lib
local logFile="${logPath}/AMD-SMI-lib.log"
local logrotateConfFile=/etc/logrotate.d/amd_smi.conf
mkdir -p "${logPath}"
touch "${logFile}"
chmod -R a+rw "${logPath}"
chmod a+rw "${logFile}"
command -v logrotate &>/dev/null
if [ $? -ne 0 ]; then
echo "[WARNING] Detected logrotate is not installed."\
"$packageName logs (when turned on) will not rotate properly."
@@ -23,14 +20,14 @@ do_configureLogrotate() {
fi
if [ ! -f $logrotateConfFile ]; then
sudo touch "${logrotateConfFile}"
sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read
touch "${logrotateConfFile}"
chmod 644 "${logrotateConfFile}" # root r/w, all others read
# AMD SMI logging rotation, rotates files using root user/group
# Hourly logrotation check
# Only rotates if size grew larger than 1MB
# Max of 4 rotation files, oldest will be removed
# Rotated files use a date extension, e.g. AMD-SMI-lib.log.2023-05-09_16:51:42
cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null
cat << EOF > "${logrotateConfFile}"
${logFile} {
su root root
hourly
@@ -47,44 +44,29 @@ EOF
# issue was RPM build thought we were using macros
# https://gitlab.kitware.com/cmake/cmake/-/issues/22965
# https://rpm-software-management.github.io/rpm/manual/spec.html
sudo sed -i s/%%/%/g "${logrotateConfFile}"
sed -i s/%%/%/g "${logrotateConfFile}"
# workaround: remove extra 'OURCE' text
# from amd_smi.conf. Unsure if CMAKE,
# bash, or here document
# issue (only seen on RHEL 8.7)
sudo sed -i s/OURCE//g "${logrotateConfFile}"
sed -i s/OURCE//g "${logrotateConfFile}"
fi
# check if logrotate uses system timers, Ubuntu/modern OS's do
# Several older OS's like RHEL 8.7, do not. Instead defaults
# to use daily cron jobs - see https://stackoverflow.com/a/69465677
sudo systemctl list-timers|grep -iq logrotate
if [ $? -ne 0 ]; then
# confirm logrotate file exists in daily
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -d /etc/cron.hourly ]; then
sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
else
echo "[WARNING] Could find and configure hourly cron for $packageName's"\
" logrotate. $packageName logs (when turned on) will not rotate properly."
return
fi
else
# confirm that it's already been moved to hourly
sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly
if [ $? -ne 0 ]; then
echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\
"$packageName logs (when turned on) may not rotate properly."
fi
if [ -d /run/systemd/system ]; then
systemctl list-timers | grep -iq logrotate
if [ $? -eq 0 ]; then
IS_SYSTEMD=1
fi
return #done configuring for non-systemd timers
else
fi
if [ "$IS_SYSTEMD" -eq 1 ]; then
# Configure systemd timers - the typical setup for modern Linux logrotation setups
if [ -f /lib/systemd/system/logrotate.timer ]; then
if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then
sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup
fi
cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null
cat << EOF > /lib/systemd/system/logrotate.timer
[Unit]
Description=Hourly rotation of log files
Documentation=man:logrotate(8) man:logrotate.conf(5)
@@ -98,12 +80,19 @@ Persistent=true
[Install]
WantedBy=timers.target
EOF
sudo systemctl reenable --now logrotate.timer
systemctl reenable --now logrotate.timer
else
echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\
"$packageName logs (when turned on) will not rotate properly."
fi
return #done configuring for systemd timers
else
# $IS_SYSTEMD -eq 0
if [ -f /etc/cron.daily/logrotate ]; then
# move logrotate daily to hourly
if [ -d /etc/cron.hourly ]; then
mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate
fi
fi
fi
}
@@ -115,14 +104,8 @@ do_ldconfig() {
fi
}
# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build
if [ "@ENABLE_LDCONFIG@" == "ON" ]; then
echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf
ldconfig
fi
# post install or upgrade, $i is 1 or 2 -> do these actions
if [ $1 -ge 1 ]; then
do_addLogFolder
do_configureLogrotate
if [ "$1" -ge 1 ]; then
do_ldconfig
do_configureLogrotate || return 0
fi
+1 -1
Просмотреть файл
@@ -1,7 +1,7 @@
#!/bin/bash
# second term originates from ENABLE_LDCONFIG = ON/OFF at package build
if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then
if [ "$1" -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then
# perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
rm -f /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf
ldconfig
+20 -28
Просмотреть файл
@@ -1,31 +1,4 @@
#!/bin/bash
#set -x
set -e
packageName="amd-smi-lib"
logPath=/var/log/amd_smi_lib
logName=AMD-SMI-lib.log
logFile="${logPath}/${logName}"
logrotateConfFile=/etc/logrotate.d/amd_smi.conf
rm_logFolder() {
sudo rm -rf "$logPath"
}
return_logrotateToOrigConfig() {
if [ -f $logrotateConfFile ]; then
sudo rm -rf "${logrotateConfFile}"
fi
if [ -f /etc/cron.hourly/logrotate ]; then
sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate
fi
if [ -f /lib/systemd/system/logrotate.timer.backup ]; then
sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer
sudo rm -rf /lib/systemd/system/logrotate.timer.backup
sudo systemctl reenable --now logrotate.timer
fi
}
rm_pyc() {
# remove pyc files generated by python
@@ -33,7 +6,26 @@ rm_pyc() {
rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__
}
if [ $1 -le 1 ]; then
# Delete the AMD SMI log directory together with everything inside it.
# `rm -rf` succeeds even when the directory is already gone.
rm_logFolder() {
    local logDir=/var/log/amd_smi_lib
    rm -rf "${logDir}"
}
# Put the system's logrotate setup back the way it was:
#   1. drop the AMD SMI logrotate configuration file, if present;
#   2. move an hourly logrotate cron script back to cron.daily, if present;
#   3. restore logrotate.timer from its .backup copy, delete the backup and
#      re-enable the timer, if a backup exists.
# Plain `if` statements are used on purpose: a false condition leaves the
# function's exit status at 0.
return_logrotateToOrigConfig() {
    local amdSmiConf=/etc/logrotate.d/amd_smi.conf
    local hourlyCron=/etc/cron.hourly/logrotate
    local dailyCron=/etc/cron.daily/logrotate
    local timerUnit=/lib/systemd/system/logrotate.timer
    local timerBackup="${timerUnit}.backup"

    if [[ -f "${amdSmiConf}" ]]; then
        rm -rf "${amdSmiConf}"
    fi

    if [[ -f "${hourlyCron}" ]]; then
        mv "${hourlyCron}" "${dailyCron}"
    fi

    if [[ -f "${timerBackup}" ]]; then
        cp "${timerBackup}" "${timerUnit}"
        rm -rf "${timerBackup}"
        systemctl reenable --now logrotate.timer
    fi
}
if [ "$1" -le 1 ]; then
# perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
rm_pyc
rm_logFolder
+2
Просмотреть файл
@@ -7,3 +7,5 @@
/_templates/
/html/
/latex/
404.md
data/AMD-404.png
+1
Просмотреть файл
@@ -0,0 +1 @@
docBin/
+1 -1
Просмотреть файл
@@ -844,7 +844,7 @@ int main() {
}
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "\t**Averge Power Usage: ";
std::cout << "\t**Average Power Usage: ";
ret = rsmi_dev_power_ave_get(i, 0, &val_ui64);
if (ret == RSMI_STATUS_SUCCESS) {
std::cout << static_cast<float>(val_ui64)/1000 << " W" << std::endl;
+99 -10
Просмотреть файл
@@ -480,6 +480,19 @@ typedef enum {
RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type
} rsmi_temperature_type_t;
/**
 * @brief Activity (utilization) metric selectors.
 *
 * Each enumerator identifies one activity metric and occupies its own
 * single bit.
 * NOTE(review): the one-bit-per-metric layout suggests the values may be
 * combined into a mask — confirm against rsmi_dev_activity_metric_get().
 */
typedef enum {
  RSMI_ACTIVITY_GFX = (1 << 0),  //!< GFX activity
  RSMI_ACTIVITY_UMC = (1 << 1),  //!< memory controller
  RSMI_ACTIVITY_MM  = (1 << 2)   //!< UVD or VCN
} rsmi_activity_metric_t;
/**
* @brief Voltage Metrics. This enum is used to identify various
* Voltage metrics. Corresponding values will be in millivolts.
@@ -788,6 +801,17 @@ typedef struct {
typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth;
/// \endcond
/**
* @brief This structure holds the average activity (utilization) counters,
* one 16-bit field per activity metric.
*
* NOTE(review): the unit and valid range of the counters are not stated
* here — confirm against the implementation before relying on them.
*/
typedef struct {
/* Utilization */
uint16_t average_gfx_activity; //!< GFX activity average
uint16_t average_umc_activity; //!< memory controller
uint16_t average_mm_activity; //!< UVD or VCN
} rsmi_activity_metric_counter_t;
/**
* @brief This structure holds version information.
*/
@@ -898,14 +922,28 @@ struct metrics_table_header_t {
#define RSMI_GPU_METRICS_API_FORMAT_VER 1
// The content version increments when gpu_metrics is extended with new and/or
// existing field sizes are changed.
/**
* @brief The GPU metrics version 1
*/
#define RSMI_GPU_METRICS_API_CONTENT_VER_1 1
/**
* @brief The GPU metrics version 2
*/
#define RSMI_GPU_METRICS_API_CONTENT_VER_2 2
/**
* @brief The GPU metrics version 3
*/
#define RSMI_GPU_METRICS_API_CONTENT_VER_3 3
// This should match NUM_HBM_INSTANCES
/**
* @brief This should match NUM_HBM_INSTANCES
*/
#define RSMI_NUM_HBM_INSTANCES 4
// Unit conversion factor for HBM temperatures
/**
* @brief Unit conversion factor for HBM temperatures
*/
#define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000
typedef struct {
@@ -964,7 +1002,7 @@ typedef struct {
uint16_t padding; // new in v1
uint32_t gfx_activity_acc; // new in v1
uint32_t mem_actvity_acc; // new in v1
uint32_t mem_activity_acc; // new in v1
uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1
/// \endcond
} rsmi_gpu_metrics_t;
@@ -2288,7 +2326,7 @@ rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent);
* If the function returns RSMI_STATUS_SUCCESS, the counter will be set in the value field of
* the rsmi_utilization_counter_t.
*
* @param[in] count The size of @utilization_counters array.
* @param[in] count The size of utilization_counters array.
*
* @param[inout] timestamp The timestamp when the counter is retrieved. Resolution: 1 ns.
* @retval ::RSMI_STATUS_SUCCESS call was successful
@@ -2303,6 +2341,57 @@ rsmi_utilization_count_get(uint32_t dv_ind,
uint32_t count,
uint64_t *timestamp);
/**
* @brief Get activity metric average utilization counter of the specified device
*
* @details Given a device index @p dv_ind and an activity metric type
* @p activity_metric_type, this function returns the requested utilization
* counters in @p activity_metric_counter.
*
* @param[in] dv_ind a device index
*
* @param[in] activity_metric_type a metric type
*
* @param[inout] activity_metric_counter Multiple utilization counters can be retrieved with a single
* call. The caller must allocate enough space for the rsmi_activity_metric_counter_t structure.
*
* If the function returns RSMI_STATUS_SUCCESS, the counter for each requested
* metric type will be set in the corresponding field of the
* rsmi_activity_metric_counter_t structure.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t
rsmi_dev_activity_metric_get(uint32_t dv_ind,
rsmi_activity_metric_t activity_metric_type,
rsmi_activity_metric_counter_t* activity_metric_counter);
/**
* @brief Get activity metric bandwidth average utilization counter of the specified device
*
* @details Given a device index @p dv_ind, this function writes the average
* utilization counter to the memory pointed to by @p avg_activity.
*
* NOTE(review): the function name suggests the value is the MM average
* activity — confirm against the implementation.
*
* @param[in] dv_ind a device index
*
* @param[inout] avg_activity a pointer to caller-provided memory to which the
* average bandwidth utilization counter will be written
*
* If the function returns RSMI_STATUS_SUCCESS, the counter will be set in the
* value pointed to by @p avg_activity.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t
rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity);
/**
* @brief Get the performance level of the device with provided
* device index.
@@ -2450,7 +2539,7 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind,
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind);
rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind);
/**
* @brief This function retrieves the voltage/frequency curve information
@@ -2684,7 +2773,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind,
*
*/
rsmi_status_t
rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_lvl);
rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl);
/**
* @brief Set the PowerPlay performance level associated with the device with
@@ -2750,7 +2839,7 @@ rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl);
* @retval ::RSMI_STATUS_PERMISSION function requires root access
*
*/
rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od);
rsmi_status_t rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od);
/**
* @brief Set the overdrive percent associated with the device with provided
@@ -3398,7 +3487,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
* @brief Get the info of a process on a specific device.
*
* @details Given a process id @p pid, a @p dv_ind, this function will
* write the process information for @p pid on the device, if available, to
* write the process information for pid on the device, if available, to
* the memory pointed to by @p proc.
*
* @param[in] pid The process id of the process for which the gpu
@@ -3406,7 +3495,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
*
* @param[in] dv_ind a device index on which the process is running.
*
* @param[inout] procs a pointer to memory provided by the caller to which
* @param[inout] proc a pointer to memory provided by the caller to which
* process information will be written.
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call
@@ -3598,7 +3687,7 @@ rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst,
*
* @details Given a source device index @p dv_ind_src and
* a destination device index @p dv_ind_dst, and a pointer to a
* bool @accessible, this function will write the P2P connection status
* bool @p accessible, this function will write the P2P connection status
* between the device @p dv_ind_src and @p dv_ind_dst to the memory
* pointed to by @p accessible.
*
+1 -1
Просмотреть файл
@@ -90,7 +90,7 @@
/* This group of macros is used to facilitate checking of support for rsmi_dev*
* "getter" functions. When the return buffer is set to nullptr, the macro will
* check the previously gathered device support data to see if the function,
* with possible variants (e.g., memory types, firware types,...) and
* with possible variants (e.g., memory types, firmware types,...) and
* subvariants (e.g. monitors/sensors) are supported.
*/
// This macro assumes dev already available
+4
Просмотреть файл
@@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_count);
int
ReadKFDDeviceProperties(uint32_t dev_id, std::vector<std::string> *retVec);
int read_node_properties(uint32_t node, std::string property_name,
uint64_t *val);
int get_gpu_id(uint32_t node, uint64_t *gpu_id);
} // namespace smi
} // namespace amd
+2 -1
Просмотреть файл
@@ -113,7 +113,8 @@ class RocmSMI {
uint64_t *weight);
int get_node_index(uint32_t dv_ind, uint32_t *node_ind);
const RocmSMI_env_vars& getEnv(void);
void printEnvVarInfo(void);
std::string getRSMIEnvVarInfo(void);
void debugRSMIEnvVarInfo();
bool isLoggingOn(void);
uint32_t getLogSetting(void);
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
+5 -1
Просмотреть файл
@@ -99,7 +99,8 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type,
rsmi_status_t ErrnoToRsmiStatus(int err);
std::string getRSMIStatusString(rsmi_status_t ret);
std::tuple<bool, std::string, std::string, std::string, std::string,
std::string, std::string, std::string, std::string>
std::string, std::string, std::string, std::string,
std::string, std::string, std::string>
getSystemDetails(void);
void logSystemDetails(void);
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
@@ -107,6 +108,9 @@ rsmi_status_t getBDFWithDomain(uint64_t bdf_id, std::string& bfd_str);
void logHexDump(const char *desc, const void *addr, const size_t len,
size_t perLine);
bool isSystemBigEndian();
std::string getBuildType();
std::string getMyLibPath();
int subDirectoryCountInPath(const std::string path);
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation=true) {
std::stringstream ss;
+163 -100
Просмотреть файл
@@ -173,10 +173,12 @@ def formatMatrixToJSON(deviceList, matrix, metricName):
printSysLog(metricName.format(deviceList[row_indx], deviceList[col_ind]), valueStr)
def getBus(device):
def getBus(device, silent=False):
""" Return the bus identifier of a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
bdfid = c_uint64(0)
ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid))
@@ -188,16 +190,18 @@ def getBus(device):
function = bdfid.value & 0x7
pic_id = '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, device, function)
if rsmi_ret_ok(ret, device, 'get_pci_id'):
if rsmi_ret_ok(ret, device, 'get_pci_id', silent):
return pic_id
def getFanSpeed(device):
def getFanSpeed(device, silent=True):
""" Return a tuple with the fan speed (value,%) for a specified device,
or (None,None) if either current fan speed or max fan speed cannot be
obtained
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is on.
"""
fanLevel = c_int64()
fanMax = c_int64()
@@ -209,7 +213,7 @@ def getFanSpeed(device):
/sys/class/drm/cardX/device/hwmon/hwmonX/pwmX
"""
ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel))
if rsmi_ret_ok(ret, device, 'get_fan_speed', True):
if rsmi_ret_ok(ret, device, 'get_fan_speed', silent):
fl = fanLevel.value
last_ret = ret
@@ -217,7 +221,7 @@ def getFanSpeed(device):
/sys/class/drm/cardX/device/hwmon/hwmonX/pwmX
"""
ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax))
if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True):
if rsmi_ret_ok(ret, device, 'get_fan_max_speed', silent):
fm = fanMax.value
""" In case we had an error before, we don't overwrite it with a
@@ -232,59 +236,67 @@ def getFanSpeed(device):
return (last_ret, fl, round((float(fl) / float(fm)) * 100, 2))
def getGpuUse(device):
def getGpuUse(device, silent=False):
""" Return the current GPU usage as a percentage
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
percent = c_uint32()
ret = rocmsmi.rsmi_dev_busy_percent_get(device, byref(percent))
if rsmi_ret_ok(ret, device, 'GPU Utilization '):
if rsmi_ret_ok(ret, device, 'GPU Utilization ', silent):
return percent.value
return -1
def getId(device):
def getId(device, silent=False):
""" Return the hexadecimal value of a device's ID
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
dv_id = c_short()
ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
if rsmi_ret_ok(ret, device, 'get_device_id'):
if rsmi_ret_ok(ret, device, 'get_device_id', silent):
return hex(dv_id.value)
def getRev(device):
def getRev(device, silent=False):
""" Return the hexadecimal value of a device's Revision
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
dv_rev = c_short()
ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev))
if rsmi_ret_ok(ret, device, 'get_device_rev'):
if rsmi_ret_ok(ret, device, 'get_device_rev', silent):
return hex(dv_rev.value)
def getMaxPower(device):
def getMaxPower(device, silent=False):
""" Return the maximum power cap of a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
power_cap = c_uint64()
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
if rsmi_ret_ok(ret, device, 'get_power_cap'):
if rsmi_ret_ok(ret, device, 'get_power_cap', silent):
return power_cap.value / 1000000
return -1
def getMemInfo(device, memType, quiet=False):
def getMemInfo(device, memType, silent=False):
""" Returns a tuple of (memory_used, memory_total) of
the requested memory type usage for the device specified
@param device: DRM device identifier
@param type: [vram|vis_vram|gtt] Memory type to return
@param quiet=Turn on to silience error output
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off,
which exposes any issue accessing the different
memory types.
@@ -300,11 +312,11 @@ def getMemInfo(device, memType, quiet=False):
memTotal = None
ret = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(memoryUse))
if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), quiet):
if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), silent):
memUsed = memoryUse.value
ret = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(memoryTot))
if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), quiet):
if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), silent):
memTotal = memoryTot.value
return (memUsed, memTotal)
@@ -334,14 +346,16 @@ def getProcessName(pid):
return pName
def getPerfLevel(device):
def getPerfLevel(device, silent=False):
""" Return the current performance level of a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
perf = rsmi_dev_perf_level_t()
ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf))
if rsmi_ret_ok(ret, device, 'get_perf_level'):
if rsmi_ret_ok(ret, device, 'get_perf_level', silent):
return perf_level_string(perf.value)
return 'N/A'
@@ -369,42 +383,48 @@ def getPidList():
return
def getPower(device):
def getPower(device, silent=False):
""" Return the current power level of a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
power = c_uint32()
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power))
if rsmi_ret_ok(ret, device, 'get_power_avg'):
if rsmi_ret_ok(ret, device, 'get_power_avg', silent):
return power.value / 1000000
return 'N/A'
def getRasEnablement(device, block):
def getRasEnablement(device, block, silent=True):
""" Return RAS enablement state for a given device
@param device: DRM device identifier
@param block: RAS block identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is on.
"""
state = rsmi_ras_err_state_t()
ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state))
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True):
if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), silent):
return rsmi_ras_err_stale_machine[state.value].upper()
return 'N/A'
def getTemp(device, sensor):
def getTemp(device, sensor, silent=True):
""" Display the current temperature from a given device's sensor
@param device: DRM device identifier
@param sensor: Temperature sensor identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is on.
"""
temp = c_int64(0)
metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT
ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(temp))
if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), True):
if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), silent):
return temp.value / 1000
return 'N/A'
@@ -428,52 +448,60 @@ def findFirstAvailableTemp(device):
continue
return (ret_temp_type, ret_temp)
def getVbiosVersion(device):
def getVbiosVersion(device, silent=False):
""" Returns the VBIOS version for a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
vbios = create_string_buffer(256)
ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256)
if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
return "Unsupported"
elif rsmi_ret_ok(ret, device):
elif rsmi_ret_ok(ret, device, silent=silent):
return vbios.value.decode()
def getVersion(deviceList, component):
def getVersion(deviceList, component, silent=False):
""" Return the software version for the specified component
@param deviceList: List of DRM devices (can be a single-item list)
@param component: Component (currently only driver)
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
ver_str = create_string_buffer(256)
ret = rocmsmi.rsmi_version_str_get(component, ver_str, 256)
if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component)):
if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component), silent):
return ver_str.value.decode()
return None
def getComputePartition(device):
def getComputePartition(device, silent=True):
""" Return the current compute partition of a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is on.
"""
currentComputePartition = create_string_buffer(256)
ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256)
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode():
if rsmi_ret_ok(ret, device, 'get_compute_partition', silent) and currentComputePartition.value.decode():
return str(currentComputePartition.value.decode())
return "N/A"
def getMemoryPartition(device):
def getMemoryPartition(device, silent=True):
""" Return the current memory partition of a given device
@param device: DRM device identifier
@param silent=Turn on to silence error output
(you plan to handle manually). Default is on.
"""
currentNPSMode = create_string_buffer(256)
ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256)
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode():
if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent) and currentNPSMode.value.decode():
return str(currentNPSMode.value.decode())
return "N/A"
@@ -610,10 +638,21 @@ def printLog(device, metricName, value=None, extraSpace=False, useItalics=False)
lock.acquire()
if useItalics:
logstr = italics + logstr + end
if extraSpace:
print('\n' + logstr + '\n', end='', flush=True)
else:
print(logstr + '\n', end='', flush=True)
try:
if extraSpace:
print('\n', end='')
print(logstr + '\n', end='')
sys.stdout.flush()
# when piped into programs like 'head' - print throws an error.
# silently ignore instead
except(BrokenPipeError, IOError):
# https://docs.python.org/3/library/signal.html#note-on-sigpipe
# Python flushes standard streams on exit; redirect remaining output
# to devnull to avoid another BrokenPipeError at shutdown
devnull = os.open(os.devnull, os.O_WRONLY)
os.dup2(devnull, sys.stdout.fileno())
sys.exit(1) # Python exits with error code 1 on EPIPE
lock.release()
@@ -785,12 +824,10 @@ def resetFans(deviceList):
for device in deviceList:
sensor_ind = c_uint32(0)
ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind)
if (ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED) or (ret == rsmi_status_t.RSMI_STATUS_PERMISSION):
if not rsmi_ret_ok(rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED, device, 'reset_fan'):
continue
if rsmi_ret_ok(ret, device, silent=True):
printLog(device, 'Successfully reset fan speed to driver control', None)
else:
if rsmi_ret_ok(ret, device, 'reset_fan'):
printLog(device, 'Successfully reset fan speed to driver control', None)
printLog(device, 'Not supported on the given system', None)
printLogSpacer()
@@ -1311,8 +1348,10 @@ def setFanSpeed(deviceList, fan):
else:
fanLevel = int(str(fan))
ret = rocmsmi.rsmi_dev_fan_speed_set(device, 0, int(fanLevel))
if rsmi_ret_ok(ret, device, 'set_fan_speed'):
if rsmi_ret_ok(ret, device, silent=True):
printLog(device, 'Successfully set fan speed to level %s' % (str(int(fanLevel))), None)
else:
printLog(device, 'Not supported on the given system', None)
printLogSpacer()
@@ -1595,10 +1634,13 @@ def showAllConcise(deviceList):
MAX_ALL_CONCISE_WIDTH = 100
appWidth_temp = appWidth
appWidth = MAX_ALL_CONCISE_WIDTH
silent = True
printLogSpacer(' Concise Info ')
deviceList.sort()
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
temp_type = '(' + temp_type_lst[0] + ')'
if len(deviceList) >= 1:
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
available_temp_type = temp_type.lower()
available_temp_type = available_temp_type.replace('(', '')
available_temp_type = available_temp_type.replace(')', '')
@@ -1620,9 +1662,9 @@ def showAllConcise(deviceList):
values = {}
degree_sign = u'\N{DEGREE SIGN}'
for device in deviceList:
gpu_dev_product_info = getDevProductInfo(device)
gpu_dev_product_info = getDevProductInfo(device, silent)
gpu_dev_product_info_names = list(gpu_dev_product_info[device])
temp_val = str(getTemp(device, available_temp_type))
temp_val = str(getTemp(device, available_temp_type, silent))
if temp_val != 'N/A':
temp_val += degree_sign + 'C'
avgPwr = str(getPower(device))
@@ -1630,26 +1672,25 @@ def showAllConcise(deviceList):
avgPwr += 'W'
else:
avgPwr = 'N/A'
combined_partition = (getMemoryPartition(device) + ", "
+ getComputePartition(device))
concise = True
sclk = showCurrentClocks([device], 'sclk', concise)
mclk = showCurrentClocks([device], 'mclk', concise)
(retCode, fanLevel, fanSpeed) = getFanSpeed(device)
combined_partition = (getMemoryPartition(device, silent) + ", "
+ getComputePartition(device, silent))
sclk = showCurrentClocks([device], 'sclk', concise=silent)
mclk = showCurrentClocks([device], 'mclk', concise=silent)
(retCode, fanLevel, fanSpeed) = getFanSpeed(device, silent)
fan = str(fanSpeed) + '%'
if getPerfLevel(device) != -1:
perf = getPerfLevel(device)
if getPerfLevel(device, silent) != -1:
perf = getPerfLevel(device, silent)
else:
perf = 'Unsupported'
if getMaxPower(device) != -1:
pwrCap = str(getMaxPower(device)) + 'W'
if getMaxPower(device, silent) != -1:
pwrCap = str(getMaxPower(device, silent)) + 'W'
else:
pwrCap = 'Unsupported'
if getGpuUse(device) != -1:
gpu_busy = str(getGpuUse(device)) + '%'
if getGpuUse(device, silent) != -1:
gpu_busy = str(getGpuUse(device, silent)) + '%'
else:
gpu_busy = 'Unsupported'
vram_used, vram_total = getMemInfo(device, 'vram', True)
vram_used, vram_total = getMemInfo(device, 'vram', silent)
mem_use_pct = 0
if vram_used is None:
mem_use_pct='Unsupported'
@@ -1683,7 +1724,7 @@ def showAllConcise(deviceList):
for device in deviceList:
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
zip(range(len(max_widths)), values['card%s' % (str(device))])), None)
gpu_dev_product_info = getDevProductInfo(device)
gpu_dev_product_info = getDevProductInfo(device, silent)
gpu_dev_product_info_names = list(gpu_dev_product_info[device])
if (len(gpu_dev_product_info_names) > 1):
printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in
@@ -1707,19 +1748,20 @@ def showAllConciseHw(deviceList):
header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS']
head_widths = [len(head) + 2 for head in header]
values = {}
silent = True
for device in deviceList:
gpuid = getId(device)
gpuid = getId(device, silent)
if str(gpuid).startswith('0x'):
gpuid = str(gpuid)[2:]
gpurev = getRev(device)
gpurev = getRev(device, silent)
if str(gpurev).startswith('0x'):
gpurev = str(gpurev)[2:]
gfxRas = getRasEnablement(device, 'GFX')
sdmaRas = getRasEnablement(device, 'SDMA')
umcRas = getRasEnablement(device, 'UMC')
vbios = getVbiosVersion(device)
bus = getBus(device)
gfxRas = getRasEnablement(device, 'GFX', silent)
sdmaRas = getRasEnablement(device, 'SDMA', silent)
umcRas = getRasEnablement(device, 'UMC', silent)
vbios = getVbiosVersion(device, silent)
bus = getBus(device, silent)
values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus]
val_widths = {}
for device in deviceList:
@@ -1760,15 +1802,19 @@ def showClocks(deviceList):
for clk_type in sorted(rsmi_clk_names_dict):
if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
for x in range(freq.num_supported):
fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
if x == freq.current:
printLog(device, str(x), str(fr) + ' *')
else:
printLog(device, str(x), str(fr))
printLog(device, '', None)
if ret == rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA:
printLog(device, 'Clock [%s] on device [%s] exists but EMPTY! Likely driver error!' % (clk_type, str(device)))
continue
if not rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
continue
printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
for x in range(freq.num_supported):
fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
if x == freq.current:
printLog(device, str(x), str(fr) + ' *')
else:
printLog(device, str(x), str(fr))
printLog(device, '', None)
else:
logging.debug('{} frequency is unsupported on device[{}]'.format(clk_type, device))
printLog(device, '', None)
@@ -1814,8 +1860,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
if concise: # in case function is used for concise output, no need to print.
return '{:.0f}Mhz'.format(fr)
printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(levl, fr))
else:
printErrLog(device, '%s clock is unsupported' % (clk_defined))
elif not concise:
logging.debug('{} clock is unsupported on device[{}]'.format(clk_defined, device))
else: # if clk is not defined, will display all current clk
for clk_type in sorted(rsmi_clk_names_dict):
@@ -1832,7 +1878,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
printLog(device, '%s clock level:' % (clk_type), levl)
else:
printLog(device, '%s clock level: %s' % (clk_type, levl), '(%sMhz)' % (str(fr)[:-2]))
else:
elif not concise:
logging.debug('{} clock is unsupported on device[{}]'.format(clk_type, device))
# pcie clocks
if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1:
@@ -1845,9 +1891,10 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000,
bw.lanes[current_f])
printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr))
else:
logging.debug('PCIe clock is unsupported on device[{}]'.format(device))
printLogSpacer()
elif not concise:
logging.debug('{} clock is unsupported on device[{}]'.format('PCIe', device))
if not concise:
printLogSpacer()
def showCurrentFans(deviceList):
@@ -2113,6 +2160,7 @@ def showMemUse(deviceList):
@param deviceList: List of DRM devices (can be a single-item list)
"""
memoryUse = c_uint64()
avgMemBandwidth = c_uint16()
printLogSpacer(' Current Memory Use ')
for device in deviceList:
ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
@@ -2124,6 +2172,12 @@ def showMemUse(deviceList):
printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val)
else:
printLog(device, 'Memory Activity', 'N/A')
ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgMemBandwidth))
if rsmi_ret_ok(ret, device, silent=True):
printLog(device, 'Avg. Memory Bandwidth', avgMemBandwidth.value)
else:
printLog(device, 'Not supported on the given system', None)
printLogSpacer()
@@ -2404,47 +2458,51 @@ def showProductName(deviceList):
printLogSpacer()
def getDevProductInfo(device):
def getDevProductInfo(device, silent=False):
""" Show the requested product name for the device requested
@param device: Device we want to get the info for
@param silent=Turn on to silence error output
(you plan to handle manually). Default is off.
"""
# Retrieve card vendor
MAX_BUFF_SIZE = 256
MAX_DESC_SIZE = 20
device_info = "N/A"
device_series = "N/A"
device_model = "N/A"
gpu_revision = "N/A"
device_list = {}
vendor = create_string_buffer(MAX_BUFF_SIZE)
ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, MAX_BUFF_SIZE)
# Only continue if GPU vendor is AMD
if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device):
if rsmi_ret_ok(ret, device, 'get_vendor_name', silent) and isAmdDevice(device):
# Retrieve the device series
series = create_string_buffer(MAX_BUFF_SIZE)
ret = rocmsmi.rsmi_dev_name_get(device, series, MAX_BUFF_SIZE)
if rsmi_ret_ok(ret, device, 'get_name'):
if rsmi_ret_ok(ret, device, 'get_name', silent):
try:
device_series = series.value.decode()
except UnicodeDecodeError:
device_series = "N/A"
printErrLog(device, "Unable to read card series")
if not silent:
printErrLog(device, "Unable to read card series")
# Retrieve the device model
model = create_string_buffer(MAX_BUFF_SIZE)
ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, MAX_BUFF_SIZE)
if rsmi_ret_ok(ret, device, 'get_subsystem_name'):
if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent):
try:
device_model = model.value.decode()
device_model = padHexValue(device_model, 4)
except UnicodeDecodeError:
device_model = "N/A"
printErrLog(device, "Unable to read device model")
if not silent:
printErrLog(device, "Unable to read device model")
try:
gpu_revision = padHexValue(getRev(device), 2)
except Exception as exc:
gpu_revision = "N/A"
printErrLog(device, "Unable to read card revision %s" % (exc))
if not silent:
printErrLog(device, "Unable to read card revision %s" % (exc))
device_series_str = str(device_series[:MAX_DESC_SIZE])
device_series_str = device_series_str.ljust(MAX_DESC_SIZE, ' ')
@@ -2790,7 +2848,9 @@ def getGraphColor(percentage):
def showTempGraph(deviceList):
deviceList.sort()
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
temp_type = '(' + temp_type_lst[0] + ')'
if len(deviceList) >= 1:
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
temp_type = temp_type.lower()
temp_type = temp_type.replace('(', '')
@@ -3381,7 +3441,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
@param my_ret: Return of RSMI call (rocm_smi_lib API)
@param metric: Parameter of GPU currently being analyzed
@param silent: Echo verbose error reponse.
True siliences err output, False does not silience err output (default).
True silences err output, False does not silence err output (default).
"""
global RETCODE
global PRINT_JSON
@@ -3398,8 +3458,8 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False):
if err_str.value is not None:
returnString += '%s\t' % (err_str.value.decode())
if not PRINT_JSON:
logging.debug('%s', returnString)
if not silent:
logging.debug('%s', returnString)
if my_ret in rsmi_status_verbose_err_out:
printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None)
RETCODE = my_ret
@@ -3465,8 +3525,7 @@ def save(deviceList, savefilepath):
# The code below is for when this script is run as an executable instead of when imported as a module
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='AMD ROCm System Management Interface | ROCM-SMI version: %s | Kernel version: %s' % (
__version__, getVersion(None, rsmi_sw_component_t.RSMI_SW_COMP_DRIVER)),
description=f'AMD ROCm System Management Interface | ROCM-SMI version: {__version__}',
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=90, width=120))
groupDev = parser.add_argument_group()
groupDisplayOpt = parser.add_argument_group('Display Options')
@@ -3626,6 +3685,11 @@ if __name__ == '__main__':
args = parser.parse_args()
# Must set PRINT_JSON early so the prints can be silenced
if args.json or args.csv:
PRINT_JSON = True
# Initialize rsmiBindings
rocmsmi = initRsmiBindings(silent=PRINT_JSON)
# Initialize the rocm SMI library
initializeRsmi()
@@ -3661,8 +3725,7 @@ if __name__ == '__main__':
sys.exit(1)
# If we want JSON/CSV output, initialize the keys (devices)
if args.json or args.csv:
PRINT_JSON = True
if PRINT_JSON:
for device in deviceList:
JSON_DATA['card' + str(device)] = {}
+33 -26
Просмотреть файл
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
"""ROCm_SMI_LIB CLI Tool Python Bindings"""
# NOTE: You MUST call rsmiBindings.initRsmiBindings() when using this library!
# TODO: Get most (or all) of these from rocm_smi.h to avoid mismatches and redundancy
from __future__ import print_function
@@ -14,36 +15,42 @@ import os
# relative path changed accordingly.
# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location
#
# Library load is wrapped in a function so prints can be hidden for PRINT_JSON mode.
path_librocm = str()
rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
if (rocm_smi_lib_path != None):
path_librocm = rocm_smi_lib_path
else:
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
def initRsmiBindings(silent=False):
def print_silent(*args):
    """ Print the given values to stdout unless silent mode is on

    Reads `silent` from the surrounding scope (the `silent` parameter of
    initRsmiBindings, which is declared directly above this helper).

    @param args: Values to print; forwarded to print() as separate
    positional arguments, so a single message string prints as-is.
    """
    if not silent:
        # Unpack args: print(args) would emit the tuple's repr,
        # e.g. "('Library loaded from: ...',)", instead of the message.
        print(*args)
if not os.path.isfile(path_librocm):
print('Unable to find %s . Trying /opt/rocm*' % path_librocm)
for root, dirs, files in os.walk('/opt', followlinks=True):
if 'librocm_smi64.so.@VERSION_MAJOR@' in files:
path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@')
if os.path.isfile(path_librocm):
print('Using lib from %s' % path_librocm)
rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH')
if (rocm_smi_lib_path != None):
path_librocm = rocm_smi_lib_path
else:
print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')
else:
print('Library loaded from: %s ' % path_librocm)
path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@'
# ----------> TODO: Support static libs as well as SO
try:
cdll.LoadLibrary(path_librocm)
rocmsmi = CDLL(path_librocm)
except OSError:
print('Unable to load the rocm_smi library.\n'\
'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\
'{0}Please refer to https://github.com/'\
'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\
.format('\33[33m', '\033[0m'))
exit()
if not os.path.isfile(path_librocm):
print_silent('Unable to find %s . Trying /opt/rocm*' % path_librocm)
for root, dirs, files in os.walk('/opt', followlinks=True):
if 'librocm_smi64.so.@VERSION_MAJOR@' in files:
path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@')
if os.path.isfile(path_librocm):
print_silent('Using lib from %s' % path_librocm)
else:
print('Unable to find librocm_smi64.so.@VERSION_MAJOR@')
else:
print_silent('Library loaded from: %s ' % path_librocm)
# ----------> TODO: Support static libs as well as SO
try:
cdll.LoadLibrary(path_librocm)
return CDLL(path_librocm)
except OSError:
print('Unable to load the rocm_smi library.\n'\
'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\
'{0}Please refer to https://github.com/'\
'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\
.format('\33[33m', '\033[0m'))
exit()
# Device ID
dv_id = c_uint64()
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+13 -13
Просмотреть файл
@@ -41,20 +41,20 @@
*
*/
#include <assert.h>
#include <string.h>
#include <linux/perf_event.h>
#include <unistd.h>
#include <asm/unistd.h>
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <stdio.h>
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <unordered_set>
#include "rocm_smi/rocm_smi.h"
@@ -164,8 +164,7 @@ GetSupportedEventGroups(uint32_t dev_num, dev_evt_grp_set_t *supported_grps) {
}
// /sys/bus/event_source/devices/<hw block>_<instance>/type
Event::Event(rsmi_event_type_t event, uint32_t dev_ind) :
event_type_(event), prev_cntr_val_(0) {
fd_ = -1;
event_type_(event), fd_(-1), prev_cntr_val_(0) {
rsmi_event_group_t grp = EvtGrpFromEvtID(event);
assert(grp != RSMI_EVNT_GRP_INVALID); // This should have failed before now
@@ -398,10 +397,11 @@ readn(int fd, void *buf, size_t n) {
return static_cast<ssize_t>(n - left);
}
if (bytes < 0) {
if (errno == EINTR) /* read got interrupted */
if (errno == EINTR) {
/* read got interrupted */
continue;
else
return -errno;
}
return -errno;
}
left -= static_cast<size_t>(bytes);
+50 -56
Просмотреть файл
@@ -43,30 +43,28 @@
#include <pthread.h>
#include <unistd.h>
#include <sys/types.h>
#include <assert.h>
#include <sys/stat.h>
#include <stdint.h>
#include <sys/types.h>
#include <string>
#include <map>
#include <fstream>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>
#include <memory>
#include <algorithm>
#include <iterator>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_kfd.h"
#include "rocm_smi/rocm_smi_logger.h"
#include "shared_mutex.h" // NOLINT
@@ -611,7 +609,6 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
bool reg_file;
int ret = isRegularFile(sysfs_path, &reg_file);
if (ret != 0) {
ss << "File did not exist - SYSFS file (" << sysfs_path
<< ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type)
@@ -708,7 +705,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) {
int ret;
std::ostringstream ss;
fs.rdbuf()->pubsetbuf(0,0);
fs.rdbuf()->pubsetbuf(nullptr,0);
ret = openSysfsFileStream(type, &fs, valStr.c_str());
if (ret != 0) {
ss << "Could not write device info string (" << valStr
@@ -822,7 +819,8 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
FILE *ptr;
sysfs_path += "/device/";
sysfs_path += kDevAttribNameMap.at(type);
ptr = fopen(sysfs_path.c_str(), "rb");
ptr = fopen(sysfs_path.c_str(), "rb");
if (!ptr) {
ss << "Could not read DevInfoBinary for DevInfoType ("
<< RocmSMI::devInfoTypesStrings.at(type) << ")"
@@ -874,21 +872,21 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
retVec->push_back(line);
}
if (retVec->size() == 0) {
if (retVec->empty()) {
ss << "Read devInfoMultiLineStr for DevInfoType ("
<< RocmSMI::devInfoTypesStrings.at(type) << ")"
<< ", but contained no string lines";
LOG_INFO(ss);
return 0;
LOG_ERROR(ss);
return ENXIO;
}
// Remove any *trailing* empty (whitespace) lines
while (retVec->size() != 0 &&
while (!retVec->empty() &&
retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) {
retVec->pop_back();
}
// allow logging output of multiline strings
for (auto l: *retVec) {
for (const auto& l: *retVec) {
allLines += "\n" + l;
}
@@ -902,6 +900,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
<< RocmSMI::devInfoTypesStrings.at(type) << ")"
<< ", but lines were empty";
LOG_INFO(ss);
return ENXIO;
}
return 0;
}
@@ -924,10 +923,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
if (tempStr == "") {
if (tempStr.empty()) {
return EINVAL;
}
tmp_val = std::stoi(tempStr, 0, 16);
tmp_val = std::stoi(tempStr, nullptr, 16);
if (tmp_val < 0) {
return EINVAL;
}
@@ -949,10 +948,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
case kDevXGMIError:
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
if (tempStr == "") {
if (tempStr.empty()) {
return EINVAL;
}
*val = std::stoul(tempStr, 0);
*val = std::stoul(tempStr, nullptr);
break;
case kDevUniqueId:
@@ -979,10 +978,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
case kDevFwVersionVcn:
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
if (tempStr == "") {
if (tempStr.empty()) {
return EINVAL;
}
*val = std::stoul(tempStr, 0, 16);
*val = std::stoul(tempStr, nullptr, 16);
break;
case kDevGpuReset:
@@ -1120,13 +1119,9 @@ void Device::DumpSupportedFunctions(void) {
}
void Device::fillSupportedFuncs(void) {
if (supported_funcs_.size() != 0) {
if (!supported_funcs_.empty()) {
return;
}
if (monitor() == nullptr) {
return;
}
std::map<const char *, dev_depends_t>::const_iterator it =
kDevFuncDependsMap.begin();
std::string dev_rt = path_ + "/device";
@@ -1160,7 +1155,7 @@ void Device::fillSupportedFuncs(void) {
std::vector<DevInfoTypes>::const_iterator var =
it->second.variants.begin();
if (it->second.variants.size() == 0) {
if (it->second.variants.empty()) {
supported_funcs_[it->first] = nullptr;
it++;
continue;
@@ -1176,13 +1171,15 @@ void Device::fillSupportedFuncs(void) {
(*supported_variants)[kDevInfoVarTypeToRSMIVariant.at(*var)] = nullptr;
}
if ((*supported_variants).size() > 0) {
if (!(*supported_variants).empty()) {
supported_funcs_[it->first] = supported_variants;
}
it++;
}
monitor()->fillSupportedFuncs(&supported_funcs_);
if (monitor() != nullptr) {
monitor()->fillSupportedFuncs(&supported_funcs_);
}
// DumpSupportedFunctions();
}
@@ -1222,35 +1219,32 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant,
if (sub_variant == RSMI_DEFAULT_VARIANT) {
return true;
} else { // sub_variant != RSMI_DEFAULT_VARIANT
// if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr
assert(var_it->second != nullptr);
}
// sub_variant != RSMI_DEFAULT_VARIANT
// if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr
assert(var_it->second != nullptr);
return subvariant_match(&(var_it->second), sub_variant);
}
} else { // variant == RSMI_DEFAULT_VARIANT
if (func_it->second != nullptr) {
var_it = func_it->second->find(variant);
}
if (sub_variant == RSMI_DEFAULT_VARIANT) {
return true;
} else { // sub_variant != RSMI_DEFAULT_VARIANT
if (func_it->second == nullptr) {
return false;
}
return subvariant_match(&(var_it->second), sub_variant);
}
return subvariant_match(&(var_it->second), sub_variant);
}
assert(false); // We should not reach here
return false;
// variant == RSMI_DEFAULT_VARIANT
if (func_it->second != nullptr) {
var_it = func_it->second->find(variant);
}
if (sub_variant == RSMI_DEFAULT_VARIANT) {
return true;
}
// sub_variant != RSMI_DEFAULT_VARIANT
if (func_it->second == nullptr) {
return false;
}
return subvariant_match(&(var_it->second), sub_variant);
}
rsmi_status_t Device::restartAMDGpuDriver(void) {
REQUIRE_ROOT_ACCESS
bool restartSuccessful = true;
bool success = false;
std::string out = "";
std::string out;
bool wasGdmServiceActive = false;
// sudo systemctl is-active gdm
+12 -15
Просмотреть файл
@@ -41,24 +41,22 @@
*
*/
#include <assert.h>
#include <dirent.h>
#include <pthread.h>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <string>
#include <cstdint>
#include <map>
#include <iostream>
#include <algorithm>
#include <map>
#include <regex> // NOLINT
#include <string>
#include <vector>
#include <pthread.h>
#include <string.h>
#include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_logger.h"
@@ -151,7 +149,7 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2,
const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3,
const rsmi_gpu_metrics_t *rsmi_gpu_metrics) {
if (RocmSMI::getInstance().isLoggingOn() == false) {
if (!RocmSMI::getInstance().isLoggingOn()) {
return;
}
std::ostringstream ss;
@@ -171,9 +169,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
}
if (rsmi_gpu_metrics == nullptr) {
return;
} else {
// do nothing - continue
}
ss
/* Common Header */
<< print_unsigned_hex_and_int(
@@ -291,8 +288,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
rsmi_gpu_metrics->gfx_activity_acc,
"rsmi_gpu_metrics->gfx_activity_acc")
<< print_unsigned_hex_and_int(
rsmi_gpu_metrics->mem_actvity_acc,
"rsmi_gpu_metrics->mem_actvity_acc");
rsmi_gpu_metrics->mem_activity_acc,
"rsmi_gpu_metrics->mem_activity_acc");
for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) {
ss << print_unsigned_hex_and_int(
rsmi_gpu_metrics->temperature_hbm[i],
@@ -366,7 +363,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
}
#define ASSIGN_DATA_FIELD(FIELD, SRC) \
data->FIELD = SRC->FIELD;
data->FIELD = (SRC)->FIELD;
#define ASSIGN_COMMON_FORMATS(SRC) \
ASSIGN_DATA_FIELD(common_header, (SRC)) \
@@ -417,7 +414,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
// These fields didn't exist in v0
data->gfx_activity_acc = 0;
data->mem_actvity_acc = 0;
data->mem_activity_acc = 0;
(void)memset(data->temperature_hbm, 0,
RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t));
} // else handle other conversions to format 1
+19 -21
Просмотреть файл
@@ -41,20 +41,19 @@
*
*/
#include <assert.h>
#include <sys/stat.h>
#include <dirent.h>
#include <sys/stat.h>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_set>
#include <fstream>
#include <cstdint>
#include <iostream>
#include <sstream>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_io_link.h"
@@ -161,7 +160,7 @@ static int ReadLinkProperties(uint32_t node_indx, uint32_t link_indx,
retVec->push_back(line);
}
if (retVec->size() == 0) {
if (retVec->empty()) {
fs.close();
return 0;
}
@@ -182,7 +181,7 @@ static int DiscoverLinks(std::map<std::pair<uint32_t, uint32_t>,
if (links == nullptr) {
return EINVAL;
}
assert(links->size() == 0);
assert(links->empty());
links->clear();
@@ -229,8 +228,8 @@ static int DiscoverLinks(std::map<std::pair<uint32_t, uint32_t>,
}
link_indx = static_cast<uint32_t>(std::stoi(dentry_io_link->d_name));
link = std::shared_ptr<IOLink>(new IOLink(node_indx, link_indx,
directory));
link = std::make_shared<IOLink>(node_indx, link_indx,
directory);
link->Initialize();
@@ -273,7 +272,7 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map<uint32_t,
if (links == nullptr) {
return EINVAL;
}
assert(links->size() == 0);
assert(links->empty());
links->clear();
@@ -297,8 +296,8 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map<uint32_t,
}
link_indx = static_cast<uint32_t>(std::stoi(dentry->d_name));
link = std::shared_ptr<IOLink>(new IOLink(node_indx, link_indx,
directory));
link = std::make_shared<IOLink>(node_indx, link_indx,
directory);
link->Initialize();
@@ -323,16 +322,15 @@ int DiscoverP2PLinksPerNode(uint32_t node_indx, std::map<uint32_t,
return DiscoverLinksPerNode(node_indx, links, P2P_LINK_DIRECTORY);
}
IOLink::~IOLink() {
}
IOLink::~IOLink() = default;
int IOLink::ReadProperties(void) {
int ret;
std::vector<std::string> propVec;
assert(properties_.size() == 0);
if (properties_.size() > 0) {
assert(properties_.empty());
if (!properties_.empty()) {
return 0;
}
@@ -347,8 +345,8 @@ int IOLink::ReadProperties(void) {
uint64_t val_int; // Assume all properties are unsigned integers for now
std::istringstream fs;
for (uint32_t i = 0; i < propVec.size(); ++i) {
fs.str(propVec[i]);
for (const auto & i : propVec) {
fs.str(i);
fs >> key_str;
fs >> val_int;
+107 -21
Просмотреть файл
@@ -41,28 +41,29 @@
*
*/
#include <assert.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_set>
#include <fstream>
#include <cstdint>
#include <iostream>
#include <sstream>
#include "rocm_smi/rocm_smi_io_link.h"
#include "rocm_smi/rocm_smi_kfd.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_logger.h"
namespace amd {
namespace smi {
@@ -195,7 +196,7 @@ int ReadKFDDeviceProperties(uint32_t kfd_node_id,
retVec->push_back(line);
}
if (retVec->size() == 0) {
if (retVec->empty()) {
fs.close();
return ENOENT;
}
@@ -517,7 +518,7 @@ int DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes) {
if (nodes == nullptr) {
return EINVAL;
}
assert(nodes->size() == 0);
assert(nodes->empty());
nodes->clear();
@@ -548,7 +549,7 @@ int DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes) {
continue;
}
node = std::shared_ptr<KFDNode>(new KFDNode(node_indx));
node = std::make_shared<KFDNode>(node_indx);
node->Initialize();
@@ -596,16 +597,15 @@ int DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes) {
return 0;
}
KFDNode::~KFDNode() {
}
KFDNode::~KFDNode() = default;
int KFDNode::ReadProperties(void) {
int ret;
std::vector<std::string> propVec;
assert(properties_.size() == 0);
if (properties_.size() > 0) {
assert(properties_.empty());
if (!properties_.empty()) {
return 0;
}
@@ -620,8 +620,8 @@ int KFDNode::ReadProperties(void) {
uint64_t val_int; // Assume all properties are unsigned integers for now
std::istringstream fs;
for (uint32_t i = 0; i < propVec.size(); ++i) {
fs.str(propVec[i]);
for (const auto & i : propVec) {
fs.str(i);
fs >> key_str;
fs >> val_int;
@@ -776,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth,
// /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties
// size_in_bytes 68702699520
int KFDNode::get_total_memory(uint64_t* total) {
if (total == nullptr) return EINVAL;
std::ostringstream ss;
if (total == nullptr) {
return EINVAL;
}
*total = 0;
std::string f_path = kKFDNodesPathRoot;
f_path += "/";
f_path += std::to_string(node_indx_);
f_path += "/mem_banks";
int subDirCount = subDirectoryCountInPath(f_path);
ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path
<< " has subdirectory count = " << std::to_string(subDirCount);
LOG_DEBUG(ss);
auto kfd_node_dir = opendir(f_path.c_str());
if (kfd_node_dir == nullptr) {
return errno;
}
auto dentry = readdir(kfd_node_dir);
while (dentry != nullptr) {
while (dentry != nullptr && subDirCount > 0) {
ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path
<< " has subdirectory count = " << std::to_string(subDirCount);
LOG_DEBUG(ss);
if (dentry->d_name[0] == '.') {
dentry = readdir(kfd_node_dir);
continue;
@@ -823,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) {
}
}
} // end loop for lines in property file
subDirCount--;
} // end loop for mem_bank directory
if (closedir(kfd_node_dir)) {
@@ -863,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) {
return 1;
}
// /sys/class/kfd/kfd/topology/nodes/*/properties
int read_node_properties(uint32_t node, std::string property_name,
uint64_t *val) {
std::ostringstream ss;
int retVal = EINVAL;
if (property_name.empty() || val == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", property_name is empty or *val is nullptr "
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
return retVal;
}
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
myNode->Initialize();
if (KFDNodeSupported(node)) {
retVal = myNode->get_property_value(property_name, val);
ss << __PRETTY_FUNCTION__
<< " | Successfully read node #" << std::to_string(node)
<< " for property_name = " << property_name
<< " | Data (" << property_name << ") * val = "
<< std::to_string(*val)
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
} else {
retVal = 1;
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", KFD node was an unsupported node."
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_ERROR(ss);
}
return retVal;
}
// /sys/class/kfd/kfd/topology/nodes/*/gpu_id
int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
std::ostringstream ss;
int retVal = EINVAL;
if (gpu_id == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", gpu_id is a nullptr "
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
return retVal;
}
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
myNode->Initialize();
if (KFDNodeSupported(node)) {
retVal = ReadKFDGpuId(node, gpu_id);
ss << __PRETTY_FUNCTION__
<< " | Successfully read node #" << std::to_string(node)
<< " for gpu_id"
<< " | Data (gpu_id) *gpu_id = "
<< std::to_string(*gpu_id)
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
} else {
retVal = 1;
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", KFD node was an unsupported node."
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_ERROR(ss);
}
return retVal;
}
} // namespace smi
} // namespace amd
+11 -12
Просмотреть файл
@@ -55,7 +55,7 @@
* be printed, unless RSMI_LOGGING is enabled.
*
* BUFFER log type should be use while logging raw buffer or raw messages
* Having direct interface as well as C++ Singleton inface. Can use
* Having direct interface as well as C++ Singleton iface. Can use
* whatever interface fits your needs.
*/
@@ -70,7 +70,6 @@
// Code Specific Header Files(s)
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_utils.h"
using namespace ROCmLogging;
@@ -117,7 +116,7 @@ void Logger::logIntoFile(std::string& data) {
if(!m_File.is_open()) {
initialize_resources();
if (!m_File.is_open()) {
std::cout << "WARNING: re-initializing resources was unsuccessfull."
std::cout << "WARNING: re-initializing resources was unsuccessful."
<<" Unable to print the following message." << std::endl;
logOnConsole(data);
unlock();
@@ -164,7 +163,7 @@ void Logger::error(const char* text) throw() {
// By default, logging is disabled
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
if (m_loggingIsOn == false) {
if (!m_loggingIsOn) {
return;
}
@@ -198,7 +197,7 @@ void Logger::alarm(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
if (m_loggingIsOn == false) {
if (!m_loggingIsOn) {
return;
}
@@ -232,7 +231,7 @@ void Logger::always(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
if (m_loggingIsOn == false) {
if (!m_loggingIsOn) {
return;
}
@@ -270,7 +269,7 @@ void Logger::buffer(const char* text) throw() {
if(!m_File.is_open()) {
initialize_resources();
if (!m_File.is_open()) {
std::cout << "WARNING: re-initializing resources was unsuccessfull."
std::cout << "WARNING: re-initializing resources was unsuccessful."
<<" Unable to print the following message." << std::endl;
std::string txtStr(text);
std::cout << txtStr << std::endl;
@@ -300,7 +299,7 @@ void Logger::info(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
if (m_loggingIsOn == false) {
if (!m_loggingIsOn) {
return;
}
@@ -334,7 +333,7 @@ void Logger::trace(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
if (m_loggingIsOn == false) {
if (!m_loggingIsOn) {
return;
}
@@ -368,7 +367,7 @@ void Logger::debug(const char* text) throw() {
// By default, logging is disabled (ie. no RSMI_LOGGING)
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
if (m_loggingIsOn == false) {
if (!m_loggingIsOn) {
return;
}
@@ -426,7 +425,7 @@ void Logger::enableFileLogging() {
// Returns a string of details on current log settings
std::string Logger::getLogSettings() {
std::string logSettings = "";
std::string logSettings;
if (m_File.is_open()) {
logSettings += "OpenStatus = File (" + logFileName + ") is open";
@@ -490,7 +489,7 @@ void Logger::initialize_resources() {
// The check below allows us to toggle logging through RSMI_LOGGING
// set or unset
m_loggingIsOn = amd::smi::RocmSMI::getInstance().isLoggingOn();
if (m_loggingIsOn == false) {
if (!m_loggingIsOn) {
return;
}
m_File.open(logFileName.c_str(), std::ios::out | std::ios::app);
+183 -72
Просмотреть файл
@@ -39,25 +39,26 @@
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <dirent.h>
#include <assert.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <stdlib.h>
#include <string>
#include <cstdint>
#include <memory>
#include <fstream>
#include <vector>
#include <set>
#include <utility>
#include <functional>
#include <dirent.h>
#include <sys/types.h>
#include <unistd.h>
#include <cassert>
#include <cerrno>
#include <unordered_map>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_device.h"
@@ -285,7 +286,8 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
// We are looking for the last element in the path that has the form
// XXXX:XX:XX.X, where X is a hex integer (lower case is expected)
std::size_t slash_i, end_i;
std::size_t slash_i;
std::size_t end_i;
std::string tmp;
std::string tpath_str(tpath);
@@ -332,9 +334,9 @@ RocmSMI::Initialize(uint64_t flags) {
GetEnvVariables();
// To help debug env variable issues
// printEnvVarInfo();
// debugRSMIEnvVarInfo();
while (std::string(kAMDMonitorTypes[i]) != "") {
while (!std::string(kAMDMonitorTypes[i]).empty()) {
amd_monitor_types_.insert(kAMDMonitorTypes[i]);
++i;
}
@@ -348,12 +350,12 @@ RocmSMI::Initialize(uint64_t flags) {
}
uint64_t bdfid;
for (uint32_t i = 0; i < devices_.size(); ++i) {
if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) {
for (auto & device : devices_) {
if (ConstructBDFID(device->path(), &bdfid) != 0) {
std::cerr << "Failed to construct BDFID." << std::endl;
ret = 1;
} else {
devices_[i]->set_bdfid(bdfid);
device->set_bdfid(bdfid);
}
}
if (ret != 0) {
@@ -389,7 +391,7 @@ RocmSMI::Initialize(uint64_t flags) {
uint64_t bdfid = (*dev_iter)->bdfid();
if (tmp_map.find(bdfid) == tmp_map.end()) {
ss << __PRETTY_FUNCTION__ << " | removing device = "
<< (*dev_iter)->path();
<< (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid);
dev_iter = devices_.erase(dev_iter);
LOG_DEBUG(ss);
continue;
@@ -444,8 +446,7 @@ RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags),
kfd_notif_evt_fh_(-1), kfd_notif_evt_fh_refcnt_(0) {
}
RocmSMI::~RocmSMI() {
}
RocmSMI::~RocmSMI() = default;
RocmSMI& RocmSMI::getInstance(uint64_t flags) {
// Assume c++11 or greater. static objects will be created by only 1 thread
@@ -494,7 +495,7 @@ static inline std::unordered_set<uint32_t> GetEnvVarUIntegerSets(
if(ev_str == nullptr) { return returnSet; }
std::string stringEnv = ev_str;
if (stringEnv.empty() == false) {
if (!stringEnv.empty()) {
// parse out values by commas
std::string parsedVal;
std::istringstream ev_str_ss(stringEnv);
@@ -549,48 +550,54 @@ uint32_t RocmSMI::getLogSetting() {
return this->env_vars_.logging_on;
}
void RocmSMI::printEnvVarInfo(void) {
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = "
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_output_bitfield))
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = "
<< ((env_vars_.path_DRM_root_override == nullptr)
? "<undefined>" : env_vars_.path_DRM_root_override)
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = "
<< ((env_vars_.path_HWMon_root_override == nullptr)
? "<undefined>" : env_vars_.path_HWMon_root_override)
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = "
<< ((env_vars_.path_power_root_override == nullptr)
? "<undefined>" : env_vars_.path_power_root_override)
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = "
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_inf_loop))
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
void RocmSMI::debugRSMIEnvVarInfo(void) {
std::cout << __PRETTY_FUNCTION__
<< RocmSMI::getInstance().getRSMIEnvVarInfo();
}
std::string RocmSMI::getRSMIEnvVarInfo(void) {
std::ostringstream ss;
ss << "\n\tRSMI_DEBUG_BITFIELD = "
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_output_bitfield))
<< std::endl;
ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = "
<< ((env_vars_.path_DRM_root_override == nullptr)
? "<undefined>" : env_vars_.path_DRM_root_override)
<< std::endl;
ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = "
<< ((env_vars_.path_HWMon_root_override == nullptr)
? "<undefined>" : env_vars_.path_HWMon_root_override)
<< std::endl;
ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = "
<< ((env_vars_.path_power_root_override == nullptr)
? "<undefined>" : env_vars_.path_power_root_override)
<< std::endl;
ss << "\tRSMI_DEBUG_INFINITE_LOOP = "
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_inf_loop))
<< std::endl;
ss << "\tRSMI_LOGGING = "
<< getLogSetting() << std::endl;
bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
<< (isLoggingOn ? "true" : "false") << std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
ss << "\tRSMI_LOGGING (are logs on) = "
<< (isLoggingOn ? "TRUE" : "FALSE") << std::endl;
ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {";
if (env_vars_.enum_overrides.empty()) {
std::cout << "}" << std::endl;
return;
ss << "}" << std::endl;
return ss.str();
}
for (auto it=env_vars_.enum_overrides.begin();
it != env_vars_.enum_overrides.end(); ++it) {
DevInfoTypes type = static_cast<DevInfoTypes>(*it);
std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type)
+ ")");
ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")");
auto temp_it = it;
if(++temp_it != env_vars_.enum_overrides.end()) {
std::cout << ", ";
ss << ", ";
}
}
std::cout << "}" << std::endl;
ss << "}" << std::endl;
return ss.str();
}
std::shared_ptr<Monitor>
@@ -638,7 +645,7 @@ RocmSMI::FindMonitor(std::string monitor_path) {
fs.close();
if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) {
m = std::shared_ptr<Monitor>(new Monitor(mon_name, &env_vars_));
m = std::make_shared<Monitor>(mon_name, &env_vars_);
m->setTempSensorLabelMap();
m->setVoltSensorLabelMap();
break;
@@ -666,12 +673,12 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
dev_path += "/";
dev_path += dev_name;
auto dev = std::shared_ptr<Device>(new Device(dev_path, &env_vars_));
auto dev = std::make_shared<Device>(dev_path, &env_vars_);
std::shared_ptr<Monitor> m = FindMonitor(dev_path + "/device/hwmon");
dev->set_monitor(m);
std::string d_name = dev_name;
const std::string& d_name = dev_name;
uint32_t card_indx = GetDeviceIndex(d_name);
dev->set_drm_render_minor(GetDrmRenderMinor(dev_path));
dev->set_card_index(card_indx);
@@ -682,8 +689,6 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
<< dev_name << " | path = " << dev_path
<< " | card index = " << std::to_string(card_indx) << " | ";
LOG_DEBUG(ss);
return;
}
static const uint32_t kAmdGpuId = 0x1002;
@@ -694,8 +699,7 @@ static bool isAMDGPU(std::string dev_path) {
std::string vend_path = dev_path + "/device/vendor";
if (!FileExists(vend_path.c_str())) {
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
@@ -705,8 +709,7 @@ static bool isAMDGPU(std::string dev_path) {
if (!fs.is_open()) {
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
@@ -721,8 +724,7 @@ static bool isAMDGPU(std::string dev_path) {
isAmdGpu = true;
}
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
@@ -730,6 +732,7 @@ static bool isAMDGPU(std::string dev_path) {
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
std::string err_msg;
uint32_t count = 0;
std::ostringstream ss;
// If this gets called more than once, clear previous findings.
devices_.clear();
@@ -756,17 +759,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
}
dentry = readdir(drm_dir);
}
ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
<< std::to_string(count) << " cards" << " | ";
LOG_DEBUG(ss);
struct systemNode {
uint32_t s_node_id = 0;
uint64_t s_gpu_id = 0;
uint64_t s_unique_id = 0;
};
// allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id}
std::multimap<uint64_t, systemNode> allSystemNodes;
uint32_t node_id = 0;
while (true) {
uint64_t gpu_id = 0, unique_id = 0;
int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
if (ret_gpu_id == 0 || ret_unique_id == 0) {
systemNode myNode;
myNode.s_node_id = node_id;
myNode.s_gpu_id = gpu_id;
myNode.s_unique_id = unique_id;
if(gpu_id != 0) { // only add gpu nodes, 0 = CPU
allSystemNodes.emplace(unique_id, myNode);
}
} else {
break;
}
node_id++;
}
ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {";
for(auto i: allSystemNodes) {
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
<< "], "
;
}
ss << "}";
LOG_DEBUG(ss);
// Discover all root cards & gpu partitions associated with each
for (uint32_t node_id = 0; node_id < count; node_id++) {
std::string path = kPathDRMRoot;
path += "/card";
path += std::to_string(node_id);
uint64_t primary_unique_id = 0;
// each identified gpu card node is a primary node for
// potential matching unique ids
if (isAMDGPU(path) ||
(init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) {
std::string d_name = "card";
d_name += std::to_string(node_id);
AddToDeviceList(d_name);
}
ss << __PRETTY_FUNCTION__
<< " | Ordered system nodes seen in lookup = {";
for (auto i : allSystemNodes) {
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
<< "], ";
}
ss << "}";
LOG_DEBUG(ss);
uint64_t temp_primary_unique_id = 0;
if (allSystemNodes.empty()) {
continue;
}
// get lowest key 1st to keep order of nodes matching card
uint32_t lowest_NodeId = 0;
uint32_t curr_NodeId = 0;
for (auto it = allSystemNodes.begin(), end = allSystemNodes.end();
it != end; it = allSystemNodes.upper_bound(it->first)) {
curr_NodeId = it->second.s_node_id;
if (it == allSystemNodes.begin()) {
lowest_NodeId = it->second.s_node_id;
}
if (curr_NodeId <= lowest_NodeId) {
lowest_NodeId = curr_NodeId;
temp_primary_unique_id = it->second.s_unique_id;
}
}
ss << __PRETTY_FUNCTION__
<< " | lowest_NodeId = " << std::to_string(lowest_NodeId)
<< " | curr_NodeId = " << std::to_string(curr_NodeId)
<< " | temp_primary_unique_id = "
<< std::to_string(temp_primary_unique_id);
LOG_DEBUG(ss);
if (temp_primary_unique_id != 0) {
primary_unique_id = temp_primary_unique_id;
} else {
allSystemNodes.erase(primary_unique_id);
continue;
}
auto numb_nodes = allSystemNodes.count(primary_unique_id);
ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = "
<< std::to_string(primary_unique_id) << " has "
<< std::to_string(numb_nodes) << " known gpu nodes";
LOG_DEBUG(ss);
while (numb_nodes > 1) {
std::string secNode = "card";
secNode += std::to_string(node_id); // add the primary node id
AddToDeviceList(secNode);
numb_nodes--;
}
// remove already added nodes associated with current card
auto erasedNodes = allSystemNodes.erase(primary_unique_id);
ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = "
<< std::to_string(primary_unique_id) << " erased "
<< std::to_string(erasedNodes) << " nodes";
LOG_DEBUG(ss);
}
}
if (closedir(drm_dir)) {
@@ -790,7 +901,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
power_mons_.clear();
}
if (power_mons_.size() != 0) {
if (!power_mons_.empty()) {
return 0;
}
@@ -818,7 +929,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
if (FileExists(tmp.c_str())) {
std::shared_ptr<PowerMon> mon =
std::shared_ptr<PowerMon>(new PowerMon(mon_name, &env_vars_));
std::make_shared<PowerMon>(mon_name, &env_vars_);
power_mons_.push_back(mon);
mon->set_dev_index(GetDeviceIndex(dentry->d_name));
}
@@ -831,8 +942,8 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
return errno;
}
for (auto m : power_mons_) {
for (auto d : devices_) {
for (const auto& m : power_mons_) {
for (const auto& d : devices_) {
if (m->dev_index() == d->index()) {
d->set_power_monitor(m);
break;
+23 -25
Просмотреть файл
@@ -41,19 +41,18 @@
*
*/
#include <assert.h>
#include <dirent.h>
#include <fstream>
#include <string>
#include <cstdint>
#include <map>
#include <iostream>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <regex> // NOLINT
#include <string>
#include <vector>
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
@@ -286,8 +285,7 @@ static const std::map<const char *, monitor_depends_t> kMonFuncDependsMap = {
env_ = nullptr;
#endif
}
Monitor::~Monitor(void) {
}
Monitor::~Monitor(void) = default;
std::string
Monitor::MakeMonitorPath(MonitorTypes type, uint32_t sensor_id) {
@@ -339,7 +337,7 @@ Monitor::setTempSensorLabelMap(void) {
std::string type_str;
int ret;
if (temp_type_index_map_.size() > 0) {
if (!temp_type_index_map_.empty()) {
return 0; // We've already filled in the map
}
auto add_temp_sensor_entry = [&](uint32_t file_index) {
@@ -377,7 +375,7 @@ Monitor::setVoltSensorLabelMap(void) {
std::string type_str;
int ret;
if (volt_type_index_map_.size() > 0) {
if (!volt_type_index_map_.empty()) {
return 0; // We've already filled in the map
}
auto add_volt_sensor_entry = [&](uint32_t file_index) {
@@ -513,10 +511,10 @@ typedef enum {
static monitor_types getFuncType(std::string f_name) {
monitor_types ret = eDefaultMonitor;
if (f_name.compare("rsmi_dev_temp_metric_get") == 0) {
if (f_name == "rsmi_dev_temp_metric_get") {
ret = eTempMonitor;
}
if (f_name.compare("rsmi_dev_volt_metric_get") == 0) {
if (f_name == "rsmi_dev_volt_metric_get") {
ret = eVoltMonitor;
}
return ret;
@@ -617,22 +615,22 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) {
} else {
supported_monitors = intersect;
}
if (supported_monitors.size() > 0) {
for (uint32_t i = 0; i < supported_monitors.size(); ++i) {
if (!supported_monitors.empty()) {
for (unsigned long & supported_monitor : supported_monitors) {
if (m_type == eDefaultMonitor) {
assert(supported_monitors[i] > 0);
supported_monitors[i] |=
(supported_monitors[i] - 1) << MONITOR_TYPE_BIT_POSITION;
assert(supported_monitor > 0);
supported_monitor |=
(supported_monitor - 1) << MONITOR_TYPE_BIT_POSITION;
} else if (m_type == eTempMonitor) {
// Temp sensor file names are 1-based
assert(supported_monitors[i] > 0);
supported_monitors[i] |=
static_cast<uint64_t>(getTempSensorEnum(supported_monitors[i]))
assert(supported_monitor > 0);
supported_monitor |=
static_cast<uint64_t>(getTempSensorEnum(supported_monitor))
<< MONITOR_TYPE_BIT_POSITION;
} else if (m_type == eVoltMonitor) {
// Voltage sensor file names are 0-based
supported_monitors[i] |=
static_cast<uint64_t>(getVoltSensorEnum(supported_monitors[i]))
supported_monitor |=
static_cast<uint64_t>(getVoltSensorEnum(supported_monitor))
<< MONITOR_TYPE_BIT_POSITION;
} else {
assert(false); // Unexpected monitor type
@@ -643,10 +641,10 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) {
}
}
if (it->second.variants.size() == 0) {
if (it->second.variants.empty()) {
(*supported_funcs)[it->first] = nullptr;
supported_variants = nullptr; // Invoke destructor
} else if ((*supported_variants).size() > 0) {
} else if (!(*supported_variants).empty()) {
(*supported_funcs)[it->first] = supported_variants;
}
+5 -9
Просмотреть файл
@@ -41,17 +41,14 @@
*
*/
#include <assert.h>
#include <fstream>
#include <string>
#include <cassert>
#include <cstdint>
#include <map>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_exception.h"
@@ -70,8 +67,7 @@ static const std::map<PowerMonTypes, const char *> kMonitorNameMap = {
PowerMon::PowerMon(std::string path, RocmSMI_env_vars const *e) :
path_(path), env_(e) {
}
PowerMon::~PowerMon(void) {
}
PowerMon::~PowerMon(void) = default;
static int parse_power_str(std::string s, PowerMonTypes type, uint64_t *val) {
std::stringstream ss(s);
+141 -52
Просмотреть файл
@@ -40,27 +40,28 @@
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <assert.h>
#include <errno.h>
#include <sys/stat.h>
#include <unistd.h>
#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see
// _GNU_SOURCE functions which check
#include <dirent.h>
#include <dlfcn.h>
#include <glob.h>
#include <sys/stat.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <fstream>
#include <string>
#include <cstring>
#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <algorithm>
#include <vector>
#include <regex>
#include <iomanip>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_utils.h"
@@ -138,7 +139,7 @@ std::vector<std::string> globFilesExist(const std::string& filePattern) {
glob_t result_glob;
memset(&result_glob, 0, sizeof(result_glob));
if (glob(filePattern.c_str(), GLOB_TILDE, NULL, &result_glob) != 0) {
if (glob(filePattern.c_str(), GLOB_TILDE, nullptr, &result_glob) != 0) {
globfree(&result_glob);
// Leaving below to help debug issues discovering future glob file searches
// debugFilesDiscovered(fileNames);
@@ -146,7 +147,7 @@ std::vector<std::string> globFilesExist(const std::string& filePattern) {
}
for(size_t i = 0; i < result_glob.gl_pathc; ++i) {
fileNames.push_back(std::string(result_glob.gl_pathv[i]));
fileNames.emplace_back(result_glob.gl_pathv[i]);
}
globfree(&result_glob);
@@ -159,17 +160,26 @@ int isRegularFile(std::string fname, bool *is_reg) {
struct stat file_stat;
int ret;
assert(is_reg != nullptr);
ret = stat(fname.c_str(), &file_stat);
if (ret) {
return errno;
}
*is_reg = S_ISREG(file_stat.st_mode);
if (is_reg != nullptr) {
*is_reg = S_ISREG(file_stat.st_mode);
}
return 0;
}
int WriteSysfsStr(std::string path, std::string val) {
// isRegularFile() returns 0 on success, or the errno value from stat() on
// failure; any failure is reported to the caller as ENOENT.
auto is_regular_file_result = isRegularFile(path, nullptr);
if (is_regular_file_result != 0) {
return ENOENT;
}
std::ofstream fs;
int ret = 0;
std::ostringstream ss;
@@ -196,6 +206,13 @@ int WriteSysfsStr(std::string path, std::string val) {
}
int ReadSysfsStr(std::string path, std::string *retStr) {
// isRegularFile() returns 0 on success, or the errno value from stat() on
// failure; any failure is reported to the caller as ENOENT.
auto is_regular_file_result = isRegularFile(path, nullptr);
if (is_regular_file_result != 0) {
return ENOENT;
}
std::stringstream ss;
int ret = 0;
std::ostringstream oss;
@@ -381,7 +398,7 @@ std::string removeString(const std::string origStr,
// defaults to trim stdOut
std::pair<bool, std::string> executeCommand(std::string command, bool stdOut) {
char buffer[128];
std::string stdoutAndErr = "";
std::string stdoutAndErr;
bool successfulRun = true;
command = "stdbuf -i0 -o0 -e0 " + command; // remove stdOut and err buffering
@@ -411,14 +428,10 @@ std::pair<bool, std::string> executeCommand(std::string command, bool stdOut) {
return std::make_pair(successfulRun, stdoutAndErr);
}
// originalstring - string to search for substring
// originalString - string to search for substring
// substring - string looking to find
bool containsString(std::string originalString, std::string substring) {
if (originalString.find(substring) != std::string::npos) {
return true;
} else {
return false;
}
return (originalString.find(substring) != std::string::npos);
}
// Creates and stores supplied data into a temporary file (within /tmp/).
@@ -429,9 +442,9 @@ bool containsString(std::string originalString, std::string substring) {
// https://man7.org/linux/man-pages/man3/mkstemp.3.html
//
// Temporary file name format:
// <app prefix>_<state name>_<paramenter name>_<device id>
// <app prefix>_<state name>_<parameter name>_<device id>
// <app prefix> - prefix for our application's identifier (see kTmpFilePrefix)
// <paramenter name> - name of parameter being stored
// <parameter name> - name of parameter being stored
// <state name> - state at which the stored value captures
// <device index> - device identifier
//
@@ -466,9 +479,8 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName,
close(fd);
if (rc_write == -1) {
return RSMI_STATUS_FILE_ERROR;
} else {
return RSMI_STATUS_SUCCESS;
}
return RSMI_STATUS_SUCCESS;
}
std::vector<std::string> getListOfAppTmpFiles() {
@@ -477,16 +489,18 @@ std::vector<std::string> getListOfAppTmpFiles() {
struct dirent *ent;
std::vector<std::string> tmpFiles;
if ((dir = opendir(path.c_str())) != nullptr) {
// captures all files & directories under specified path
while ((ent = readdir(dir)) != nullptr) {
std::string fileDirName = ent->d_name;
// we only want our app specific files
if (containsString(fileDirName, kTmpFilePrefix)) {
tmpFiles.emplace_back(path + "/" + fileDirName);
} else {
continue;
}
dir = opendir(path.c_str());
if (dir == nullptr) {
return tmpFiles;
}
// captures all files & directories under specified path
while ((ent = readdir(dir)) != nullptr) {
std::string fileDirName = ent->d_name;
// we only want our app specific files
if (containsString(fileDirName, kTmpFilePrefix)) {
tmpFiles.emplace_back(path + "/" + fileDirName);
} else {
continue;
}
}
return tmpFiles;
@@ -515,7 +529,7 @@ std::vector<std::string> readEntireFile(std::string path) {
std::string line;
while (std::getline(inFileStream, line)) {
std::istringstream ss(line);
if(line.size() > 0) {
if (!line.empty()) {
fileContent.push_back(line);
}
}
@@ -527,7 +541,7 @@ std::vector<std::string> readEntireFile(std::string path) {
// and their content
void displayAppTmpFilesContent() {
std::vector<std::string> tmpFiles = getListOfAppTmpFiles();
if (tmpFiles.empty() == false) {
if (!tmpFiles.empty()) {
for (auto &x: tmpFiles) {
std::string out = readFile(x);
std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x
@@ -543,7 +557,7 @@ void displayAppTmpFilesContent() {
std::string debugVectorContent(std::vector<std::string> v) {
std::ostringstream ss;
ss << "Vector = {";
if (v.size() > 0) {
if (!v.empty()) {
for (auto it=v.begin(); it < v.end(); it++) {
ss << *it;
auto temp_it = it;
@@ -561,7 +575,7 @@ std::string debugVectorContent(std::vector<std::string> v) {
std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
std::ostringstream ss;
ss << "Vector = {";
if (v.size() > 0) {
if (!v.empty()) {
for (auto it=v.begin(); it < v.end(); it++) {
ss << (*it)->path();
auto temp_it = it;
@@ -576,7 +590,7 @@ std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
}
// Attempts to read application specific temporary file
// This method is to be used for reading (or determing if it exists),
// This method is to be used for reading (or determining if it exists),
// in order to keep file naming scheme consistent.
//
// dv_ind - device index
@@ -594,7 +608,7 @@ std::tuple<bool, std::string> readTmpFile(uint32_t dv_ind,
"_" + std::to_string(dv_ind);
std::string fileContent;
std::vector<std::string> tmpFiles = getListOfAppTmpFiles();
if (tmpFiles.empty() == false) {
if (!tmpFiles.empty()) {
for (auto &x: tmpFiles) {
if (containsString(x, tmpFileName)) {
fileContent = readFile(x);
@@ -629,15 +643,23 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
std::tuple<bool, std::string, std::string, std::string, std::string,
std::string, std::string, std::string, std::string>
std::string, std::string, std::string, std::string,
std::string, std::string, std::string>
getSystemDetails(void) {
struct utsname buf;
bool errorDetected = false;
std::string temp_data;
std::string sysname, nodename, release, version, machine;
std::string sysname;
std::string nodename;
std::string release;
std::string version;
std::string machine;
std::string domainName = "<undefined>";
std::string os_distribution = "<undefined>";
std::string endianness = "<undefined>";
std::string rocm_lib_path = "<undefined>";
std::string rocm_build_type = "<undefined>";
std::string rocm_env_variables = "<undefined>";
if (uname(&buf) < 0) {
errorDetected = true;
@@ -654,7 +676,7 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
std::string filePath = "/etc/os-release";
bool fileExists = FileExists(filePath.c_str());
if (fileExists == true) {
if (fileExists) {
std::vector<std::string> fileContent = readEntireFile(filePath);
for (auto &line: fileContent) {
if (line.find("PRETTY_NAME=") != std::string::npos) {
@@ -672,9 +694,13 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
endianness = "Little Endian, multi-bit symbols encoded as"
" little endian (LSB first)";
}
rocm_build_type = getBuildType();
rocm_lib_path = getMyLibPath();
rocm_env_variables = RocmSMI::getInstance().getRSMIEnvVarInfo();
return std::make_tuple(errorDetected, sysname, nodename, release,
version, machine, domainName, os_distribution,
endianness);
endianness, rocm_build_type, rocm_lib_path,
rocm_env_variables);
}
// If logging is enabled through RSMI_LOGGING environment variable.
@@ -683,9 +709,10 @@ void logSystemDetails(void) {
std::ostringstream ss;
bool errorDetected;
std::string sysname, node, release, version, machine, domain, distName,
endianness;
endianness, rocm_build_type, lib_path, rocm_env_vars;
std::tie(errorDetected, sysname, node, release, version, machine, domain,
distName, endianness) = getSystemDetails();
distName, endianness, rocm_build_type, lib_path,
rocm_env_vars) = getSystemDetails();
if (errorDetected == false) {
ss << "====== Gathered system details ============\n"
<< "SYSTEM NAME: " << sysname << "\n"
@@ -695,7 +722,10 @@ void logSystemDetails(void) {
<< "VERSION: " << version << "\n"
<< "MACHINE TYPE: " << machine << "\n"
<< "DOMAIN: " << domain << "\n"
<< "ENDIANNESS: " << endianness << "\n";
<< "ENDIANNESS: " << endianness << "\n"
<< "ROCM BUILD TYPE: " << rocm_build_type << "\n"
<< "ROCM-SMI-LIB PATH: " << lib_path << "\n"
<< "ROCM ENV VARIABLES: " << rocm_env_vars << "\n";
LOG_INFO(ss);
} else {
ss << "====== Gathered system details ============\n"
@@ -724,7 +754,7 @@ void logHexDump(
// Output description if given.
// if (desc != NULL) printf("%s:\n", desc);
if (desc != NULL) ss << "\n" << desc << "\n";
if (desc != nullptr) ss << "\n" << desc << "\n";
// Length checks.
if (len == 0) {
@@ -816,6 +846,36 @@ rsmi_status_t getBDFWithDomain(uint64_t bdf_id, std::string& bfd_str)
return result;
}
// Reports the build flavor selected at compile time: "debug" when the DEBUG
// macro is defined, "release" otherwise.
std::string getBuildType() {
#ifdef DEBUG
  return std::string("debug");
#else
  return std::string("release");
#endif
}
// Returns the path of the loaded object (shared library or executable) that
// contains this function, as reported by dladdr().
//
// Returns an empty string (never a null pointer) when the lookup fails or
// when the build does not define _GNU_SOURCE.
const char *my_fname(void) {
#ifdef _GNU_SOURCE
  Dl_info dl_info;
  // dladdr() returns 0 when the address cannot be matched to a loaded object;
  // dl_info is not filled in then, so it must not be read.
  if (dladdr(reinterpret_cast<void *>(my_fname), &dl_info) == 0 ||
      dl_info.dli_fname == nullptr) {
    return "";
  }
  return dl_info.dli_fname;
#else
  // A string literal has static storage duration; returning c_str() of a
  // local std::string here would hand the caller a dangling pointer.
  return "";
#endif
}
std::string getMyLibPath(void) {
std::string libName = "rocm-smi-lib";
std::string path = std::string(my_fname());
if (path.empty()) {
path = "Could not find library path for " + libName;
}
return path;
}
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
{
auto result = rsmi_status_t::RSMI_STATUS_SUCCESS;
@@ -837,6 +897,35 @@ rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
return result;
}
// Counts the entries directly inside `path` that stat as directories.
//
// The "." and ".." entries are ignored. An entry that cannot be stat'ed is
// reported through perror() and skipped.
// Returns the number of directories found, or -1 (after perror("opendir"))
// when `path` cannot be opened as a directory.
int subDirectoryCountInPath(const std::string path) {
  DIR *dir_handle = opendir(path.c_str());
  if (dir_handle == nullptr) {
    perror("opendir");
    return -1;
  }

  int found = 0;
  for (struct dirent *entry = readdir(dir_handle); entry != nullptr;
       entry = readdir(dir_handle)) {
    const char *entry_name = entry->d_name;
    const bool is_self_or_parent =
        (strcmp(entry_name, ".") == 0) || (strcmp(entry_name, "..") == 0);
    if (is_self_or_parent) {
      continue;
    }

    struct stat info;
    if (fstatat(dirfd(dir_handle), entry_name, &info, 0) < 0) {
      perror(entry_name);
      continue;
    }
    if (S_ISDIR(info.st_mode)) {
      ++found;
    }
  }

  closedir(dir_handle);
  return found;
}
} // namespace smi
} // namespace amd
+26 -19
Просмотреть файл
@@ -90,7 +90,6 @@ AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) {
static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kClkTypes) |
static_cast<AMDGpuPropertyOffsetType>(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes);
auto property_type_offset = (static_cast<AMDGpuPropertyOffsetType>(property_type_offset_mask) & (property_id));
auto property_type_id = (static_cast<AMDGpuPropertyOffsetType>(property_id) & ~(property_type_offset_mask));
return property_type_id;
@@ -167,6 +166,7 @@ const AMDGpuVerbList_t amdgpu_verb_check_list {
{ AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" }
};
const uint16_t kDevIDAll(0xFFFF);
const uint16_t kDevRevIDAll(0xFFFF);
const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
//
@@ -177,6 +177,14 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
// rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set;
//
// AMD All Families
{kDevIDAll, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes,
MonitorTypes::kMonFanCntrlEnable),
AMDGpuVerbTypes_t::kResetGpuFan,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
// AMD Instinct MI210
{0x740F, {0x02,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
@@ -240,12 +248,6 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list {
AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets,
AMDGpuPropertyOpModeTypes_t::kBoth, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes,
DevInfoTypes::kDevGpuReset),
AMDGpuVerbTypes_t::kResetGpu,
AMDGpuPropertyOpModeTypes_t::kSrIov, false }
},
{0x74A1, {kDevRevIDAll,
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes,
rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM),
@@ -351,7 +353,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT
// likely the reinforcement table does not contain any entries/rules for the
// dev_id in question.
//
auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) {
auto amdgpu_property_query_result_hdlr = [&](const rsmi_status_t query_result) {
switch (query_result) {
case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR):
case (rsmi_status_t::RSMI_STATUS_NO_DATA):
@@ -364,7 +366,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT
break;
default:
return rsmi_status_t::RSMI_STATUS_NOT_FOUND;
return actual_error_code;
break;
}
};
@@ -416,7 +418,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx
std::ostringstream osstream;
auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR);
AMDGpuPropertyQuery_t amdgpu_property_query = [&]() {
auto amdgpu_property_query = [&]() {
AMDGpuPropertyQuery_t amdgpu_property_query_init{};
amdgpu_property_query_init.m_asic_id = 0;
amdgpu_property_query_init.m_pci_rev_id = 0;
@@ -435,7 +437,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx
id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id);
}
}
is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? true : false;
is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS);
return tmp_amdgpu_query;
};
@@ -446,6 +448,18 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx
LOG_TRACE(osstream);
bool is_proper_query(false);
// Generic filter for checking properties for all asics and revisions.
auto amdgpu_property_query_all_asics = amdgpu_property_query;
amdgpu_property_query_all_asics.m_asic_id = kDevIDAll;
amdgpu_property_query_all_asics.m_pci_rev_id = kDevRevIDAll;
auto amdgpu_property_query_result = run_amdgpu_property_reinforcement_query(amdgpu_property_query_all_asics);
// We found a generic entry for all asics and revisions
if (amdgpu_property_query_result != rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR) {
return amdgpu_property_query_result;
}
// If no generic entry, then we query for specific asic and revision ids.
amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query);
if (!is_proper_query) {
rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA;
@@ -475,13 +489,6 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper
return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end());
};
auto ends_with = [](const std::string& value, const std::string& ending) {
if (value.size() < ending.size()) {
return false;
}
return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
};
// Traverse through all values for a given key
osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n";
LOG_TRACE(osstream);
@@ -495,7 +502,7 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper
osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n";
// Pci_rev_id matches the filter or ALL Revisions
if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) ||
(itr_begin->second.m_pci_rev_id == kDevRevIDAll)) {
(itr_begin->second.m_pci_rev_id == kDevRevIDAll)) {
osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n";
// Do we have the property we are looking for?
if (((amdgpu_property_query.m_property != 0) &&
+13 -9
Просмотреть файл
@@ -126,16 +126,20 @@ void TestFrequenciesRead::Run(void) {
} else if (err == AMDSMI_STATUS_NOT_YET_IMPLEMENTED) {
std::cout << "\t**Get " << name <<
": Not implemented on this machine" << std::endl;
// special driver issue, shouldn't normally occur
} else if (err == AMDSMI_STATUS_UNEXPECTED_DATA) {
std::cerr << "WARN: Clock file [" << FreqEnumToStr(t) << "] exists on device [" << i << "] but empty!" << std::endl;
std::cerr << " Likely a driver issue!" << std::endl;
} else {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Supported " << name << " clock frequencies: ";
std::cout << f.num_supported << std::endl;
print_frequencies(&f);
// Verify api support checking functionality is working
err = amdsmi_get_clk_freq(processor_handles_[i], t, nullptr);
ASSERT_EQ(err, AMDSMI_STATUS_INVAL);
}
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Supported " << name << " clock frequencies: ";
std::cout << f.num_supported << std::endl;
print_frequencies(&f);
// Verify api support checking functionality is working
err = amdsmi_get_clk_freq(processor_handles_[i], t, nullptr);
ASSERT_EQ(err, AMDSMI_STATUS_INVAL);
}
}
};
+24 -56
Просмотреть файл
@@ -104,8 +104,7 @@ void TestFrequenciesReadWrite::Run(void) {
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
PrintDeviceHeader(processor_handles_[dv_ind]);
for (uint32_t clk = (uint32_t)CLK_TYPE_FIRST;
clk <= CLK_TYPE__MAX; ++clk) {
for (uint32_t clk = CLK_TYPE_FIRST; clk <= CLK_TYPE__MAX; ++clk) {
amdsmi_clk = (amdsmi_clk_type_t)clk;
auto freq_read = [&]() -> bool {
@@ -121,14 +120,20 @@ void TestFrequenciesReadWrite::Run(void) {
std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) <<
": Not supported on this machine" << std::endl;
return false;
} else {
// CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "Initial frequency for clock " <<
FreqEnumToStr(amdsmi_clk) << " is " << f.current << std::endl;
}
return true;
}
// special driver issue, shouldn't normally occur
if (ret == AMDSMI_STATUS_UNEXPECTED_DATA) {
std::cerr << "WARN: Clock file [" << FreqEnumToStr(amdsmi_clk) << "] exists on device [" << dv_ind << "] but empty!" << std::endl;
std::cerr << " Likely a driver issue!" << std::endl;
}
// CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "Initial frequency for clock " <<
FreqEnumToStr(amdsmi_clk) << " is " << f.current << std::endl;
}
return true;
};
auto freq_write = [&]() {
@@ -152,19 +157,18 @@ void TestFrequenciesReadWrite::Run(void) {
std::endl;
}
ret = amdsmi_set_clk_freq(processor_handles_[dv_ind], amdsmi_clk, freq_bitmask);
//Certain ASICs does not allow to set particular clocks. If set function for a clock returns
//permission error despite root access, manually set ret value to success and return
if (ret == AMDSMI_STATUS_NO_PERM && geteuid() == 0) {
std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) <<
": Not supported on this machine. Skipping..." << std::endl;
ret = AMDSMI_STATUS_SUCCESS;
return;
} else if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
// Certain ASICs do not allow setting particular clocks. If the set function for a clock returns
// a permission error despite root access, manually set the ret value to success and return.
//
// Sometimes setting clock frequencies is completely not supported
if ((ret == AMDSMI_STATUS_NO_PERM && geteuid() == 0) ||
(ret == AMDSMI_STATUS_NOT_SUPPORTED)) {
std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) <<
": Not supported on this machine. Skipping..." << std::endl;
ret = AMDSMI_STATUS_SUCCESS;
return;
}
CHK_ERR_ASRT(ret)
ret = amdsmi_get_clk_freq(processor_handles_[dv_ind], amdsmi_clk, &f);
if (ret != AMDSMI_STATUS_SUCCESS) {
@@ -187,7 +191,9 @@ void TestFrequenciesReadWrite::Run(void) {
}
ret = amdsmi_set_gpu_perf_level(processor_handles_[dv_ind], AMDSMI_DEV_PERF_LEVEL_AUTO);
if (ret != AMDSMI_STATUS_SUCCESS) {
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**Setting performance level is not supported on this machine. Skipping..." << std::endl;
ret = AMDSMI_STATUS_SUCCESS;
return;
}
};
@@ -199,44 +205,6 @@ void TestFrequenciesReadWrite::Run(void) {
}
freq_write();
CHK_ERR_ASRT(ret)
#if 0
ret = amdsmi_get_clk_freq(dv_ind, amdsmi_clk, &f);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "Initial frequency for clock " << amdsmi_clk << " is " <<
f.current << std::endl;
}
// Set clocks to something other than the usual default of the lowest
// frequency.
freq_bitmask = 0b01100; // Try the 3rd and 4th clocks
std::string freq_bm_str =
std::bitset<AMDSMI_MAX_NUM_FREQUENCIES>(freq_bitmask).to_string();
freq_bm_str.erase(0, std::min(freq_bm_str.find_first_not_of('0'),
freq_bm_str.size()-1));
IF_VERB(STANDARD) {
std::cout << "Setting frequency mask for clock " << amdsmi_clk <<
" to 0b" << freq_bm_str << " ..." << std::endl;
}
ret = amdsmi_set_clk_freq(dv_ind, amdsmi_clk, freq_bitmask);
CHK_ERR_ASRT(ret)
ret = amdsmi_get_clk_freq(dv_ind, amdsmi_clk, &f);
CHK_ERR_ASRT(ret)
IF_VERB(STANDARD) {
std::cout << "Frequency is now index " << f.current << std::endl;
std::cout << "Resetting mask to all frequencies." << std::endl;
}
ret = amdsmi_set_clk_freq(dv_ind, amdsmi_clk, 0xFFFFFFFF);
CHK_ERR_ASRT(ret)
ret = amdsmi_set_gpu_perf_level(dv_ind, AMDSMI_DEV_PERF_LEVEL_AUTO);
CHK_ERR_ASRT(ret)
#endif
}
}
}
+2 -2
Просмотреть файл
@@ -177,8 +177,8 @@ void TestGpuMetricsRead::Run(void) {
<< std::to_string(smu.pcie_link_speed) << '\n';
std::cout << "gfx_activity_acc="
<< std::dec << smu.gfx_activity_acc << '\n';
std::cout << "mem_actvity_acc="
<< std::dec << smu.mem_actvity_acc << '\n';
std::cout << "mem_activity_acc="
<< std::dec << smu.mem_activity_acc << '\n';
for (int i = 0; i < AMDSMI_NUM_HBM_INSTANCES; ++i) {
std::cout << "temperature_hbm[" << i << "]=" << std::dec <<
+7
Просмотреть файл
@@ -112,6 +112,13 @@ void TestPowerCapReadWrite::Run(void) {
max = info.max_power_cap;
orig = info.default_power_cap;
// Check if power cap is within the range
// skip the test otherwise
if (orig < min || orig > max) {
std::cout << "Power cap is not within the range. Skipping test for " << dv_ind << std::endl;
continue;
}
new_cap = (max + min)/2;
IF_VERB(STANDARD) {
+4
Просмотреть файл
@@ -126,6 +126,10 @@ void TestPowerReadWrite::Run(void) {
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
std::cout << "The power profile presets settings is not supported. "
<< std::endl;
// Verify api support checking functionality is working
ret = amdsmi_get_gpu_power_profile_presets(processor_handles_[dv_ind], 0, nullptr);
ASSERT_EQ(ret, AMDSMI_STATUS_NOT_SUPPORTED);
continue;
}
CHK_ERR_ASRT(ret)
-3
Просмотреть файл
@@ -106,14 +106,12 @@ static void RunCustomTestProlog(TestBase *test) {
}
test->SetUp();
test->Run();
return;
}
// Second half of a custom test run: displays the test's results when the
// configured verbosity is at least VERBOSE_STANDARD, then closes the test.
static void RunCustomTestEpilog(TestBase *tst) {
  const bool show_results =
      sRSMIGlvalues->verbosity >= TestBase::VERBOSE_STANDARD;
  if (show_results) {
    tst->DisplayResults();
  }
  tst->Close();
}
// If the test case one big test, you should use RunGenericTest()
@@ -125,7 +123,6 @@ static void RunCustomTestEpilog(TestBase *tst) {
// Runs a test from start to finish: the prolog first, then the epilog.
static void RunGenericTest(TestBase *test) {
  RunCustomTestProlog(test);
  RunCustomTestEpilog(test);
}
+1 -4
Просмотреть файл
@@ -63,10 +63,7 @@ $BLACKLIST_ALL_ASICS\
# /sys/class/kfd/kfd/topology/nodes/*/properties
FILTER[90400]=\
$BLACKLIST_ALL_ASICS\
"rsmitstReadOnly.TestVoltCurvRead:"\
"rsmitstReadOnly.TestFrequenciesRead:"\
"rsmitstReadWrite.TestFrequenciesReadWrite:"\
"rsmitstReadWrite.TestPowerReadWrite"
"rsmitstReadOnly.TestVoltCurvRead"
FILTER[90401]=${FILTER[90400]}
FILTER[90402]=${FILTER[90400]}
+4 -7
Просмотреть файл
@@ -43,7 +43,7 @@
*
*/
#include <assert.h>
#include <cassert>
#include "amd_smi/amdsmi.h"
#include "test_base.h"
@@ -61,10 +61,9 @@ static const char kResultsLabel[] = "TEST RESULTS";
// This one is used outside this file
const char kSetupLabel[] = "TEST SETUP";
TestBase::TestBase() : setup_failed_(false), description_("") {
}
TestBase::~TestBase() {
TestBase::TestBase() : setup_failed_(false) {
}
TestBase::~TestBase() = default;
void TestBase::MakeHeaderStr(const char *inStr,
std::string *outStr) const {
@@ -155,8 +154,6 @@ void TestBase::SetUp(uint64_t init_flags) {
std::cout << "No AMD SMI tests can be run." << std::endl;
}
}
return;
}
void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) {
@@ -254,7 +251,7 @@ void TestBase::set_description(std::string d) {
size_t endlptr;
for (size_t i = le; i < description_.size(); i += le) {
endlptr = description_.find_last_of(" ", i);
endlptr = description_.find_last_of(' ', i);
description_.replace(endlptr, 1, "\n");
i = endlptr;
}
+2 -2
Просмотреть файл
@@ -45,6 +45,7 @@
#ifndef TESTS_AMD_SMI_TEST_TEST_BASE_H_
#define TESTS_AMD_SMI_TEST_TEST_BASE_H_
#include <cstdint>
#include <string>
#include <vector>
#include "amd_smi/amdsmi.h"
@@ -150,9 +151,8 @@ class TestBase {
"\t===> Abort is over-ridden due to dont_fail command line option." \
<< std::endl; \
return; \
} else { \
ASSERT_EQ(AMDSMI_STATUS_SUCCESS, (RET)); \
} \
ASSERT_EQ(AMDSMI_STATUS_SUCCESS, (RET)); \
}
void MakeHeaderStr(const char *inStr, std::string *outStr);
+3 -3
Просмотреть файл
@@ -43,13 +43,13 @@
*
*/
#include <assert.h>
#include <stdint.h>
#include <getopt.h>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>
#include <map>
#include <string>
#include "test_base.h"
#include "test_common.h"
+2 -2
Просмотреть файл
@@ -74,7 +74,7 @@ void DumpMonitorInfo(const TestBase *test);
#endif
#define DISPLAY_AMDSMI_ERR(RET) { \
if (RET != AMDSMI_STATUS_SUCCESS) { \
if ((RET) != AMDSMI_STATUS_SUCCESS) { \
const char *err_str; \
std::cout << "\t===> ERROR: AMDSMI call returned " << (RET) << std::endl; \
amdsmi_status_code_to_string((RET), &err_str); \
@@ -91,7 +91,7 @@ void DumpMonitorInfo(const TestBase *test);
} \
}
#define CHK_AMDSMI_PERM_ERR(RET) { \
if (RET == AMDSMI_STATUS_NO_PERM) { \
if ((RET) == AMDSMI_STATUS_NO_PERM) { \
std::cout << "This command requires root access." << std::endl; \
} else { \
DISPLAY_AMDSMI_ERR(RET) \