diff --git a/DEBIAN/postinst.in b/DEBIAN/postinst.in index eed81b8ba7..097bb34bed 100755 --- a/DEBIAN/postinst.in +++ b/DEBIAN/postinst.in @@ -1,21 +1,18 @@ #!/bin/bash -#set -x - -packageName="amd-smi-lib" -logPath=/var/log/amd_smi_lib -logName=AMD-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/amd_smi.conf - -do_addLogFolder() { - sudo mkdir -p "${logPath}" - sudo touch "${logFile}" - sudo chmod -R a+rw "${logPath}" - sudo chmod a+rw "${logFile}" -} do_configureLogrotate() { - logrotate --version &>/dev/null + local IS_SYSTEMD=0 + local packageName="amd-smi-lib" + local logPath=/var/log/amd_smi_lib + local logFile="${logPath}/AMD-SMI-lib.log" + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + + mkdir -p "${logPath}" + touch "${logFile}" + chmod -R a+rw "${logPath}" + chmod a+rw "${logFile}" + + command -v logrotate &>/dev/null if [ $? -ne 0 ]; then echo "[WARNING] Detected logrotate is not installed."\ "$packageName logs (when turned on) will not rotate properly." @@ -23,14 +20,14 @@ do_configureLogrotate() { fi if [ ! -f $logrotateConfFile ]; then - sudo touch "${logrotateConfFile}" - sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read + touch "${logrotateConfFile}" + chmod 644 "${logrotateConfFile}" # root r/w, all others read # AMD SMI logging rotation, rotates files using root user/group # Hourly logrotation check # Only rotates if size grew larger than 1MB # Max of 4 rotation files, oldest will be removed # Rotated files use date extention of ex. 
AMD-SMI-lib.log.2023-05-09_16:51:42 - cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null + cat << EOF > "${logrotateConfFile}" ${logFile} { su root root hourly @@ -47,44 +44,29 @@ EOF # issue was RPM build thought we were using macros # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 # https://rpm-software-management.github.io/rpm/manual/spec.html - sudo sed -i s/%%/%/g "${logrotateConfFile}" + sed -i s/%%/%/g "${logrotateConfFile}" # workaround: remove extra 'OURCE' text # from amd_smi.conf. Unsure if CMAKE, # bash, or here document # issue (only seen on RHEL 8.7) - sudo sed -i s/OURCE//g "${logrotateConfFile}" + sed -i s/OURCE//g "${logrotateConfFile}" fi # check if logrotate uses system timers, Ubuntu/modern OS's do # Several older OS's like RHEL 8.7, do not. Instead defaults # to use daily cron jobs - see https://stackoverflow.com/a/69465677 - sudo systemctl list-timers|grep -iq logrotate - if [ $? -ne 0 ]; then - # confirm logrotate file exists in daily - if [ -f /etc/cron.daily/logrotate ]; then - # move logrotate daily to hourly - if [ -d /etc/cron.hourly ]; then - sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate - else - echo "[WARNING] Could find and configure hourly cron for $packageName's"\ - " logrotate. $packageName logs (when turned on) will not rotate properly." - return - fi - else - # confirm that it's already been moved to hourly - sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly - if [ $? -ne 0 ]; then - echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\ - "$packageName logs (when turned on) may not rotate properly." - fi + if [ -d /run/systemd/system ]; then + systemctl list-timers | grep -iq logrotate + if [ $? 
-eq 0 ]; then + IS_SYSTEMD=1 fi - return #done configuring for non-systemd timers - else + fi + if [ "$IS_SYSTEMD" -eq 1 ]; then # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup fi - cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null + cat << EOF > /lib/systemd/system/logrotate.timer [Unit] Description=Hourly rotation of log files Documentation=man:logrotate(8) man:logrotate.conf(5) @@ -98,12 +80,19 @@ Persistent=true [Install] WantedBy=timers.target EOF - sudo systemctl reenable --now logrotate.timer + systemctl reenable --now logrotate.timer else echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." 
fi - return #done configuring for systemd timers + else + # $IS_SYSTEMD -eq 0 + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -d /etc/cron.hourly ]; then + mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + fi + fi fi } @@ -118,8 +107,7 @@ do_ldconfig() { case "$1" in ( configure ) do_ldconfig - do_addLogFolder - do_configureLogrotate + do_configureLogrotate || return 0 ;; ( abort-upgrade | abort-remove | abort-deconfigure ) echo "$1" diff --git a/DEBIAN/prerm.in b/DEBIAN/prerm.in index 307ce4146d..e0aae8f396 100755 --- a/DEBIAN/prerm.in +++ b/DEBIAN/prerm.in @@ -1,29 +1,4 @@ #!/bin/bash -set -e - -packageName="amd-smi-lib" -logPath=/var/log/amd_smi_lib -logName=AMD-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/amd_smi.conf - -rm_logFolder() { - sudo rm -rf "$logPath" -} - -return_logrotateToOrigConfig() { - if [ -f $logrotateConfFile ]; then - sudo rm -rf "${logrotateConfFile}" - fi - if [ -f /etc/cron.hourly/logrotate ]; then - sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate - fi - if [ -f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer - sudo rm -rf /lib/systemd/system/logrotate.timer.backup - sudo systemctl reenable --now logrotate.timer - fi -} rm_ldconfig() { # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build @@ -39,6 +14,25 @@ rm_pyc() { rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__ } +rm_logFolder() { + rm -rf /var/log/amd_smi_lib +} + +return_logrotateToOrigConfig() { + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + if [ -f $logrotateConfFile ]; then + rm -rf "$logrotateConfFile" + fi + if [ -f /etc/cron.hourly/logrotate ]; then + mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer.backup 
/lib/systemd/system/logrotate.timer + rm -rf /lib/systemd/system/logrotate.timer.backup + systemctl reenable --now logrotate.timer + fi +} + case "$1" in ( remove | upgrade) diff --git a/RPM/post.in b/RPM/post.in index 1e10be1849..454ad2909f 100755 --- a/RPM/post.in +++ b/RPM/post.in @@ -1,21 +1,18 @@ #!/bin/bash -#set -x - -packageName="amd-smi-lib" -logPath=/var/log/amd_smi_lib -logName=AMD-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/amd_smi.conf - -do_addLogFolder() { - sudo mkdir -p "${logPath}" - sudo touch "${logFile}" - sudo chmod -R a+rw "${logPath}" - sudo chmod a+rw "${logFile}" -} do_configureLogrotate() { - logrotate --version &>/dev/null + local IS_SYSTEMD=0 + local packageName="amd-smi-lib" + local logPath=/var/log/amd_smi_lib + local logFile="${logPath}/AMD-SMI-lib.log" + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + + mkdir -p "${logPath}" + touch "${logFile}" + chmod -R a+rw "${logPath}" + chmod a+rw "${logFile}" + + command -v logrotate &>/dev/null if [ $? -ne 0 ]; then echo "[WARNING] Detected logrotate is not installed."\ "$packageName logs (when turned on) will not rotate properly." @@ -23,14 +20,14 @@ do_configureLogrotate() { fi if [ ! -f $logrotateConfFile ]; then - sudo touch "${logrotateConfFile}" - sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read + touch "${logrotateConfFile}" + chmod 644 "${logrotateConfFile}" # root r/w, all others read # AMD SMI logging rotation, rotates files using root user/group # Hourly logrotation check # Only rotates if size grew larger than 1MB # Max of 4 rotation files, oldest will be removed # Rotated files use date extention of ex. 
AMD-SMI-lib.log.2023-05-09_16:51:42 - cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null + cat << EOF > "${logrotateConfFile}" ${logFile} { su root root hourly @@ -47,44 +44,29 @@ EOF # issue was RPM build thought we were using macros # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 # https://rpm-software-management.github.io/rpm/manual/spec.html - sudo sed -i s/%%/%/g "${logrotateConfFile}" + sed -i s/%%/%/g "${logrotateConfFile}" # workaround: remove extra 'OURCE' text # from amd_smi.conf. Unsure if CMAKE, # bash, or here document # issue (only seen on RHEL 8.7) - sudo sed -i s/OURCE//g "${logrotateConfFile}" + sed -i s/OURCE//g "${logrotateConfFile}" fi # check if logrotate uses system timers, Ubuntu/modern OS's do # Several older OS's like RHEL 8.7, do not. Instead defaults # to use daily cron jobs - see https://stackoverflow.com/a/69465677 - sudo systemctl list-timers|grep -iq logrotate - if [ $? -ne 0 ]; then - # confirm logrotate file exists in daily - if [ -f /etc/cron.daily/logrotate ]; then - # move logrotate daily to hourly - if [ -d /etc/cron.hourly ]; then - sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate - else - echo "[WARNING] Could find and configure hourly cron for $packageName's"\ - " logrotate. $packageName logs (when turned on) will not rotate properly." - return - fi - else - # confirm that it's already been moved to hourly - sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly - if [ $? -ne 0 ]; then - echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\ - "$packageName logs (when turned on) may not rotate properly." - fi + if [ -d /run/systemd/system ]; then + systemctl list-timers | grep -iq logrotate + if [ $? 
-eq 0 ]; then + IS_SYSTEMD=1 fi - return #done configuring for non-systemd timers - else + fi + if [ "$IS_SYSTEMD" -eq 1 ]; then # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup fi - cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null + cat << EOF > /lib/systemd/system/logrotate.timer [Unit] Description=Hourly rotation of log files Documentation=man:logrotate(8) man:logrotate.conf(5) @@ -98,12 +80,19 @@ Persistent=true [Install] WantedBy=timers.target EOF - sudo systemctl reenable --now logrotate.timer + systemctl reenable --now logrotate.timer else echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." 
fi - return #done configuring for systemd timers + else + # $IS_SYSTEMD -eq 0 + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -d /etc/cron.hourly ]; then + mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + fi + fi fi } @@ -115,14 +104,8 @@ do_ldconfig() { fi } -# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build -if [ "@ENABLE_LDCONFIG@" == "ON" ]; then - echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf - ldconfig -fi - # post install or upgrade, $i is 1 or 2 -> do these actions -if [ $1 -ge 1 ]; then - do_addLogFolder - do_configureLogrotate +if [ "$1" -ge 1 ]; then + do_ldconfig + do_configureLogrotate || return 0 fi diff --git a/RPM/postun.in b/RPM/postun.in index 7b71daf8eb..143ce178e2 100755 --- a/RPM/postun.in +++ b/RPM/postun.in @@ -1,7 +1,7 @@ #!/bin/bash # second term originates from ENABLE_LDCONFIG = ON/OFF at package build -if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then +if [ "$1" -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations rm -f /etc/ld.so.conf.d/x86_64-libamd_smi_lib.conf ldconfig diff --git a/RPM/preun.in b/RPM/preun.in index d813219a08..2b8654046e 100755 --- a/RPM/preun.in +++ b/RPM/preun.in @@ -1,31 +1,4 @@ #!/bin/bash -#set -x - -set -e - -packageName="amd-smi-lib" -logPath=/var/log/amd_smi_lib -logName=AMD-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/amd_smi.conf - -rm_logFolder() { - sudo rm -rf "$logPath" -} - -return_logrotateToOrigConfig() { - if [ -f $logrotateConfFile ]; then - sudo rm -rf "${logrotateConfFile}" - fi - if [ -f /etc/cron.hourly/logrotate ]; then - sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate - fi - if [ -f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer - sudo rm 
-rf /lib/systemd/system/logrotate.timer.backup - sudo systemctl reenable --now logrotate.timer - fi -} rm_pyc() { # remove pyc files generated by python @@ -33,7 +6,26 @@ rm_pyc() { rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@SHARE_INSTALL_PREFIX@/amdsmi/__pycache__ } -if [ $1 -le 1 ]; then +rm_logFolder() { + rm -rf /var/log/amd_smi_lib +} + +return_logrotateToOrigConfig() { + local logrotateConfFile=/etc/logrotate.d/amd_smi.conf + if [ -f $logrotateConfFile ]; then + rm -rf "$logrotateConfFile" + fi + if [ -f /etc/cron.hourly/logrotate ]; then + mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + rm -rf /lib/systemd/system/logrotate.timer.backup + systemctl reenable --now logrotate.timer + fi +} + +if [ "$1" -le 1 ]; then # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations rm_pyc rm_logFolder diff --git a/docs/.gitignore b/docs/.gitignore index b8ea6fcbcd..b84233aed8 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -7,3 +7,5 @@ /_templates/ /html/ /latex/ +404.md +data/AMD-404.png diff --git a/docs/doxygen/.gitignore b/docs/doxygen/.gitignore new file mode 100644 index 0000000000..5ebfac1dea --- /dev/null +++ b/docs/doxygen/.gitignore @@ -0,0 +1 @@ +docBin/ diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 9d1aea0d90..8db5c7c810 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -844,7 +844,7 @@ int main() { } CHK_RSMI_NOT_SUPPORTED_RET(ret) - std::cout << "\t**Averge Power Usage: "; + std::cout << "\t**Average Power Usage: "; ret = rsmi_dev_power_ave_get(i, 0, &val_ui64); if (ret == RSMI_STATUS_SUCCESS) { std::cout << static_cast(val_ui64)/1000 << " W" << std::endl; diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 37d1c6d013..0d9e3d7665 100755 --- 
a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -480,6 +480,19 @@ typedef enum { RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_temperature_type_t; +/** + * @brief Activity (Utilization) Metrics. This enum is used to identify + * various activity metrics. + * + */ +typedef enum { + /* Utilization */ + RSMI_ACTIVITY_GFX = (0x1 << 0), + RSMI_ACTIVITY_UMC = (0x1 << 1), //!< memory controller + RSMI_ACTIVITY_MM = (0x1 << 2) //!< UVD or VCN +} rsmi_activity_metric_t; + + /** * @brief Voltage Metrics. This enum is used to identify various * Volatge metrics. Corresponding values will be in millivolt. @@ -788,6 +801,17 @@ typedef struct { typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth; /// \endcond +/** + * @brief This structure holds information about the possible activity + * averages. Specifically, the utilization counters. + */ +typedef struct { + /* Utilization */ + uint16_t average_gfx_activity; + uint16_t average_umc_activity; //!< memory controller + uint16_t average_mm_activity; //!< UVD or VCN +} rsmi_activity_metric_counter_t; + /** * @brief This structure holds version information. */ @@ -898,14 +922,28 @@ struct metrics_table_header_t { #define RSMI_GPU_METRICS_API_FORMAT_VER 1 // The content version increments when gpu_metrics is extended with new and/or // existing field sizes are changed. 
+ +/** + * @brief The GPU metrics version 1 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_1 1 +/** + * @brief The GPU metrics version 2 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_2 2 +/** + * @brief The GPU metrics version 3 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_3 3 -// This should match NUM_HBM_INSTANCES +/** + * @brief This should match NUM_HBM_INSTANCES + */ #define RSMI_NUM_HBM_INSTANCES 4 -// Unit conversion factor for HBM temperatures +/** + * @brief Unit conversion factor for HBM temperatures + */ #define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000 typedef struct { @@ -964,7 +1002,7 @@ typedef struct { uint16_t padding; // new in v1 uint32_t gfx_activity_acc; // new in v1 - uint32_t mem_actvity_acc; // new in v1 + uint32_t mem_activity_acc; // new in v1 uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1 /// \endcond } rsmi_gpu_metrics_t; @@ -2288,7 +2326,7 @@ rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent); * If the function reutrns RSMI_STATUS_SUCCESS, the counter will be set in the value field of * the rsmi_utilization_counter_t. * - * @param[in] count The size of @utilization_counters array. + * @param[in] count The size of utilization_counters array. * * @param[inout] timestamp The timestamp when the counter is retreived. Resolution: 1 ns. * @retval ::RSMI_STATUS_SUCCESS call was successful @@ -2303,6 +2341,57 @@ rsmi_utilization_count_get(uint32_t dv_ind, uint32_t count, uint64_t *timestamp); +/** + * @brief Get activity metric average utilization counter of the specified device + * + * @details Given a device index @p dv_ind, the activity metric type, + * this function returns the requested utilization counters + * + * @param[in] dv_ind a device index + * + * @param[in] activity_metric_type a metric type + * + * @param[inout] activity_metric_counter Multiple utilization counters can be retrieved with a single + * call. The caller must allocate enough space to the rsmi_activity_metric_counter_t structure. 
+ * + * If the function returns RSMI_STATUS_SUCCESS, the counter for the requested type + * will be set in the corresponding field of + * the rsmi_activity_metric_counter_t. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t +rsmi_dev_activity_metric_get(uint32_t dv_ind, + rsmi_activity_metric_t activity_metric_type, + rsmi_activity_metric_counter_t* activity_metric_counter); + +/** + * @brief Get activity metric bandwidth average utilization counter of the specified device + * + * @details Given a device index @p dv_ind, + * this function returns the requested utilization counter + * + * @param[in] dv_ind a device index + * + * @param[inout] avg_activity a pointer to memory provided by the caller to which the average bandwidth utilization counter will be written + * + * If the function returns RSMI_STATUS_SUCCESS, the counter + * will be written to the memory + * pointed to by @p avg_activity. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t +rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity); + /** * @brief Get the performance level of the device with provided * device index. 
@@ -2450,7 +2539,7 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * */ -rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind); +rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind); /** * @brief This function retrieves the voltage/frequency curve information @@ -2684,7 +2773,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind, * */ rsmi_status_t -rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); +rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); /** * @brief Set the PowerPlay performance level associated with the device with @@ -2750,7 +2839,7 @@ rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); * @retval ::RSMI_STATUS_PERMISSION function requires root access * */ -rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od); +rsmi_status_t rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od); /** * @brief Set the overdrive percent associated with the device with provided @@ -3398,7 +3487,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, * @brief Get the info of a process on a specific device. * * @details Given a process id @p pid, a @p dv_ind, this function will - * write the process information for @p pid on the device, if available, to + * write the process information for pid on the device, if available, to * the memory pointed to by @p proc. * * @param[in] pid The process id of the process for which the gpu @@ -3406,7 +3495,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, * * @param[in] dv_ind a device index where the process running on. * - * @param[inout] procs a pointer to memory provided by the caller to which + * @param[inout] proc a pointer to memory provided by the caller to which * process information will be written. 
* * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call @@ -3598,7 +3687,7 @@ rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst, * * @details Given a source device index @p dv_ind_src and * a destination device index @p dv_ind_dst, and a pointer to a - * bool @accessible, this function will write the P2P connection status + * bool @p accessible, this function will write the P2P connection status * between the device @p dv_ind_src and @p dv_ind_dst to the memory * pointed to by @p accessible. * diff --git a/rocm_smi/include/rocm_smi/rocm_smi_common.h b/rocm_smi/include/rocm_smi/rocm_smi_common.h index bff8c8edc5..f29e427789 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_common.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_common.h @@ -90,7 +90,7 @@ /* This group of macros is used to facilitate checking of support for rsmi_dev* * "getter" functions. When the return buffer is set to nullptr, the macro will * check the previously gathered device support data to see if the function, - * with possible variants (e.g., memory types, firware types,...) and + * with possible variants (e.g., memory types, firmware types,...) and * subvariants (e.g. monitors/sensors) are supported. 
*/ // This macro assumes dev already available diff --git a/rocm_smi/include/rocm_smi/rocm_smi_kfd.h b/rocm_smi/include/rocm_smi/rocm_smi_kfd.h index 9cf8fd8e40..90c7f6ff3b 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_kfd.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_kfd.h @@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_count); int ReadKFDDeviceProperties(uint32_t dev_id, std::vector *retVec); +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val); +int get_gpu_id(uint32_t node, uint64_t *gpu_id); + } // namespace smi } // namespace amd diff --git a/rocm_smi/include/rocm_smi/rocm_smi_main.h b/rocm_smi/include/rocm_smi/rocm_smi_main.h index f276bd85bb..8b60324988 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_main.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_main.h @@ -113,7 +113,8 @@ class RocmSMI { uint64_t *weight); int get_node_index(uint32_t dv_ind, uint32_t *node_ind); const RocmSMI_env_vars& getEnv(void); - void printEnvVarInfo(void); + std::string getRSMIEnvVarInfo(void); + void debugRSMIEnvVarInfo(); bool isLoggingOn(void); uint32_t getLogSetting(void); static const std::map devInfoTypesStrings; diff --git a/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/rocm_smi/include/rocm_smi/rocm_smi_utils.h index 04e49b1d41..f66eedf314 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -99,7 +99,8 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type, rsmi_status_t ErrnoToRsmiStatus(int err); std::string getRSMIStatusString(rsmi_status_t ret); std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void); void logSystemDetails(void); rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); @@ -107,6 +108,9 @@ rsmi_status_t getBDFWithDomain(uint64_t bdf_id, std::string& bfd_str); void logHexDump(const char *desc, const void *addr, const size_t len, size_t perLine); bool isSystemBigEndian(); 
+std::string getBuildType(); +std::string getMyLibPath(); +int subDirectoryCountInPath(const std::string path); template std::string print_int_as_hex(T i, bool showHexNotation=true) { std::stringstream ss; diff --git a/rocm_smi/python_smi_tools/rocm_smi.py b/rocm_smi/python_smi_tools/rocm_smi.py index 4771a29f8f..2a0a4655d7 100755 --- a/rocm_smi/python_smi_tools/rocm_smi.py +++ b/rocm_smi/python_smi_tools/rocm_smi.py @@ -173,10 +173,12 @@ def formatMatrixToJSON(deviceList, matrix, metricName): printSysLog(metricName.format(deviceList[row_indx], deviceList[col_ind]), valueStr) -def getBus(device): +def getBus(device, silent=False): """ Return the bus identifier of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ bdfid = c_uint64(0) ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid)) @@ -188,16 +190,18 @@ def getBus(device): function = bdfid.value & 0x7 pic_id = '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, device, function) - if rsmi_ret_ok(ret, device, 'get_pci_id'): + if rsmi_ret_ok(ret, device, 'get_pci_id', silent): return pic_id -def getFanSpeed(device): +def getFanSpeed(device, silent=True): """ Return a tuple with the fan speed (value,%) for a specified device, or (None,None) if either current fan speed or max fan speed cannot be obtained @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. 
""" fanLevel = c_int64() fanMax = c_int64() @@ -209,7 +213,7 @@ def getFanSpeed(device): /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX """ ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel)) - if rsmi_ret_ok(ret, device, 'get_fan_speed', True): + if rsmi_ret_ok(ret, device, 'get_fan_speed', silent): fl = fanLevel.value last_ret = ret @@ -217,7 +221,7 @@ def getFanSpeed(device): /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX """ ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax)) - if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True): + if rsmi_ret_ok(ret, device, 'get_fan_max_speed', silent): fm = fanMax.value """ In case we had an error before, we don't overwrite it with a @@ -232,59 +236,67 @@ def getFanSpeed(device): return (last_ret, fl, round((float(fl) / float(fm)) * 100, 2)) -def getGpuUse(device): +def getGpuUse(device, silent=False): """ Return the current GPU usage as a percentage @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ percent = c_uint32() ret = rocmsmi.rsmi_dev_busy_percent_get(device, byref(percent)) - if rsmi_ret_ok(ret, device, 'GPU Utilization '): + if rsmi_ret_ok(ret, device, 'GPU Utilization ', silent): return percent.value return -1 -def getId(device): +def getId(device, silent=False): """ Return the hexadecimal value of a device's ID @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ dv_id = c_short() ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id)) - if rsmi_ret_ok(ret, device, 'get_device_id'): + if rsmi_ret_ok(ret, device, 'get_device_id', silent): return hex(dv_id.value) -def getRev(device): +def getRev(device, silent=False): """ Return the hexadecimal value of a device's Revision @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). 
Default is off. """ dv_rev = c_short() ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev)) - if rsmi_ret_ok(ret, device, 'get_device_rev'): + if rsmi_ret_ok(ret, device, 'get_device_rev', silent): return hex(dv_rev.value) -def getMaxPower(device): +def getMaxPower(device, silent=False): """ Return the maximum power cap of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ power_cap = c_uint64() ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap)) - if rsmi_ret_ok(ret, device, 'get_power_cap'): + if rsmi_ret_ok(ret, device, 'get_power_cap', silent): return power_cap.value / 1000000 return -1 -def getMemInfo(device, memType, quiet=False): +def getMemInfo(device, memType, silent=False): """ Returns a tuple of (memory_used, memory_total) of the requested memory type usage for the device specified @param device: DRM device identifier @param type: [vram|vis_vram|gtt] Memory type to return - @param quiet=Turn on to silience error output + @param silent=Turn on to silence error output (you plan to handle manually). Default is off, which exposes any issue accessing the different memory types. 
@@ -300,11 +312,11 @@ def getMemInfo(device, memType, quiet=False): memTotal = None ret = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(memoryUse)) - if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), quiet): + if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), silent): memUsed = memoryUse.value ret = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(memoryTot)) - if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), quiet): + if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), silent): memTotal = memoryTot.value return (memUsed, memTotal) @@ -334,14 +346,16 @@ def getProcessName(pid): return pName -def getPerfLevel(device): +def getPerfLevel(device, silent=False): """ Return the current performance level of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ perf = rsmi_dev_perf_level_t() ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf)) - if rsmi_ret_ok(ret, device, 'get_perf_level'): + if rsmi_ret_ok(ret, device, 'get_perf_level', silent): return perf_level_string(perf.value) return 'N/A' @@ -369,42 +383,48 @@ def getPidList(): return -def getPower(device): +def getPower(device, silent=False): """ Return the current power level of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. 
""" power = c_uint32() ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power)) - if rsmi_ret_ok(ret, device, 'get_power_avg'): + if rsmi_ret_ok(ret, device, 'get_power_avg', silent): return power.value / 1000000 return 'N/A' -def getRasEnablement(device, block): +def getRasEnablement(device, block, silent=True): """ Return RAS enablement state for a given device @param device: DRM device identifier @param block: RAS block identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ state = rsmi_ras_err_state_t() ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state)) - if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True): + if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), silent): return rsmi_ras_err_stale_machine[state.value].upper() return 'N/A' -def getTemp(device, sensor): +def getTemp(device, sensor, silent=True): """ Display the current temperature from a given device's sensor @param device: DRM device identifier @param sensor: Temperature sensor identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ temp = c_int64(0) metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(temp)) - if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), True): + if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), silent): return temp.value / 1000 return 'N/A' @@ -428,52 +448,60 @@ def findFirstAvailableTemp(device): continue return (ret_temp_type, ret_temp) -def getVbiosVersion(device): +def getVbiosVersion(device, silent=False): """ Returns the VBIOS version for a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. 
""" vbios = create_string_buffer(256) ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256) if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: return "Unsupported" - elif rsmi_ret_ok(ret, device): + elif rsmi_ret_ok(ret, device, silent=silent): return vbios.value.decode() -def getVersion(deviceList, component): +def getVersion(deviceList, component, silent=False): """ Return the software version for the specified component @param deviceList: List of DRM devices (can be a single-item list) @param component: Component (currently only driver) + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ ver_str = create_string_buffer(256) ret = rocmsmi.rsmi_version_str_get(component, ver_str, 256) - if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component)): + if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component), silent): return ver_str.value.decode() return None -def getComputePartition(device): +def getComputePartition(device, silent=True): """ Return the current compute partition of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ currentComputePartition = create_string_buffer(256) ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256) - if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode(): + if rsmi_ret_ok(ret, device, 'get_compute_partition', silent) and currentComputePartition.value.decode(): return str(currentComputePartition.value.decode()) return "N/A" -def getMemoryPartition(device): +def getMemoryPartition(device, silent=True): """ Return the current memory partition of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. 
""" currentNPSMode = create_string_buffer(256) ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256) - if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode(): + if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent) and currentNPSMode.value.decode(): return str(currentNPSMode.value.decode()) return "N/A" @@ -610,10 +638,21 @@ def printLog(device, metricName, value=None, extraSpace=False, useItalics=False) lock.acquire() if useItalics: logstr = italics + logstr + end - if extraSpace: - print('\n' + logstr + '\n', end='', flush=True) - else: - print(logstr + '\n', end='', flush=True) + try: + if extraSpace: + print('\n', end='') + print(logstr + '\n', end='') + sys.stdout.flush() + # when piped into programs like 'head' - print throws an error. + # silently ignore instead + except(BrokenPipeError, IOError): + # https://docs.python.org/3/library/signal.html#note-on-sigpipe + # Python flushes standard streams on exit; redirect remaining output + # to devnull to avoid another BrokenPipeError at shutdown + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + sys.exit(1) # Python exits with error code 1 on EPIPE + lock.release() @@ -785,12 +824,10 @@ def resetFans(deviceList): for device in deviceList: sensor_ind = c_uint32(0) ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind) - if (ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED) or (ret == rsmi_status_t.RSMI_STATUS_PERMISSION): - if not rsmi_ret_ok(rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED, device, 'reset_fan'): - continue + if rsmi_ret_ok(ret, device, silent=True): + printLog(device, 'Successfully reset fan speed to driver control', None) else: - if rsmi_ret_ok(ret, device, 'reset_fan'): - printLog(device, 'Successfully reset fan speed to driver control', None) + printLog(device, 'Not supported on the given system', None) printLogSpacer() @@ -1311,8 +1348,10 @@ def setFanSpeed(deviceList, fan): else: fanLevel = int(str(fan)) ret = 
rocmsmi.rsmi_dev_fan_speed_set(device, 0, int(fanLevel)) - if rsmi_ret_ok(ret, device, 'set_fan_speed'): + if rsmi_ret_ok(ret, device, silent=True): printLog(device, 'Successfully set fan speed to level %s' % (str(int(fanLevel))), None) + else: + printLog(device, 'Not supported on the given system', None) printLogSpacer() @@ -1595,10 +1634,13 @@ def showAllConcise(deviceList): MAX_ALL_CONCISE_WIDTH = 100 appWidth_temp = appWidth appWidth = MAX_ALL_CONCISE_WIDTH + silent = True printLogSpacer(' Concise Info ') deviceList.sort() - (temp_type, _) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) available_temp_type = temp_type.lower() available_temp_type = available_temp_type.replace('(', '') available_temp_type = available_temp_type.replace(')', '') @@ -1620,9 +1662,9 @@ def showAllConcise(deviceList): values = {} degree_sign = u'\N{DEGREE SIGN}' for device in deviceList: - gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info = getDevProductInfo(device, silent) gpu_dev_product_info_names = list(gpu_dev_product_info[device]) - temp_val = str(getTemp(device, available_temp_type)) + temp_val = str(getTemp(device, available_temp_type, silent)) if temp_val != 'N/A': temp_val += degree_sign + 'C' avgPwr = str(getPower(device)) @@ -1630,26 +1672,25 @@ def showAllConcise(deviceList): avgPwr += 'W' else: avgPwr = 'N/A' - combined_partition = (getMemoryPartition(device) + ", " - + getComputePartition(device)) - concise = True - sclk = showCurrentClocks([device], 'sclk', concise) - mclk = showCurrentClocks([device], 'mclk', concise) - (retCode, fanLevel, fanSpeed) = getFanSpeed(device) + combined_partition = (getMemoryPartition(device, silent) + ", " + + getComputePartition(device, silent)) + sclk = showCurrentClocks([device], 'sclk', concise=silent) + mclk = showCurrentClocks([device], 'mclk', concise=silent) + (retCode, fanLevel, fanSpeed) = 
getFanSpeed(device, silent) fan = str(fanSpeed) + '%' - if getPerfLevel(device) != -1: - perf = getPerfLevel(device) + if getPerfLevel(device, silent) != -1: + perf = getPerfLevel(device, silent) else: perf = 'Unsupported' - if getMaxPower(device) != -1: - pwrCap = str(getMaxPower(device)) + 'W' + if getMaxPower(device, silent) != -1: + pwrCap = str(getMaxPower(device, silent)) + 'W' else: pwrCap = 'Unsupported' - if getGpuUse(device) != -1: - gpu_busy = str(getGpuUse(device)) + '%' + if getGpuUse(device, silent) != -1: + gpu_busy = str(getGpuUse(device, silent)) + '%' else: gpu_busy = 'Unsupported' - vram_used, vram_total = getMemInfo(device, 'vram', True) + vram_used, vram_total = getMemInfo(device, 'vram', silent) mem_use_pct = 0 if vram_used is None: mem_use_pct='Unsupported' @@ -1683,7 +1724,7 @@ def showAllConcise(deviceList): for device in deviceList: printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), values['card%s' % (str(device))])), None) - gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info = getDevProductInfo(device, silent) gpu_dev_product_info_names = list(gpu_dev_product_info[device]) if (len(gpu_dev_product_info_names) > 1): printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in @@ -1707,19 +1748,20 @@ def showAllConciseHw(deviceList): header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS'] head_widths = [len(head) + 2 for head in header] values = {} + silent = True for device in deviceList: - gpuid = getId(device) + gpuid = getId(device, silent) if str(gpuid).startswith('0x'): gpuid = str(gpuid)[2:] - gpurev = getRev(device) + gpurev = getRev(device, silent) if str(gpurev).startswith('0x'): gpurev = str(gpurev)[2:] - gfxRas = getRasEnablement(device, 'GFX') - sdmaRas = getRasEnablement(device, 'SDMA') - umcRas = getRasEnablement(device, 'UMC') - vbios = getVbiosVersion(device) - bus = getBus(device) + gfxRas = getRasEnablement(device, 
'GFX', silent) + sdmaRas = getRasEnablement(device, 'SDMA', silent) + umcRas = getRasEnablement(device, 'UMC', silent) + vbios = getVbiosVersion(device, silent) + bus = getBus(device, silent) values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus] val_widths = {} for device in deviceList: @@ -1760,15 +1802,19 @@ def showClocks(deviceList): for clk_type in sorted(rsmi_clk_names_dict): if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1: ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) - if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True): - printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None) - for x in range(freq.num_supported): - fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000) - if x == freq.current: - printLog(device, str(x), str(fr) + ' *') - else: - printLog(device, str(x), str(fr)) - printLog(device, '', None) + if ret == rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: + printLog(device, 'Clock [%s] on device [%s] exists but EMPTY! Likely driver error!' % (clk_type, str(device))) + continue + if not rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True): + continue + printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None) + for x in range(freq.num_supported): + fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000) + if x == freq.current: + printLog(device, str(x), str(fr) + ' *') + else: + printLog(device, str(x), str(fr)) + printLog(device, '', None) else: logging.debug('{} frequency is unsupported on device[{}]'.format(clk_type, device)) printLog(device, '', None) @@ -1814,8 +1860,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): if concise: # in case function is used for concise output, no need to print. 
return '{:.0f}Mhz'.format(fr) printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(levl, fr)) - else: - printErrLog(device, '%s clock is unsupported' % (clk_defined)) + elif not concise: + logging.debug('{} clock is unsupported on device[{}]'.format(clk_defined, device)) else: # if clk is not defined, will display all current clk for clk_type in sorted(rsmi_clk_names_dict): @@ -1832,7 +1878,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): printLog(device, '%s clock level:' % (clk_type), levl) else: printLog(device, '%s clock level: %s' % (clk_type, levl), '(%sMhz)' % (str(fr)[:-2])) - else: + elif not concise: logging.debug('{} clock is unsupported on device[{}]'.format(clk_type, device)) # pcie clocks if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1: @@ -1845,9 +1891,10 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000, bw.lanes[current_f]) printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) - else: - logging.debug('PCIe clock is unsupported on device[{}]'.format(device)) - printLogSpacer() + elif not concise: + logging.debug('{} clock is unsupported on device[{}]'.format('PCIe', device)) + if not concise: + printLogSpacer() def showCurrentFans(deviceList): @@ -2113,6 +2160,7 @@ def showMemUse(deviceList): @param deviceList: List of DRM devices (can be a single-item list) """ memoryUse = c_uint64() + avgMemBandwidth = c_uint16() printLogSpacer(' Current Memory Use ') for device in deviceList: ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse)) @@ -2124,6 +2172,12 @@ def showMemUse(deviceList): printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val) else: printLog(device, 'Memory Activity', 'N/A') + + ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgMemBandwidth)) + if rsmi_ret_ok(ret, device, silent=True): + printLog(device, 'Avg. 
Memory Bandwidth', avgMemBandwidth.value) + else: + printLog(device, 'Not supported on the given system', None) printLogSpacer() @@ -2404,47 +2458,51 @@ def showProductName(deviceList): printLogSpacer() -def getDevProductInfo(device): +def getDevProductInfo(device, silent=False): """ Show the requested product name for the device requested @param device: Device we want to get the info for + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ # Retrieve card vendor MAX_BUFF_SIZE = 256 MAX_DESC_SIZE = 20 - device_info = "N/A" + device_series = "N/A" + device_model = "N/A" + gpu_revision = "N/A" device_list = {} vendor = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, MAX_BUFF_SIZE) # Only continue if GPU vendor is AMD - if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device): + if rsmi_ret_ok(ret, device, 'get_vendor_name', silent) and isAmdDevice(device): # Retrieve the device series series = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_name_get(device, series, MAX_BUFF_SIZE) - if rsmi_ret_ok(ret, device, 'get_name'): + if rsmi_ret_ok(ret, device, 'get_name', silent): try: device_series = series.value.decode() except UnicodeDecodeError: - device_series = "N/A" - printErrLog(device, "Unable to read card series") + if not silent: + printErrLog(device, "Unable to read card series") # Retrieve the device model model = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, MAX_BUFF_SIZE) - if rsmi_ret_ok(ret, device, 'get_subsystem_name'): + if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent): try: device_model = model.value.decode() device_model = padHexValue(device_model, 4) except UnicodeDecodeError: - device_model = "N/A" - printErrLog(device, "Unable to read device model") + if not silent: + printErrLog(device, "Unable to read device model") try: gpu_revision = padHexValue(getRev(device), 2) except 
Exception as exc: - gpu_revision = "N/A" - printErrLog(device, "Unable to read card revision %s" % (exc)) + if not silent: + printErrLog(device, "Unable to read card revision %s" % (exc)) device_series_str = str(device_series[:MAX_DESC_SIZE]) device_series_str = device_series_str.ljust(MAX_DESC_SIZE, ' ') @@ -2790,7 +2848,9 @@ def getGraphColor(percentage): def showTempGraph(deviceList): deviceList.sort() - (temp_type, temp_value) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) printLogSpacer(' Temperature Graph ' + temp_type + ' ') temp_type = temp_type.lower() temp_type = temp_type.replace('(', '') @@ -3381,7 +3441,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): @param my_ret: Return of RSMI call (rocm_smi_lib API) @param metric: Parameter of GPU currently being analyzed @param silent: Echo verbose error reponse. - True siliences err output, False does not silience err output (default). + True silences err output, False does not silence err output (default). 
""" global RETCODE global PRINT_JSON @@ -3398,8 +3458,8 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): if err_str.value is not None: returnString += '%s\t' % (err_str.value.decode()) if not PRINT_JSON: - logging.debug('%s', returnString) if not silent: + logging.debug('%s', returnString) if my_ret in rsmi_status_verbose_err_out: printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None) RETCODE = my_ret @@ -3465,8 +3525,7 @@ def save(deviceList, savefilepath): # The code below is for when this script is run as an executable instead of when imported as a module if __name__ == '__main__': parser = argparse.ArgumentParser( - description='AMD ROCm System Management Interface | ROCM-SMI version: %s | Kernel version: %s' % ( - __version__, getVersion(None, rsmi_sw_component_t.RSMI_SW_COMP_DRIVER)), + description=f'AMD ROCm System Management Interface | ROCM-SMI version: {__version__}', formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=90, width=120)) groupDev = parser.add_argument_group() groupDisplayOpt = parser.add_argument_group('Display Options') @@ -3626,6 +3685,11 @@ if __name__ == '__main__': args = parser.parse_args() + # Must set PRINT_JSON early so the prints can be silenced + if args.json or args.csv: + PRINT_JSON = True + # Initialize rsmiBindings + rocmsmi = initRsmiBindings(silent=PRINT_JSON) # Initialize the rocm SMI library initializeRsmi() @@ -3661,8 +3725,7 @@ if __name__ == '__main__': sys.exit(1) # If we want JSON/CSV output, initialize the keys (devices) - if args.json or args.csv: - PRINT_JSON = True + if PRINT_JSON: for device in deviceList: JSON_DATA['card' + str(device)] = {} diff --git a/rocm_smi/python_smi_tools/rsmiBindings.py b/rocm_smi/python_smi_tools/rsmiBindings.py index 9ffcac138d..e6b141889f 100644 --- a/rocm_smi/python_smi_tools/rsmiBindings.py +++ b/rocm_smi/python_smi_tools/rsmiBindings.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """ROCm_SMI_LIB CLI Tool Python Bindings""" 
+# NOTE: You MUST call rsmiBindings.initRsmiBindings() when using this library! # TODO: Get most (or all) of these from rocm_smi.h to avoid mismatches and redundancy from __future__ import print_function @@ -14,36 +15,42 @@ import os # relative path changed accordingly. # if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location # +# Library load is wrapped in a function so prints can be hidden for PRINT_JSON mode. path_librocm = str() -rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') -if (rocm_smi_lib_path != None): - path_librocm = rocm_smi_lib_path -else: - path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' +def initRsmiBindings(silent=False): + def print_silent(*args): + if not silent: + print(args) -if not os.path.isfile(path_librocm): - print('Unable to find %s . Trying /opt/rocm*' % path_librocm) - for root, dirs, files in os.walk('/opt', followlinks=True): - if 'librocm_smi64.so.@VERSION_MAJOR@' in files: - path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@') - if os.path.isfile(path_librocm): - print('Using lib from %s' % path_librocm) + rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') + if (rocm_smi_lib_path != None): + path_librocm = rocm_smi_lib_path else: - print('Unable to find librocm_smi64.so.@VERSION_MAJOR@') -else: - print('Library loaded from: %s ' % path_librocm) + path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' -# ----------> TODO: Support static libs as well as SO -try: - cdll.LoadLibrary(path_librocm) - rocmsmi = CDLL(path_librocm) -except OSError: - print('Unable to load the rocm_smi library.\n'\ - 'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\ - '{0}Please refer to https://github.com/'\ - 'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\ - .format('\33[33m', '\033[0m')) - exit() + 
if not os.path.isfile(path_librocm): + print_silent('Unable to find %s . Trying /opt/rocm*' % path_librocm) + for root, dirs, files in os.walk('/opt', followlinks=True): + if 'librocm_smi64.so.@VERSION_MAJOR@' in files: + path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@') + if os.path.isfile(path_librocm): + print_silent('Using lib from %s' % path_librocm) + else: + print('Unable to find librocm_smi64.so.@VERSION_MAJOR@') + else: + print_silent('Library loaded from: %s ' % path_librocm) + + # ----------> TODO: Support static libs as well as SO + try: + cdll.LoadLibrary(path_librocm) + return CDLL(path_librocm) + except OSError: + print('Unable to load the rocm_smi library.\n'\ + 'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\ + '{0}Please refer to https://github.com/'\ + 'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\ + .format('\33[33m', '\033[0m')) + exit() # Device ID dv_id = c_uint64() diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index e52851a10f..655eacd2d8 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -41,29 +41,29 @@ * */ -#include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include #include -#include +#include +#include -#include #include -#include #include +#include +#include +#include #include -#include -#include -#include +#include #include #include +#include +#include #include +#include +#include #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi.h" @@ -81,16 +81,24 @@ using namespace ROCmLogging; using namespace amd::smi; static const uint32_t kMaxOverdriveLevel = 20; -static const float kEnergyCounterResolution = 15.3f; +static const float kEnergyCounterResolution = 15.3F; -std::map ClkStateMap = { - {RSMI_CLK_TYPE_SYS, "SCLK"}, - {RSMI_CLK_TYPE_DF, "DFCLK"}, - {RSMI_CLK_TYPE_DCEF, "DCEFCLK"}, - 
{RSMI_CLK_TYPE_SOC, "SOCCLK"}, - {RSMI_CLK_TYPE_MEM, "MCLK"}, - {RSMI_CLK_TYPE_PCIE, "PCIECLK"}, - }; +static const std::map kClkStateMap = { + { RSMI_CLK_TYPE_SYS, "SCLK" }, + { RSMI_CLK_TYPE_DF, "DFCLK" }, + { RSMI_CLK_TYPE_DCEF, "DCEFCLK" }, + { RSMI_CLK_TYPE_SOC, "SOCCLK" }, + { RSMI_CLK_TYPE_MEM, "MCLK" }, + { RSMI_CLK_TYPE_PCIE, "PCIECLK" }, +}; + +static const std::map kClkTypeMap = { + { RSMI_CLK_TYPE_SYS, amd::smi::kDevGPUSClk }, + { RSMI_CLK_TYPE_MEM, amd::smi::kDevGPUMClk }, + { RSMI_CLK_TYPE_DF, amd::smi::kDevFClk }, + { RSMI_CLK_TYPE_DCEF, amd::smi::kDevDCEFClk }, + { RSMI_CLK_TYPE_SOC, amd::smi::kDevSOCClk }, +}; #define TRY try { #define CATCH } catch (...) {return amd::smi::handleException();} @@ -156,7 +164,7 @@ static uint64_t freq_string_to_int(const std::vector &freq_lines, } if (is_curr != nullptr) { - if (freq_lines[i].find("*") != std::string::npos) { + if (freq_lines[i].find('*') != std::string::npos) { *is_curr = true; } else { *is_curr = false; @@ -167,7 +175,7 @@ static uint64_t freq_string_to_int(const std::vector &freq_lines, if (star_str[0] == 'x') { assert(lanes != nullptr && "Lanes are provided but null lanes pointer"); if (lanes) { - if (star_str.substr(1) == "") { + if (star_str.substr(1).empty()) { throw amd::smi::rsmi_exception(RSMI_STATUS_NO_DATA, __FUNCTION__); } @@ -209,8 +217,6 @@ static void freq_volt_string_to_point(std::string in_line, multiplier = get_multiplier_from_str(volts_units_str[0]); pt->voltage = static_cast(volts*multiplier); - - return; } static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) { @@ -237,8 +243,6 @@ static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) { multiplier = get_multiplier_from_str(hi_units_str[0]); rg->upper_bound = static_cast(hi*multiplier); - - return; } /** @@ -258,7 +262,7 @@ power_prof_string_to_int(std::string pow_prof_line, bool *is_curr, fs >> *prof_ind; fs >> mode; - while (1) { + while (true) { tmp = mode.find_last_of("* :"); if 
(tmp == std::string::npos) { break; @@ -267,7 +271,7 @@ power_prof_string_to_int(std::string pow_prof_line, bool *is_curr, } if (is_curr != nullptr) { - if (pow_prof_line.find("*") != std::string::npos) { + if (pow_prof_line.find('*') != std::string::npos) { *is_curr = true; } else { *is_curr = false; @@ -403,6 +407,10 @@ static rsmi_status_t set_dev_mon_value(amd::smi::MonitorTypes type, } int ret = dev->monitor()->writeMonitor(type, sensor_ind, std::to_string(val)); + /// If the sysfs file doesn't exist, it is not supported. + if (ret == ENOENT) { + return rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + } return amd::smi::ErrnoToRsmiStatus(ret); } @@ -786,14 +794,13 @@ rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) { TRY rsmi_status_t ret; - uint64_t val = 0; CHK_SUPPORT_NAME_ONLY(numa_node) DEVICE_MUTEX std::string str_val; ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val); - *numa_node = std::stol(str_val, 0); + *numa_node = std::stoi(str_val, nullptr); return ret; CATCH @@ -1037,13 +1044,10 @@ rsmi_dev_mem_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { } rsmi_status_t -rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od) { +rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - if (dv_ind < 0) { - return RSMI_STATUS_INVALID_ARGS; - } return rsmi_dev_overdrive_level_set_v1(static_cast(dv_ind), od); } @@ -1070,11 +1074,11 @@ rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od) { } rsmi_status_t -rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_level) { +rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_level) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - return rsmi_dev_perf_level_set_v1(static_cast(dv_ind), perf_level); + return rsmi_dev_perf_level_set_v1(dv_ind, perf_level); } rsmi_status_t @@ -1118,7 +1122,7 
@@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ } assert(val_vec.size() <= RSMI_MAX_NUM_FREQUENCIES); - if (val_vec.size() == 0) { + if (val_vec.empty()) { return RSMI_STATUS_NOT_YET_IMPLEMENTED; } @@ -1133,7 +1137,8 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ // Check that that is true. if (i > 0) { if (f->frequency[i] < f->frequency[i-1]) { - std::string sysvalue = ClkStateMap[clk_type]; + std::string sysvalue; + sysvalue += kClkStateMap.find(clk_type)->second; sysvalue += " Current Value"; sysvalue += ' ' + std::to_string(f->frequency[i]); sysvalue += " Previous Value"; @@ -1144,7 +1149,8 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ if (current) { // set the current frequency if (f->current != RSMI_MAX_NUM_FREQUENCIES + 1) { - std::string sysvalue = ClkStateMap[clk_type]; + std::string sysvalue; + sysvalue += kClkStateMap.find(clk_type)->second; sysvalue += " Current Value"; sysvalue += ' ' + std::to_string(f->frequency[i]); sysvalue += " Previous Value"; @@ -1183,7 +1189,7 @@ static rsmi_status_t get_power_profiles(uint32_t dv_ind, return ret; } assert(val_vec.size() <= RSMI_MAX_NUM_POWER_PROFILES); - if (val_vec.size() > RSMI_MAX_NUM_POWER_PROFILES + 1 || val_vec.size() < 1) { + if (val_vec.size() > RSMI_MAX_NUM_POWER_PROFILES + 1 || val_vec.empty()) { // Guest may not have power related information. if (amd::smi::is_vm_guest()) { return RSMI_STATUS_NOT_SUPPORTED; @@ -1386,8 +1392,9 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, return RSMI_STATUS_NOT_SUPPORTED; } - std::string min_sysvalue, max_sysvalue; - std::map ClkStateMap = { + std::string min_sysvalue; + std::string max_sysvalue; + std::map clk_char_map = { {RSMI_CLK_TYPE_SYS, "s"}, {RSMI_CLK_TYPE_MEM, "m"}, }; @@ -1405,11 +1412,11 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, // minimum clock. And 1 if to set maximum clock. 
E.g., "s 0 500" will update // minimum sclk to be 500 MHz. "m 1 800" will update maximum mclk to 800Mhz. - min_sysvalue = ClkStateMap[clkType]; + min_sysvalue = clk_char_map[clkType]; min_sysvalue += ' ' + std::to_string(RSMI_FREQ_IND_MIN); min_sysvalue += ' ' + std::to_string(minclkvalue); min_sysvalue += '\n'; - max_sysvalue = ClkStateMap[clkType]; + max_sysvalue = clk_char_map[clkType]; max_sysvalue += ' ' + std::to_string(RSMI_FREQ_IND_MAX); max_sysvalue += ' ' + std::to_string(maxclkvalue); max_sysvalue += '\n'; @@ -1441,7 +1448,7 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, LOG_TRACE(ss); std::string sysvalue; - std::map ClkStateMap = { + std::map clk_char_map = { {RSMI_CLK_TYPE_SYS, "s"}, {RSMI_CLK_TYPE_MEM, "m"}, }; @@ -1460,14 +1467,8 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, switch (clkType) { case RSMI_CLK_TYPE_SYS: - sysvalue = ClkStateMap[clkType]; - sysvalue += ' ' + std::to_string(level); - sysvalue += ' ' + std::to_string(clkvalue); - sysvalue += '\n'; - break; - case RSMI_CLK_TYPE_MEM: - sysvalue = ClkStateMap[clkType]; + sysvalue = clk_char_map[clkType]; sysvalue += ' ' + std::to_string(level); sysvalue += ' ' + std::to_string(clkvalue); sysvalue += '\n'; @@ -1547,7 +1548,6 @@ static void get_vc_region(uint32_t start_ind, } od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); od_value_pair_str_to_range((*val_vec)[start_ind + 1], &p->volt_range); - return; } /* @@ -1674,24 +1674,11 @@ rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, CHK_SUPPORT_VAR(f, clk_type) - switch (clk_type) { - case RSMI_CLK_TYPE_SYS: - dev_type = amd::smi::kDevGPUSClk; - break; - case RSMI_CLK_TYPE_MEM: - dev_type = amd::smi::kDevGPUMClk; - break; - case RSMI_CLK_TYPE_DF: - dev_type = amd::smi::kDevFClk; - break; - case RSMI_CLK_TYPE_DCEF: - dev_type = amd::smi::kDevDCEFClk; - break; - case RSMI_CLK_TYPE_SOC: - dev_type = amd::smi::kDevSOCClk; - break; - default: - 
return RSMI_STATUS_INVALID_ARGS; + const auto & clk_type_it = kClkTypeMap.find(clk_type); + if (clk_type_it != kClkTypeMap.end()) { + dev_type = clk_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } DEVICE_MUTEX @@ -1713,72 +1700,35 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, std::string val_str; amd::smi::DevInfoTypes dev_type; - switch (block) { - case RSMI_FW_BLOCK_ASD: - dev_type = amd::smi::kDevFwVersionAsd; - break; - case RSMI_FW_BLOCK_CE: - dev_type = amd::smi::kDevFwVersionCe; - break; - case RSMI_FW_BLOCK_DMCU: - dev_type = amd::smi::kDevFwVersionDmcu; - break; - case RSMI_FW_BLOCK_MC: - dev_type = amd::smi::kDevFwVersionMc; - break; - case RSMI_FW_BLOCK_ME: - dev_type = amd::smi::kDevFwVersionMe; - break; - case RSMI_FW_BLOCK_MEC: - dev_type = amd::smi::kDevFwVersionMec; - break; - case RSMI_FW_BLOCK_MEC2: - dev_type = amd::smi::kDevFwVersionMec2; - break; - case RSMI_FW_BLOCK_PFP: - dev_type = amd::smi::kDevFwVersionPfp; - break; - case RSMI_FW_BLOCK_RLC: - dev_type = amd::smi::kDevFwVersionRlc; - break; - case RSMI_FW_BLOCK_RLC_SRLC: - dev_type = amd::smi::kDevFwVersionRlcSrlc; - break; - case RSMI_FW_BLOCK_RLC_SRLG: - dev_type = amd::smi::kDevFwVersionRlcSrlg; - break; - case RSMI_FW_BLOCK_RLC_SRLS: - dev_type = amd::smi::kDevFwVersionRlcSrls; - break; - case RSMI_FW_BLOCK_SDMA: - dev_type = amd::smi::kDevFwVersionSdma; - break; - case RSMI_FW_BLOCK_SDMA2: - dev_type = amd::smi::kDevFwVersionSdma2; - break; - case RSMI_FW_BLOCK_SMC: - dev_type = amd::smi::kDevFwVersionSmc; - break; - case RSMI_FW_BLOCK_SOS: - dev_type = amd::smi::kDevFwVersionSos; - break; - case RSMI_FW_BLOCK_TA_RAS: - dev_type = amd::smi::kDevFwVersionTaRas; - break; - case RSMI_FW_BLOCK_TA_XGMI: - dev_type = amd::smi::kDevFwVersionTaXgmi; - break; - case RSMI_FW_BLOCK_UVD: - dev_type = amd::smi::kDevFwVersionUvd; - break; - case RSMI_FW_BLOCK_VCE: - dev_type = amd::smi::kDevFwVersionVce; - break; - case RSMI_FW_BLOCK_VCN: - dev_type = 
amd::smi::kDevFwVersionVcn; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + static const std::map kFWBlockTypeMap = { + { RSMI_FW_BLOCK_ASD, amd::smi::kDevFwVersionAsd }, + { RSMI_FW_BLOCK_CE, amd::smi::kDevFwVersionCe }, + { RSMI_FW_BLOCK_DMCU, amd::smi::kDevFwVersionDmcu }, + { RSMI_FW_BLOCK_MC, amd::smi::kDevFwVersionMc }, + { RSMI_FW_BLOCK_ME, amd::smi::kDevFwVersionMe }, + { RSMI_FW_BLOCK_MEC, amd::smi::kDevFwVersionMec }, + { RSMI_FW_BLOCK_MEC2, amd::smi::kDevFwVersionMec2 }, + { RSMI_FW_BLOCK_PFP, amd::smi::kDevFwVersionPfp }, + { RSMI_FW_BLOCK_RLC, amd::smi::kDevFwVersionRlc }, + { RSMI_FW_BLOCK_RLC_SRLC, amd::smi::kDevFwVersionRlcSrlc }, + { RSMI_FW_BLOCK_RLC_SRLG, amd::smi::kDevFwVersionRlcSrlg }, + { RSMI_FW_BLOCK_RLC_SRLS, amd::smi::kDevFwVersionRlcSrls }, + { RSMI_FW_BLOCK_SDMA, amd::smi::kDevFwVersionSdma }, + { RSMI_FW_BLOCK_SDMA2, amd::smi::kDevFwVersionSdma2 }, + { RSMI_FW_BLOCK_SMC, amd::smi::kDevFwVersionSmc }, + { RSMI_FW_BLOCK_SOS, amd::smi::kDevFwVersionSos }, + { RSMI_FW_BLOCK_TA_RAS, amd::smi::kDevFwVersionTaRas }, + { RSMI_FW_BLOCK_TA_XGMI, amd::smi::kDevFwVersionTaXgmi }, + { RSMI_FW_BLOCK_UVD, amd::smi::kDevFwVersionUvd }, + { RSMI_FW_BLOCK_VCE, amd::smi::kDevFwVersionVce }, + { RSMI_FW_BLOCK_VCN, amd::smi::kDevFwVersionVcn }, + }; + + const auto & dev_type_it = kFWBlockTypeMap.find(block); + if (dev_type_it != kFWBlockTypeMap.end()) { + dev_type = dev_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } DEVICE_MUTEX @@ -1788,7 +1738,7 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, static std::string bitfield_to_freq_string(uint64_t bitf, uint32_t num_supported) { - std::string bf_str(""); + std::string bf_str; std::bitset bs(bitf); if (num_supported > RSMI_MAX_NUM_FREQUENCIES) { @@ -1858,24 +1808,11 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, int ret_i; amd::smi::DevInfoTypes dev_type; - switch (clk_type) { - case RSMI_CLK_TYPE_SYS: - dev_type = amd::smi::kDevGPUSClk; - break; - case 
RSMI_CLK_TYPE_MEM: - dev_type = amd::smi::kDevGPUMClk; - break; - case RSMI_CLK_TYPE_DF: - dev_type = amd::smi::kDevFClk; - break; - case RSMI_CLK_TYPE_SOC: - dev_type = amd::smi::kDevSOCClk; - break; - case RSMI_CLK_TYPE_DCEF: - dev_type = amd::smi::kDevDCEFClk; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + const auto & clk_type_it = kClkTypeMap.find(clk_type); + if (clk_type_it != kClkTypeMap.end()) { + dev_type = clk_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } ret_i = dev->writeDevInfo(dev_type, freq_enable_str); @@ -1943,7 +1880,7 @@ get_id_name_str_from_line(uint64_t id, std::string ln, *ln_str >> token1; - if (token1 == "") { + if (token1.empty()) { throw amd::smi::rsmi_exception(RSMI_STATUS_NO_DATA, __FUNCTION__); } @@ -2078,13 +2015,13 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, } } - for (auto fl : pci_name_files) { + for (const auto& fl : pci_name_files) { std::ifstream id_file_strm(fl); while (std::getline(id_file_strm, ln)) { std::istringstream ln_str(ln); // parse line - if (ln[0] == '#' || ln.size() == 0) { + if (ln[0] == '#' || ln.empty()) { continue; } @@ -2095,29 +2032,28 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, if (typ == NAME_STR_SUBSYS && found_device_id_for_subsys) { val_str = get_id_name_str_from_line(subsys_vend_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { // We've chopped the subsys_vend ID, now we need to get the // subsys description val_str = get_id_name_str_from_line(subsys_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { break; - } else { - val_str.clear(); } + val_str.clear(); } } } else if (typ == NAME_STR_DEVICE) { // ln[1] != '\t' // This is a device line val_str = get_id_name_str_from_line(device_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { break; } } else if (typ == NAME_STR_SUBSYS) { // match the device id line val_str = get_id_name_str_from_line(device_id, ln, 
&ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { found_device_id_for_subsys = true; } } @@ -2135,22 +2071,21 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, val_str = get_id_name_str_from_line(vendor_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { if (typ == NAME_STR_VENDOR) { break; - } else { - val_str.clear(); - found_device_vendor = true; } + val_str.clear(); + found_device_vendor = true; } } } - if (val_str.size() > 0) { + if (!val_str.empty()) { break; } } - if (val_str.size() == 0) { + if (val_str.empty()) { return get_backup_name(vendor_id, name, len); } size_t ct = val_str.copy(name, len); @@ -2409,8 +2344,8 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t); // Calculate the index - int width_index = -1; - int speed_index = -1; + uint32_t width_index = -1; + uint32_t speed_index = -1; uint32_t cur_index = 0; for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) { if (link_width[cur_index] == gpu_metrics.pcie_link_width) { @@ -2418,8 +2353,7 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { break; } } - for (cur_index = 0; - cur_index < SPEED_DATA_LENGTH; cur_index++) { + for (cur_index = 0; cur_index < SPEED_DATA_LENGTH; cur_index++) { if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) { speed_index = cur_index; break; @@ -2431,11 +2365,10 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { // Set possible lanes and frequencies b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index; - for (cur_index = 0; - cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { - b->transfer_rate.frequency[cur_index] - = link_speed[cur_index/WIDTH_DATA_LENGTH] * 100 * 1000000L; - b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH]; + for 
(cur_index = 0; cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { + b->transfer_rate.frequency[cur_index] = + static_cast(link_speed[cur_index/WIDTH_DATA_LENGTH]) * 100 * 1000000L; + b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH]; } /* frequency = {2500, 2500, 2500, 2500, 2500, 2500, @@ -2549,54 +2482,29 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, LOG_TRACE(ss); rsmi_status_t ret; - amd::smi::MonitorTypes mon_type; + amd::smi::MonitorTypes mon_type = amd::smi::kMonInvalid; uint16_t val_ui16; - switch (metric) { - case RSMI_TEMP_CURRENT: - mon_type = amd::smi::kMonTemp; - break; - case RSMI_TEMP_MAX: - mon_type = amd::smi::kMonTempMax; - break; - case RSMI_TEMP_MIN: - mon_type = amd::smi::kMonTempMin; - break; - case RSMI_TEMP_MAX_HYST: - mon_type = amd::smi::kMonTempMaxHyst; - break; - case RSMI_TEMP_MIN_HYST: - mon_type = amd::smi::kMonTempMinHyst; - break; - case RSMI_TEMP_CRITICAL: - mon_type = amd::smi::kMonTempCritical; - break; - case RSMI_TEMP_CRITICAL_HYST: - mon_type = amd::smi::kMonTempCriticalHyst; - break; - case RSMI_TEMP_EMERGENCY: - mon_type = amd::smi::kMonTempEmergency; - break; - case RSMI_TEMP_EMERGENCY_HYST: - mon_type = amd::smi::kMonTempEmergencyHyst; - break; - case RSMI_TEMP_CRIT_MIN: - mon_type = amd::smi::kMonTempCritMin; - break; - case RSMI_TEMP_CRIT_MIN_HYST: - mon_type = amd::smi::kMonTempCritMinHyst; - break; - case RSMI_TEMP_OFFSET: - mon_type = amd::smi::kMonTempOffset; - break; - case RSMI_TEMP_LOWEST: - mon_type = amd::smi::kMonTempLowest; - break; - case RSMI_TEMP_HIGHEST: - mon_type = amd::smi::kMonTempHighest; - break; - default: - mon_type = amd::smi::kMonInvalid; + static const std::map kMetricTypeMap = { + { RSMI_TEMP_CURRENT, amd::smi::kMonTemp }, + { RSMI_TEMP_MAX, amd::smi::kMonTempMax }, + { RSMI_TEMP_MIN, amd::smi::kMonTempMin }, + { RSMI_TEMP_MAX_HYST, amd::smi::kMonTempMaxHyst }, + { RSMI_TEMP_MIN_HYST, amd::smi::kMonTempMinHyst }, + { RSMI_TEMP_CRITICAL, 
amd::smi::kMonTempCritical }, + { RSMI_TEMP_CRITICAL_HYST, amd::smi::kMonTempCriticalHyst }, + { RSMI_TEMP_EMERGENCY, amd::smi::kMonTempEmergency }, + { RSMI_TEMP_EMERGENCY_HYST, amd::smi::kMonTempEmergencyHyst }, + { RSMI_TEMP_CRIT_MIN, amd::smi::kMonTempCritMin }, + { RSMI_TEMP_CRIT_MIN_HYST, amd::smi::kMonTempCritMinHyst }, + { RSMI_TEMP_OFFSET, amd::smi::kMonTempOffset }, + { RSMI_TEMP_LOWEST, amd::smi::kMonTempLowest }, + { RSMI_TEMP_HIGHEST, amd::smi::kMonTempHighest }, + }; + + const auto mon_type_it = kMetricTypeMap.find(metric); + if (mon_type_it != kMetricTypeMap.end()) { + mon_type = mon_type_it->second; } if (temperature == nullptr) { @@ -2612,80 +2520,81 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, return RSMI_STATUS_INVALID_ARGS; } - // The HBM temperature is retreived from the gpu_metrics - if (sensor_type == RSMI_TEMP_TYPE_HBM_0 - || sensor_type == RSMI_TEMP_TYPE_HBM_1 - || sensor_type == RSMI_TEMP_TYPE_HBM_2 - || sensor_type == RSMI_TEMP_TYPE_HBM_3) { - if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: To retreive HBM temp, we only support metric = " - << "RSMI_TEMP_CURRENT" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } + // The HBM temperature is retrieved from the gpu_metrics + if (sensor_type == RSMI_TEMP_TYPE_HBM_0 || + sensor_type == RSMI_TEMP_TYPE_HBM_1 || + sensor_type == RSMI_TEMP_TYPE_HBM_2 || + sensor_type == RSMI_TEMP_TYPE_HBM_3) { + if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: To retrieve HBM temp, we only support metric = " + << 
"RSMI_TEMP_CURRENT" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } - rsmi_gpu_metrics_t gpu_metrics; - ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); - if (ret != RSMI_STATUS_SUCCESS) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: rsmi_dev_gpu_metrics_info_get returned " - << getRSMIStatusString(ret) - << " | Returning = " - << getRSMIStatusString(ret) << " |"; - LOG_ERROR(ss); - return ret; - } + rsmi_gpu_metrics_t gpu_metrics; + ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: rsmi_dev_gpu_metrics_info_get returned " + << getRSMIStatusString(ret) + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); + return ret; + } - switch (sensor_type) { - case RSMI_TEMP_TYPE_HBM_0: - val_ui16 = gpu_metrics.temperature_hbm[0]; - break; - case RSMI_TEMP_TYPE_HBM_1: - val_ui16 = gpu_metrics.temperature_hbm[1]; - break; - case RSMI_TEMP_TYPE_HBM_2: - val_ui16 = gpu_metrics.temperature_hbm[2]; - break; - case RSMI_TEMP_TYPE_HBM_3: - val_ui16 = gpu_metrics.temperature_hbm[3]; - break; - default: - return RSMI_STATUS_INVALID_ARGS; - } - if (val_ui16 == UINT16_MAX) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: Reached UINT16 max value, overflow" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } else - *temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE; + switch (sensor_type) { + 
case RSMI_TEMP_TYPE_HBM_0: + val_ui16 = gpu_metrics.temperature_hbm[0]; + break; + case RSMI_TEMP_TYPE_HBM_1: + val_ui16 = gpu_metrics.temperature_hbm[1]; + break; + case RSMI_TEMP_TYPE_HBM_2: + val_ui16 = gpu_metrics.temperature_hbm[2]; + break; + case RSMI_TEMP_TYPE_HBM_3: + val_ui16 = gpu_metrics.temperature_hbm[3]; + break; + default: + return RSMI_STATUS_INVALID_ARGS; + } + if (val_ui16 == UINT16_MAX) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: Reached UINT16 max value, overflow" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } - ss << __PRETTY_FUNCTION__ << " | ======= end ======= " - << " | Success " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Data: " << *temperature - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; - LOG_INFO(ss); - return RSMI_STATUS_SUCCESS; + *temperature = static_cast(val_ui16) * CENTRIGRADE_TO_MILLI_CENTIGRADE; + + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Data: " << *temperature + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; + LOG_INFO(ss); + return RSMI_STATUS_SUCCESS; } // end HBM temperature DEVICE_MUTEX @@ -2846,9 +2755,8 @@ rsmi_dev_fan_reset(uint32_t dv_ind, uint32_t sensor_ind) { LOG_TRACE(ss); ++sensor_ind; // fan sysfs files have 1-based indices - + REQUIRE_ROOT_ACCESS DEVICE_MUTEX - ret = set_dev_mon_value(amd::smi::kMonFanCntrlEnable, dv_ind, sensor_ind, 2); return ret; @@ -2889,14 +2797,12 @@ rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed) { // First need to set fan mode (pwm1_enable) to 1 (aka, "manual") ret = 
set_dev_mon_value(amd::smi::kMonFanCntrlEnable, dv_ind, sensor_ind, 1); - if (ret != RSMI_STATUS_SUCCESS) { return ret; } ret = set_dev_mon_value(amd::smi::kMonFanSpeed, dv_ind, sensor_ind, speed); - return ret; CATCH @@ -2936,7 +2842,7 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { } rsmi_status_t -rsmi_dev_gpu_reset(int32_t dv_ind) { +rsmi_dev_gpu_reset(uint32_t dv_ind) { TRY std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; @@ -3110,7 +3016,8 @@ rsmi_status_t rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap) { TRY rsmi_status_t ret; - uint64_t min, max; + uint64_t min; + uint64_t max; std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); @@ -3219,10 +3126,24 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) { GET_DEV_AND_KFDNODE_FROM_INDX if (kfd_node->get_total_memory(total) == 0 && *total > 0) { + ss << __PRETTY_FUNCTION__ + << " | inside success fallback... " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | after fallback... 
" + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH } @@ -3264,11 +3185,36 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, GET_DEV_AND_KFDNODE_FROM_INDX uint64_t total = 0; ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total); - if (total != 0) return ret; // do not need to fallback + if (total != 0) { + ss << __PRETTY_FUNCTION__ + << " no fallback needed! - " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); + return ret; // do not need to fallback + } if ( kfd_node->get_used_memory(used) == 0 ) { + ss << __PRETTY_FUNCTION__ + << " | in fallback == success ..." + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | at end!!!! after fallback ..." 
+ << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH @@ -3483,15 +3429,15 @@ rsmi_utilization_count_get(uint32_t dv_ind, val_ui32 = gpu_metrics.gfx_activity_acc; break; case RSMI_COARSE_GRAIN_MEM_ACTIVITY: - val_ui32 = gpu_metrics.mem_actvity_acc; + val_ui32 = gpu_metrics.mem_activity_acc; break; default: return RSMI_STATUS_INVALID_ARGS; } - if (val_ui32 == UINT32_MAX) + if (val_ui32 == UINT32_MAX) { return RSMI_STATUS_NOT_SUPPORTED; - else - utilization_counters[index].value = val_ui32; + } + utilization_counters[index].value = val_ui32; } *timestamp = gpu_metrics.system_clock_counter; @@ -3500,6 +3446,119 @@ rsmi_utilization_count_get(uint32_t dv_ind, CATCH } +rsmi_status_t +rsmi_dev_activity_metric_get(uint32_t dv_ind, + rsmi_activity_metric_t activity_metric_type, + rsmi_activity_metric_counter_t* activity_metric_counter) { + + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + if (!activity_metric_counter) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Type: " << activity_metric_type + << " | Cause: rsmi_activity_metric_counter_t was a null ptr reference" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ostrstream); + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + rsmi_gpu_metrics_t gpu_metrics; + status_code = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Type: " << activity_metric_type + << " | Cause: 
rsmi_dev_gpu_metrics_info_get returned " + << getRSMIStatusString(status_code) + << " | Returning = " + << status_code << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_GFX) { + activity_metric_counter->average_gfx_activity = gpu_metrics.average_gfx_activity; + ostrstream << __PRETTY_FUNCTION__ + << " | For GFX: " << activity_metric_counter->average_gfx_activity; + LOG_INFO(ostrstream); + } + if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_UMC) { + activity_metric_counter->average_umc_activity = gpu_metrics.average_umc_activity; + ostrstream << __PRETTY_FUNCTION__ + << " | For UMC: " << activity_metric_counter->average_umc_activity; + LOG_INFO(ostrstream); + } + if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_MM) { + activity_metric_counter->average_mm_activity = gpu_metrics.average_mm_activity; + ostrstream << __PRETTY_FUNCTION__ + << " | For MM: " << activity_metric_counter->average_mm_activity; + LOG_INFO(ostrstream); + } + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Metric Type: " << activity_metric_type + << " | Returning = " + << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity) { + + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + if (!avg_activity) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM + << " | Cause: avg_activity was a null ptr reference" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ostrstream); + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + 
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + auto avg_mm_activity(uint16_t(0)); + rsmi_activity_metric_counter_t activity_metric_counter; + status_code = rsmi_dev_activity_metric_get(dv_ind, rsmi_activity_metric_t::RSMI_ACTIVITY_MM, &activity_metric_counter); + avg_activity = &activity_metric_counter.average_mm_activity; + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM + << " | Returning = " + << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + + rsmi_status_t rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) { TRY @@ -3717,7 +3776,7 @@ rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle) { rsmi_status_t rsmi_counter_control(rsmi_event_handle_t evt_handle, - rsmi_counter_command_t cmd, void *) { + rsmi_counter_command_t cmd, void * /*unused*/) { TRY amd::smi::evt::Event *evt = @@ -3778,9 +3837,9 @@ rsmi_counter_read(rsmi_event_handle_t evt_handle, } if (ret == 0) { return RSMI_STATUS_SUCCESS; - } else { - return RSMI_STATUS_UNEXPECTED_SIZE; } + + return RSMI_STATUS_UNEXPECTED_SIZE; CATCH } @@ -3826,9 +3885,9 @@ rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group) { if (grp->find(group) == grp->end()) { return RSMI_STATUS_NOT_SUPPORTED; - } else { - return RSMI_STATUS_SUCCESS; } + + return RSMI_STATUS_SUCCESS; CATCH } @@ -3931,6 +3990,10 @@ rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages, ret = GetDevValueVec(amd::smi::kDevMemPageBad, dv_ind, &val_vec); + // file is empty, which is valid for no errors + if (ret == RSMI_STATUS_UNEXPECTED_DATA) { + ret = RSMI_STATUS_SUCCESS; + } if (ret == RSMI_STATUS_FILE_ERROR) { return RSMI_STATUS_NOT_SUPPORTED; } @@ -4320,7 +4383,8 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, return RSMI_STATUS_INVALID_ARGS; } - uint32_t node_ind_src, 
node_ind_dst; + uint32_t node_ind_src; + uint32_t node_ind_dst; // Fetch the source and destination GPU node index if (smi.get_node_index(dv_ind_src, &node_ind_src) || smi.get_node_index(dv_ind_dst, &node_ind_dst)) { @@ -4378,19 +4442,13 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { } switch (mapStringToRSMIComputePartitionTypes[compute_partition_str]) { - case RSMI_COMPUTE_PARTITION_INVALID: - // Retrieved an unknown compute partition - return RSMI_STATUS_UNEXPECTED_DATA; case RSMI_COMPUTE_PARTITION_CPX: - break; case RSMI_COMPUTE_PARTITION_SPX: - break; case RSMI_COMPUTE_PARTITION_DPX: - break; case RSMI_COMPUTE_PARTITION_TPX: - break; case RSMI_COMPUTE_PARTITION_QPX: break; + case RSMI_COMPUTE_PARTITION_INVALID: default: // Retrieved an unknown compute partition return RSMI_STATUS_UNEXPECTED_DATA; @@ -4463,19 +4521,13 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, std::string currentComputePartition; switch (compute_partition) { - case RSMI_COMPUTE_PARTITION_INVALID: - // Retrieved an unknown compute partition - return RSMI_STATUS_INVALID_ARGS; case RSMI_COMPUTE_PARTITION_CPX: - break; case RSMI_COMPUTE_PARTITION_SPX: - break; case RSMI_COMPUTE_PARTITION_DPX: - break; case RSMI_COMPUTE_PARTITION_TPX: - break; case RSMI_COMPUTE_PARTITION_QPX: break; + case RSMI_COMPUTE_PARTITION_INVALID: default: return RSMI_STATUS_INVALID_ARGS; } @@ -4522,17 +4574,12 @@ static rsmi_status_t get_nps_mode(uint32_t dv_ind, std::string &nps_mode) { } switch (mapStringToNPSModeTypes[val_str]) { - case RSMI_MEMORY_PARTITION_UNKNOWN: - // Retrieved an unknown NPS mode - return RSMI_STATUS_UNEXPECTED_DATA; case RSMI_MEMORY_PARTITION_NPS1: - break; case RSMI_MEMORY_PARTITION_NPS2: - break; case RSMI_MEMORY_PARTITION_NPS4: - break; case RSMI_MEMORY_PARTITION_NPS8: break; + case RSMI_MEMORY_PARTITION_UNKNOWN: default: // Retrieved an unknown NPS mode return RSMI_STATUS_UNEXPECTED_DATA; @@ -4566,7 +4613,7 @@ rsmi_dev_nps_mode_set(uint32_t dv_ind, 
rsmi_nps_mode_type_t nps_mode) { } } - if (isCorrectDevice == false) { + if (!isCorrectDevice) { return RSMI_STATUS_NOT_SUPPORTED; } @@ -4575,17 +4622,12 @@ rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode) { std::string currentNPSMode; switch (nps_mode) { - case RSMI_MEMORY_PARTITION_UNKNOWN: - // Retrieved an unknown NPS mode - return RSMI_STATUS_INVALID_ARGS; case RSMI_MEMORY_PARTITION_NPS1: - break; case RSMI_MEMORY_PARTITION_NPS2: - break; case RSMI_MEMORY_PARTITION_NPS4: - break; case RSMI_MEMORY_PARTITION_NPS8: break; + case RSMI_MEMORY_PARTITION_UNKNOWN: default: return RSMI_STATUS_INVALID_ARGS; } @@ -4721,19 +4763,19 @@ rsmi_dev_supported_func_iterator_open(uint32_t dv_ind, if (dev->supported_funcs()->begin() == dev->supported_funcs()->end()) { delete *handle; return RSMI_STATUS_NO_DATA; - } else { - SupportedFuncMapIt *supp_func_iter = new SupportedFuncMapIt; - - if (supp_func_iter == nullptr) { - return RSMI_STATUS_OUT_OF_RESOURCES; - } - *supp_func_iter = dev->supported_funcs()->begin(); - - (*handle)->func_id_iter = reinterpret_cast(supp_func_iter); - (*handle)->container_ptr = - reinterpret_cast(dev->supported_funcs()); } + SupportedFuncMapIt *supp_func_iter = new SupportedFuncMapIt; + + if (supp_func_iter == nullptr) { + return RSMI_STATUS_OUT_OF_RESOURCES; + } + *supp_func_iter = dev->supported_funcs()->begin(); + + (*handle)->func_id_iter = reinterpret_cast(supp_func_iter); + (*handle)->container_ptr = + reinterpret_cast(dev->supported_funcs()); + return RSMI_STATUS_SUCCESS; CATCH @@ -5100,7 +5142,8 @@ rsmi_event_notification_get(int timeout_ms, if (*num_elem < buffer_size && errno != EAGAIN) { return amd::smi::ErrnoToRsmiStatus(errno); - } else if (*num_elem >= buffer_size) { + } + if (*num_elem >= buffer_size) { return RSMI_STATUS_SUCCESS; } @@ -5176,7 +5219,7 @@ rsmi_test_refcount(uint64_t refcnt_type) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); std::lock_guard guard(*smi.bootstrap_mutex()); - if 
(smi.ref_count() == 0 && smi.devices().size() != 0) { + if (smi.ref_count() == 0 && !smi.devices().empty()) { return -1; } diff --git a/rocm_smi/src/rocm_smi_counters.cc b/rocm_smi/src/rocm_smi_counters.cc index 9f82798183..a08819568e 100755 --- a/rocm_smi/src/rocm_smi_counters.cc +++ b/rocm_smi/src/rocm_smi_counters.cc @@ -41,20 +41,20 @@ * */ -#include -#include -#include -#include #include +#include #include #include -#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include #include #include "rocm_smi/rocm_smi.h" @@ -164,8 +164,7 @@ GetSupportedEventGroups(uint32_t dev_num, dev_evt_grp_set_t *supported_grps) { } // /sys/bus/event_source/devices/_/type Event::Event(rsmi_event_type_t event, uint32_t dev_ind) : - event_type_(event), prev_cntr_val_(0) { - fd_ = -1; + event_type_(event), fd_(-1), prev_cntr_val_(0) { rsmi_event_group_t grp = EvtGrpFromEvtID(event); assert(grp != RSMI_EVNT_GRP_INVALID); // This should have failed before now @@ -398,10 +397,11 @@ readn(int fd, void *buf, size_t n) { return static_cast(n - left); } if (bytes < 0) { - if (errno == EINTR) /* read got interrupted */ + if (errno == EINTR) { + /* read got interrupted */ continue; - else - return -errno; + } + return -errno; } left -= static_cast(bytes); diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 670b6bf952..8bea7e86a3 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -43,30 +43,28 @@ #include #include -#include -#include #include -#include +#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" 
-#include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi_logger.h" #include "shared_mutex.h" // NOLINT @@ -611,7 +609,6 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { bool reg_file; int ret = isRegularFile(sysfs_path, ®_file); - if (ret != 0) { ss << "File did not exist - SYSFS file (" << sysfs_path << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) @@ -708,7 +705,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) { int ret; std::ostringstream ss; - fs.rdbuf()->pubsetbuf(0,0); + fs.rdbuf()->pubsetbuf(nullptr,0); ret = openSysfsFileStream(type, &fs, valStr.c_str()); if (ret != 0) { ss << "Could not write device info string (" << valStr @@ -822,7 +819,8 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, FILE *ptr; sysfs_path += "/device/"; sysfs_path += kDevAttribNameMap.at(type); - ptr = fopen(sysfs_path.c_str(), "rb"); + + ptr = fopen(sysfs_path.c_str(), "rb"); if (!ptr) { ss << "Could not read DevInfoBinary for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")" @@ -874,21 +872,21 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { ss << "Read devInfoMultiLineStr for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")" << ", but contained no string lines"; - LOG_INFO(ss); - return 0; + LOG_ERROR(ss); + return ENXIO; } // Remove any *trailing* empty (whitespace) lines - while (retVec->size() != 0 && + while (!retVec->empty() && retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) { retVec->pop_back(); } // allow logging output of multiline strings - for (auto l: *retVec) { + for (const auto& l: *retVec) { allLines += "\n" + l; } @@ -902,6 +900,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, << RocmSMI::devInfoTypesStrings.at(type) << ")" << ", but lines were empty"; LOG_INFO(ss); + return ENXIO; } return 0; } @@ -924,10 +923,10 @@ 
int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - tmp_val = std::stoi(tempStr, 0, 16); + tmp_val = std::stoi(tempStr, nullptr, 16); if (tmp_val < 0) { return EINVAL; } @@ -949,10 +948,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevXGMIError: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - *val = std::stoul(tempStr, 0); + *val = std::stoul(tempStr, nullptr); break; case kDevUniqueId: @@ -979,10 +978,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevFwVersionVcn: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - *val = std::stoul(tempStr, 0, 16); + *val = std::stoul(tempStr, nullptr, 16); break; case kDevGpuReset: @@ -1120,13 +1119,9 @@ void Device::DumpSupportedFunctions(void) { } void Device::fillSupportedFuncs(void) { - if (supported_funcs_.size() != 0) { + if (!supported_funcs_.empty()) { return; } - if (monitor() == nullptr) { - return; - } - std::map::const_iterator it = kDevFuncDependsMap.begin(); std::string dev_rt = path_ + "/device"; @@ -1160,7 +1155,7 @@ void Device::fillSupportedFuncs(void) { std::vector::const_iterator var = it->second.variants.begin(); - if (it->second.variants.size() == 0) { + if (it->second.variants.empty()) { supported_funcs_[it->first] = nullptr; it++; continue; @@ -1176,13 +1171,15 @@ void Device::fillSupportedFuncs(void) { (*supported_variants)[kDevInfoVarTypeToRSMIVariant.at(*var)] = nullptr; } - if ((*supported_variants).size() > 0) { + if (!(*supported_variants).empty()) { supported_funcs_[it->first] = supported_variants; } it++; } - monitor()->fillSupportedFuncs(&supported_funcs_); + if (monitor() != nullptr) { + monitor()->fillSupportedFuncs(&supported_funcs_); + } // 
DumpSupportedFunctions(); } @@ -1222,35 +1219,32 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, if (sub_variant == RSMI_DEFAULT_VARIANT) { return true; - } else { // sub_variant != RSMI_DEFAULT_VARIANT - // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr - assert(var_it->second != nullptr); + } + // sub_variant != RSMI_DEFAULT_VARIANT + // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr + assert(var_it->second != nullptr); - return subvariant_match(&(var_it->second), sub_variant); - } - } else { // variant == RSMI_DEFAULT_VARIANT - if (func_it->second != nullptr) { - var_it = func_it->second->find(variant); - } - if (sub_variant == RSMI_DEFAULT_VARIANT) { - return true; - } else { // sub_variant != RSMI_DEFAULT_VARIANT - if (func_it->second == nullptr) { - return false; - } - return subvariant_match(&(var_it->second), sub_variant); - } + return subvariant_match(&(var_it->second), sub_variant); } - assert(false); // We should not reach here - - return false; + // variant == RSMI_DEFAULT_VARIANT + if (func_it->second != nullptr) { + var_it = func_it->second->find(variant); + } + if (sub_variant == RSMI_DEFAULT_VARIANT) { + return true; + } + // sub_variant != RSMI_DEFAULT_VARIANT + if (func_it->second == nullptr) { + return false; + } + return subvariant_match(&(var_it->second), sub_variant); } rsmi_status_t Device::restartAMDGpuDriver(void) { REQUIRE_ROOT_ACCESS bool restartSuccessful = true; bool success = false; - std::string out = ""; + std::string out; bool wasGdmServiceActive = false; // sudo systemctl is-active gdm diff --git a/rocm_smi/src/rocm_smi_gpu_metrics.cc b/rocm_smi/src/rocm_smi_gpu_metrics.cc index 3e147e2f9f..e89a0d58fb 100755 --- a/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -41,24 +41,22 @@ * */ -#include #include +#include +#include +#include +#include #include #include -#include -#include -#include #include -#include +#include #include // 
NOLINT +#include #include -#include -#include #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_logger.h" @@ -151,7 +149,7 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2, const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, const rsmi_gpu_metrics_t *rsmi_gpu_metrics) { - if (RocmSMI::getInstance().isLoggingOn() == false) { + if (!RocmSMI::getInstance().isLoggingOn()) { return; } std::ostringstream ss; @@ -171,9 +169,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, } if (rsmi_gpu_metrics == nullptr) { return; - } else { - // do nothing - continue } + ss /* Common Header */ << print_unsigned_hex_and_int( @@ -291,8 +288,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, rsmi_gpu_metrics->gfx_activity_acc, "rsmi_gpu_metrics->gfx_activity_acc") << print_unsigned_hex_and_int( - rsmi_gpu_metrics->mem_actvity_acc, - "rsmi_gpu_metrics->mem_actvity_acc"); + rsmi_gpu_metrics->mem_activity_acc, + "rsmi_gpu_metrics->mem_activity_acc"); for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) { ss << print_unsigned_hex_and_int( rsmi_gpu_metrics->temperature_hbm[i], @@ -366,7 +363,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, } #define ASSIGN_DATA_FIELD(FIELD, SRC) \ - data->FIELD = SRC->FIELD; + data->FIELD = (SRC)->FIELD; #define ASSIGN_COMMON_FORMATS(SRC) \ ASSIGN_DATA_FIELD(common_header, (SRC)) \ @@ -417,7 +414,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, // These fields didn't exist in v0 data->gfx_activity_acc = 0; - data->mem_actvity_acc = 0; + data->mem_activity_acc = 0; (void)memset(data->temperature_hbm, 0, RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t)); } // else handle other conversions to format 1 diff 
--git a/rocm_smi/src/rocm_smi_io_link.cc b/rocm_smi/src/rocm_smi_io_link.cc index 888f13fffa..218e520d84 100755 --- a/rocm_smi/src/rocm_smi_io_link.cc +++ b/rocm_smi/src/rocm_smi_io_link.cc @@ -41,20 +41,19 @@ * */ -#include -#include #include +#include #include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include "rocm_smi/rocm_smi.h" -#include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_io_link.h" @@ -161,7 +160,7 @@ static int ReadLinkProperties(uint32_t node_indx, uint32_t link_indx, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { fs.close(); return 0; } @@ -182,7 +181,7 @@ static int DiscoverLinks(std::map, if (links == nullptr) { return EINVAL; } - assert(links->size() == 0); + assert(links->empty()); links->clear(); @@ -229,8 +228,8 @@ static int DiscoverLinks(std::map, } link_indx = static_cast(std::stoi(dentry_io_link->d_name)); - link = std::shared_ptr(new IOLink(node_indx, link_indx, - directory)); + link = std::make_shared(node_indx, link_indx, + directory); link->Initialize(); @@ -273,7 +272,7 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::mapsize() == 0); + assert(links->empty()); links->clear(); @@ -297,8 +296,8 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map(std::stoi(dentry->d_name)); - link = std::shared_ptr(new IOLink(node_indx, link_indx, - directory)); + link = std::make_shared(node_indx, link_indx, + directory); link->Initialize(); @@ -323,16 +322,15 @@ int DiscoverP2PLinksPerNode(uint32_t node_indx, std::map propVec; - assert(properties_.size() == 0); - if (properties_.size() > 0) { + assert(properties_.empty()); + if (!properties_.empty()) { return 0; } @@ -347,8 +345,8 @@ int IOLink::ReadProperties(void) { uint64_t val_int; // Assume all properties are unsigned integers for now std::istringstream fs; - for (uint32_t i = 0; i < propVec.size(); ++i) { - 
fs.str(propVec[i]); + for (const auto & i : propVec) { + fs.str(i); fs >> key_str; fs >> val_int; diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc index 092bcb3414..7fe9004cc3 100755 --- a/rocm_smi/src/rocm_smi_kfd.cc +++ b/rocm_smi/src/rocm_smi_kfd.cc @@ -41,28 +41,29 @@ * */ -#include -#include -#include -#include -#include #include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include #include "rocm_smi/rocm_smi_io_link.h" #include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" -#include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_logger.h" namespace amd { namespace smi { @@ -195,7 +196,7 @@ int ReadKFDDeviceProperties(uint32_t kfd_node_id, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { fs.close(); return ENOENT; } @@ -517,7 +518,7 @@ int DiscoverKFDNodes(std::map> *nodes) { if (nodes == nullptr) { return EINVAL; } - assert(nodes->size() == 0); + assert(nodes->empty()); nodes->clear(); @@ -548,7 +549,7 @@ int DiscoverKFDNodes(std::map> *nodes) { continue; } - node = std::shared_ptr(new KFDNode(node_indx)); + node = std::make_shared(node_indx); node->Initialize(); @@ -596,16 +597,15 @@ int DiscoverKFDNodes(std::map> *nodes) { return 0; } -KFDNode::~KFDNode() { -} +KFDNode::~KFDNode() = default; int KFDNode::ReadProperties(void) { int ret; std::vector propVec; - assert(properties_.size() == 0); - if (properties_.size() > 0) { + assert(properties_.empty()); + if (!properties_.empty()) { return 0; } @@ -620,8 +620,8 @@ int KFDNode::ReadProperties(void) { uint64_t val_int; // Assume all properties are unsigned integers for now std::istringstream fs; - for (uint32_t i = 0; i < propVec.size(); ++i) { - fs.str(propVec[i]); + for (const auto & i : propVec) { + 
fs.str(i); fs >> key_str; fs >> val_int; @@ -776,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, // /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties // size_in_bytes 68702699520 int KFDNode::get_total_memory(uint64_t* total) { - if (total == nullptr) return EINVAL; + std::ostringstream ss; + if (total == nullptr) { + return EINVAL; + } *total = 0; std::string f_path = kKFDNodesPathRoot; f_path += "/"; f_path += std::to_string(node_indx_); f_path += "/mem_banks"; + int subDirCount = subDirectoryCountInPath(f_path); + ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path + << " has subdirectory count = " << std::to_string(subDirCount); + LOG_DEBUG(ss); auto kfd_node_dir = opendir(f_path.c_str()); if (kfd_node_dir == nullptr) { return errno; } auto dentry = readdir(kfd_node_dir); - while (dentry != nullptr) { + while (dentry != nullptr && subDirCount > 0) { + ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path + << " has subdirectory count = " << std::to_string(subDirCount); + LOG_DEBUG(ss); if (dentry->d_name[0] == '.') { dentry = readdir(kfd_node_dir); continue; @@ -823,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) { } } } // end loop for lines in property file + subDirCount--; } // end loop for mem_bank directory if (closedir(kfd_node_dir)) { @@ -863,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) { return 1; } +// /sys/class/kfd/kfd/topology/nodes/*/properties +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val) { + std::ostringstream ss; + int retVal = EINVAL; + if (property_name.empty() || val == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", property_name is empty or *val is nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if 
(KFDNodeSupported(node)) { + retVal = myNode->get_property_value(property_name, val); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for property_name = " << property_name + << " | Data (" << property_name << ") * val = " + << std::to_string(*val) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." + << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + +// /sys/class/kfd/kfd/topology/nodes/*/gpu_id +int get_gpu_id(uint32_t node, uint64_t *gpu_id) { + std::ostringstream ss; + int retVal = EINVAL; + if (gpu_id == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", gpu_id is a nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if (KFDNodeSupported(node)) { + retVal = ReadKFDGpuId(node, gpu_id); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for gpu_id" + << " | Data (gpu_id) *gpu_id = " + << std::to_string(*gpu_id) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." + << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + } // namespace smi } // namespace amd diff --git a/rocm_smi/src/rocm_smi_logger.cc b/rocm_smi/src/rocm_smi_logger.cc index 24ddd6d6f2..05bb09834a 100644 --- a/rocm_smi/src/rocm_smi_logger.cc +++ b/rocm_smi/src/rocm_smi_logger.cc @@ -55,7 +55,7 @@ * be printed, unless RSMI_LOGGING is enabled. 
* * BUFFER log type should be use while logging raw buffer or raw messages - * Having direct interface as well as C++ Singleton inface. Can use + * Having direct interface as well as C++ Singleton iface. Can use * whatever interface fits your needs. */ @@ -70,7 +70,6 @@ // Code Specific Header Files(s) #include "rocm_smi/rocm_smi_logger.h" #include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_utils.h" using namespace ROCmLogging; @@ -117,7 +116,7 @@ void Logger::logIntoFile(std::string& data) { if(!m_File.is_open()) { initialize_resources(); if (!m_File.is_open()) { - std::cout << "WARNING: re-initializing resources was unsuccessfull." + std::cout << "WARNING: re-initializing resources was unsuccessful." <<" Unable to print the following message." << std::endl; logOnConsole(data); unlock(); @@ -164,7 +163,7 @@ void Logger::error(const char* text) throw() { // By default, logging is disabled // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -198,7 +197,7 @@ void Logger::alarm(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -232,7 +231,7 @@ void Logger::always(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -270,7 +269,7 @@ void Logger::buffer(const char* text) throw() { if(!m_File.is_open()) { initialize_resources(); if (!m_File.is_open()) { - std::cout << "WARNING: re-initializing resources was unsuccessfull." + std::cout << "WARNING: re-initializing resources was unsuccessful." <<" Unable to print the following message." 
<< std::endl; std::string txtStr(text); std::cout << txtStr << std::endl; @@ -300,7 +299,7 @@ void Logger::info(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -334,7 +333,7 @@ void Logger::trace(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -368,7 +367,7 @@ void Logger::debug(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -426,7 +425,7 @@ void Logger::enableFileLogging() { // Returns a string of details on current log settings std::string Logger::getLogSettings() { - std::string logSettings = ""; + std::string logSettings; if (m_File.is_open()) { logSettings += "OpenStatus = File (" + logFileName + ") is open"; @@ -490,7 +489,7 @@ void Logger::initialize_resources() { // The check below allows us to toggle logging through RSMI_LOGGING // set or unset m_loggingIsOn = amd::smi::RocmSMI::getInstance().isLoggingOn(); - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } m_File.open(logFileName.c_str(), std::ios::out | std::ios::app); diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc index 49dec9332d..ef4e022889 100755 --- a/rocm_smi/src/rocm_smi_main.cc +++ b/rocm_smi/src/rocm_smi_main.cc @@ -39,25 +39,26 @@ * DEALINGS WITH THE SOFTWARE. 
* */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include + +#include #include -#include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include +#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_device.h" @@ -285,7 +286,8 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { // We are looking for the last element in the path that has the form // XXXX:XX:XX.X, where X is a hex integer (lower case is expected) - std::size_t slash_i, end_i; + std::size_t slash_i; + std::size_t end_i; std::string tmp; std::string tpath_str(tpath); @@ -332,9 +334,9 @@ RocmSMI::Initialize(uint64_t flags) { GetEnvVariables(); // To help debug env variable issues - // printEnvVarInfo(); + // debugRSMIEnvVarInfo(); - while (std::string(kAMDMonitorTypes[i]) != "") { + while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); ++i; } @@ -348,12 +350,12 @@ RocmSMI::Initialize(uint64_t flags) { } uint64_t bdfid; - for (uint32_t i = 0; i < devices_.size(); ++i) { - if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) { + for (auto & device : devices_) { + if (ConstructBDFID(device->path(), &bdfid) != 0) { std::cerr << "Failed to construct BDFID." 
<< std::endl; ret = 1; } else { - devices_[i]->set_bdfid(bdfid); + device->set_bdfid(bdfid); } } if (ret != 0) { @@ -389,7 +391,7 @@ RocmSMI::Initialize(uint64_t flags) { uint64_t bdfid = (*dev_iter)->bdfid(); if (tmp_map.find(bdfid) == tmp_map.end()) { ss << __PRETTY_FUNCTION__ << " | removing device = " - << (*dev_iter)->path(); + << (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid); dev_iter = devices_.erase(dev_iter); LOG_DEBUG(ss); continue; @@ -444,8 +446,7 @@ RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags), kfd_notif_evt_fh_(-1), kfd_notif_evt_fh_refcnt_(0) { } -RocmSMI::~RocmSMI() { -} +RocmSMI::~RocmSMI() = default; RocmSMI& RocmSMI::getInstance(uint64_t flags) { // Assume c++11 or greater. static objects will be created by only 1 thread @@ -494,7 +495,7 @@ static inline std::unordered_set GetEnvVarUIntegerSets( if(ev_str == nullptr) { return returnSet; } std::string stringEnv = ev_str; - if (stringEnv.empty() == false) { + if (!stringEnv.empty()) { // parse out values by commas std::string parsedVal; std::istringstream ev_str_ss(stringEnv); @@ -549,48 +550,54 @@ uint32_t RocmSMI::getLogSetting() { return this->env_vars_.logging_on; } -void RocmSMI::printEnvVarInfo(void) { - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = " - << ((env_vars_.debug_output_bitfield == 0) ? "" - : std::to_string(env_vars_.debug_output_bitfield)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = " - << ((env_vars_.path_DRM_root_override == nullptr) - ? "" : env_vars_.path_DRM_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = " - << ((env_vars_.path_HWMon_root_override == nullptr) - ? "" : env_vars_.path_HWMon_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = " - << ((env_vars_.path_power_root_override == nullptr) - ? 
"" : env_vars_.path_power_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = " - << ((env_vars_.debug_inf_loop == 0) ? "" - : std::to_string(env_vars_.debug_inf_loop)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " +void RocmSMI::debugRSMIEnvVarInfo(void) { + std::cout << __PRETTY_FUNCTION__ + << RocmSMI::getInstance().getRSMIEnvVarInfo(); +} + +std::string RocmSMI::getRSMIEnvVarInfo(void) { + std::ostringstream ss; + ss << "\n\tRSMI_DEBUG_BITFIELD = " + << ((env_vars_.debug_output_bitfield == 0) ? "" + : std::to_string(env_vars_.debug_output_bitfield)) + << std::endl; + ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = " + << ((env_vars_.path_DRM_root_override == nullptr) + ? "" : env_vars_.path_DRM_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = " + << ((env_vars_.path_HWMon_root_override == nullptr) + ? "" : env_vars_.path_HWMon_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = " + << ((env_vars_.path_power_root_override == nullptr) + ? "" : env_vars_.path_power_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_INFINITE_LOOP = " + << ((env_vars_.debug_inf_loop == 0) ? "" + : std::to_string(env_vars_.debug_inf_loop)) + << std::endl; + ss << "\tRSMI_LOGGING = " << getLogSetting() << std::endl; bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " - << (isLoggingOn ? "true" : "false") << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; + ss << "\tRSMI_LOGGING (are logs on) = " + << (isLoggingOn ? 
"TRUE" : "FALSE") << std::endl; + ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {"; if (env_vars_.enum_overrides.empty()) { - std::cout << "}" << std::endl; - return; + ss << "}" << std::endl; + return ss.str(); } for (auto it=env_vars_.enum_overrides.begin(); it != env_vars_.enum_overrides.end(); ++it) { DevInfoTypes type = static_cast(*it); - std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) - + ")"); + ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")"); auto temp_it = it; if(++temp_it != env_vars_.enum_overrides.end()) { - std::cout << ", "; + ss << ", "; } } - std::cout << "}" << std::endl; + ss << "}" << std::endl; + return ss.str(); } std::shared_ptr @@ -638,7 +645,7 @@ RocmSMI::FindMonitor(std::string monitor_path) { fs.close(); if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) { - m = std::shared_ptr(new Monitor(mon_name, &env_vars_)); + m = std::make_shared(mon_name, &env_vars_); m->setTempSensorLabelMap(); m->setVoltSensorLabelMap(); break; @@ -666,12 +673,12 @@ RocmSMI::AddToDeviceList(std::string dev_name) { dev_path += "/"; dev_path += dev_name; - auto dev = std::shared_ptr(new Device(dev_path, &env_vars_)); + auto dev = std::make_shared(dev_path, &env_vars_); std::shared_ptr m = FindMonitor(dev_path + "/device/hwmon"); dev->set_monitor(m); - std::string d_name = dev_name; + const std::string& d_name = dev_name; uint32_t card_indx = GetDeviceIndex(d_name); dev->set_drm_render_minor(GetDrmRenderMinor(dev_path)); dev->set_card_index(card_indx); @@ -682,8 +689,6 @@ RocmSMI::AddToDeviceList(std::string dev_name) { << dev_name << " | path = " << dev_path << " | card index = " << std::to_string(card_indx) << " | "; LOG_DEBUG(ss); - - return; } static const uint32_t kAmdGpuId = 0x1002; @@ -694,8 +699,7 @@ static bool isAMDGPU(std::string dev_path) { std::string vend_path = dev_path + "/device/vendor"; if (!FileExists(vend_path.c_str())) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " 
<< (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -705,8 +709,7 @@ static bool isAMDGPU(std::string dev_path) { if (!fs.is_open()) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -721,8 +724,7 @@ static bool isAMDGPU(std::string dev_path) { isAmdGpu = true; } ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -730,6 +732,7 @@ static bool isAMDGPU(std::string dev_path) { uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { std::string err_msg; uint32_t count = 0; + std::ostringstream ss; // If this gets called more than once, clear previous findings. 
devices_.clear(); @@ -756,17 +759,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { } dentry = readdir(drm_dir); } + ss << __PRETTY_FUNCTION__ << " | Discovered a potential of " + << std::to_string(count) << " cards" << " | "; + LOG_DEBUG(ss); + struct systemNode { + uint32_t s_node_id = 0; + uint64_t s_gpu_id = 0; + uint64_t s_unique_id = 0; + }; + // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id} + std::multimap allSystemNodes; + uint32_t node_id = 0; + while (true) { + uint64_t gpu_id = 0, unique_id = 0; + int ret_gpu_id = get_gpu_id(node_id, &gpu_id); + int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); + if (ret_gpu_id == 0 || ret_unique_id == 0) { + systemNode myNode; + myNode.s_node_id = node_id; + myNode.s_gpu_id = gpu_id; + myNode.s_unique_id = unique_id; + if(gpu_id != 0) { // only add gpu nodes, 0 = CPU + allSystemNodes.emplace(unique_id, myNode); + } + } else { + break; + } + node_id++; + } + + ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {"; + for(auto i: allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], " + ; + } + ss << "}"; + LOG_DEBUG(ss); + + // Discover all root cards & gpu partitions associated with each for (uint32_t node_id = 0; node_id < count; node_id++) { std::string path = kPathDRMRoot; path += "/card"; path += std::to_string(node_id); + uint64_t primary_unique_id = 0; + + // each identified gpu card node is a primary node for + // potential matching unique ids if (isAMDGPU(path) || (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { std::string d_name = "card"; d_name += std::to_string(node_id); AddToDeviceList(d_name); - } + + ss << __PRETTY_FUNCTION__ + << " | Ordered system nodes seen in lookup = {"; + for (auto i : allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << 
std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], "; + } + ss << "}"; + LOG_DEBUG(ss); + + uint64_t temp_primary_unique_id = 0; + if (allSystemNodes.empty()) { + continue; + } + + // get lowest key 1st to keep order of nodes matching card + uint32_t lowest_NodeId = 0; + uint32_t curr_NodeId = 0; + + for (auto it = allSystemNodes.begin(), end = allSystemNodes.end(); + it != end; it = allSystemNodes.upper_bound(it->first)) { + curr_NodeId = it->second.s_node_id; + if (it == allSystemNodes.begin()) { + lowest_NodeId = it->second.s_node_id; + } + if (curr_NodeId <= lowest_NodeId) { + lowest_NodeId = curr_NodeId; + temp_primary_unique_id = it->second.s_unique_id; + } + } + ss << __PRETTY_FUNCTION__ + << " | lowest_NodeId = " << std::to_string(lowest_NodeId) + << " | curr_NodeId = " << std::to_string(curr_NodeId) + << " | temp_primary_unique_id = " + << std::to_string(temp_primary_unique_id); + LOG_DEBUG(ss); + + if (temp_primary_unique_id != 0) { + primary_unique_id = temp_primary_unique_id; + } else { + allSystemNodes.erase(primary_unique_id); + continue; + } + + auto numb_nodes = allSystemNodes.count(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = " + << std::to_string(primary_unique_id) << " has " + << std::to_string(numb_nodes) << " known gpu nodes"; + LOG_DEBUG(ss); + while (numb_nodes > 1) { + std::string secNode = "card"; + secNode += std::to_string(node_id); // add the primary node id + AddToDeviceList(secNode); + numb_nodes--; + } + // remove already added nodes associated with current card + auto erasedNodes = allSystemNodes.erase(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = " + << std::to_string(primary_unique_id) << " erased " + << std::to_string(erasedNodes) << " nodes"; + LOG_DEBUG(ss); + } } if (closedir(drm_dir)) { @@ -790,7 +901,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { power_mons_.clear(); } - if 
(power_mons_.size() != 0) { + if (!power_mons_.empty()) { return 0; } @@ -818,7 +929,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { if (FileExists(tmp.c_str())) { std::shared_ptr mon = - std::shared_ptr(new PowerMon(mon_name, &env_vars_)); + std::make_shared(mon_name, &env_vars_); power_mons_.push_back(mon); mon->set_dev_index(GetDeviceIndex(dentry->d_name)); } @@ -831,8 +942,8 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { return errno; } - for (auto m : power_mons_) { - for (auto d : devices_) { + for (const auto& m : power_mons_) { + for (const auto& d : devices_) { if (m->dev_index() == d->index()) { d->set_power_monitor(m); break; diff --git a/rocm_smi/src/rocm_smi_monitor.cc b/rocm_smi/src/rocm_smi_monitor.cc index 982c660447..c4f94284a6 100755 --- a/rocm_smi/src/rocm_smi_monitor.cc +++ b/rocm_smi/src/rocm_smi_monitor.cc @@ -41,19 +41,18 @@ * */ -#include #include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include #include // NOLINT +#include #include -#include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" @@ -286,8 +285,7 @@ static const std::map kMonFuncDependsMap = { env_ = nullptr; #endif } -Monitor::~Monitor(void) { -} +Monitor::~Monitor(void) = default; std::string Monitor::MakeMonitorPath(MonitorTypes type, uint32_t sensor_id) { @@ -339,7 +337,7 @@ Monitor::setTempSensorLabelMap(void) { std::string type_str; int ret; - if (temp_type_index_map_.size() > 0) { + if (!temp_type_index_map_.empty()) { return 0; // We've already filled in the map } auto add_temp_sensor_entry = [&](uint32_t file_index) { @@ -377,7 +375,7 @@ Monitor::setVoltSensorLabelMap(void) { std::string type_str; int ret; - if (volt_type_index_map_.size() > 0) { + if (!volt_type_index_map_.empty()) { return 0; // We've already filled in the map } auto add_volt_sensor_entry = [&](uint32_t file_index) { @@ 
-513,10 +511,10 @@ typedef enum { static monitor_types getFuncType(std::string f_name) { monitor_types ret = eDefaultMonitor; - if (f_name.compare("rsmi_dev_temp_metric_get") == 0) { + if (f_name == "rsmi_dev_temp_metric_get") { ret = eTempMonitor; } - if (f_name.compare("rsmi_dev_volt_metric_get") == 0) { + if (f_name == "rsmi_dev_volt_metric_get") { ret = eVoltMonitor; } return ret; @@ -617,22 +615,22 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { } else { supported_monitors = intersect; } - if (supported_monitors.size() > 0) { - for (uint32_t i = 0; i < supported_monitors.size(); ++i) { + if (!supported_monitors.empty()) { + for (unsigned long & supported_monitor : supported_monitors) { if (m_type == eDefaultMonitor) { - assert(supported_monitors[i] > 0); - supported_monitors[i] |= - (supported_monitors[i] - 1) << MONITOR_TYPE_BIT_POSITION; + assert(supported_monitor > 0); + supported_monitor |= + (supported_monitor - 1) << MONITOR_TYPE_BIT_POSITION; } else if (m_type == eTempMonitor) { // Temp sensor file names are 1-based - assert(supported_monitors[i] > 0); - supported_monitors[i] |= - static_cast(getTempSensorEnum(supported_monitors[i])) + assert(supported_monitor > 0); + supported_monitor |= + static_cast(getTempSensorEnum(supported_monitor)) << MONITOR_TYPE_BIT_POSITION; } else if (m_type == eVoltMonitor) { // Voltage sensor file names are 0-based - supported_monitors[i] |= - static_cast(getVoltSensorEnum(supported_monitors[i])) + supported_monitor |= + static_cast(getVoltSensorEnum(supported_monitor)) << MONITOR_TYPE_BIT_POSITION; } else { assert(false); // Unexpected monitor type @@ -643,10 +641,10 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { } } - if (it->second.variants.size() == 0) { + if (it->second.variants.empty()) { (*supported_funcs)[it->first] = nullptr; supported_variants = nullptr; // Invoke destructor - } else if ((*supported_variants).size() > 0) { + } else if 
(!(*supported_variants).empty()) { (*supported_funcs)[it->first] = supported_variants; } diff --git a/rocm_smi/src/rocm_smi_power_mon.cc b/rocm_smi/src/rocm_smi_power_mon.cc index 3e1d7e0d45..454851651b 100755 --- a/rocm_smi/src/rocm_smi_power_mon.cc +++ b/rocm_smi/src/rocm_smi_power_mon.cc @@ -41,17 +41,14 @@ * */ -#include - -#include -#include +#include #include -#include +#include #include +#include #include +#include -#include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi_exception.h" @@ -70,8 +67,7 @@ static const std::map kMonitorNameMap = { PowerMon::PowerMon(std::string path, RocmSMI_env_vars const *e) : path_(path), env_(e) { } -PowerMon::~PowerMon(void) { -} +PowerMon::~PowerMon(void) = default; static int parse_power_str(std::string s, PowerMonTypes type, uint64_t *val) { std::stringstream ss(s); diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index 582ed39703..796244f4d6 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -40,27 +40,28 @@ * DEALINGS WITH THE SOFTWARE. 
* */ -#include -#include -#include -#include +#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see + // _GNU_SOURCE functions which check #include +#include #include +#include #include +#include -#include -#include -#include +#include +#include +#include #include +#include +#include #include #include -#include -#include -#include -#include #include -#include +#include +#include #include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -138,7 +139,7 @@ std::vector globFilesExist(const std::string& filePattern) { glob_t result_glob; memset(&result_glob, 0, sizeof(result_glob)); - if (glob(filePattern.c_str(), GLOB_TILDE, NULL, &result_glob) != 0) { + if (glob(filePattern.c_str(), GLOB_TILDE, nullptr, &result_glob) != 0) { globfree(&result_glob); // Leaving below to help debug issues discovering future glob file searches // debugFilesDiscovered(fileNames); @@ -146,7 +147,7 @@ std::vector globFilesExist(const std::string& filePattern) { } for(size_t i = 0; i < result_glob.gl_pathc; ++i) { - fileNames.push_back(std::string(result_glob.gl_pathv[i])); + fileNames.emplace_back(result_glob.gl_pathv[i]); } globfree(&result_glob); @@ -159,17 +160,26 @@ int isRegularFile(std::string fname, bool *is_reg) { struct stat file_stat; int ret; - assert(is_reg != nullptr); - ret = stat(fname.c_str(), &file_stat); if (ret) { return errno; } - *is_reg = S_ISREG(file_stat.st_mode); + + if (is_reg != nullptr) { + *is_reg = S_ISREG(file_stat.st_mode); + } + return 0; } int WriteSysfsStr(std::string path, std::string val) { + // On success, zero is returned. On error, -1 is returned, and + // errno is set to indicate the error. 
+ auto is_regular_file_result = isRegularFile(path, nullptr); + if (is_regular_file_result != 0) { + return ENOENT; + } + std::ofstream fs; int ret = 0; std::ostringstream ss; @@ -196,6 +206,13 @@ int WriteSysfsStr(std::string path, std::string val) { } int ReadSysfsStr(std::string path, std::string *retStr) { + // On success, zero is returned. On error, -1 is returned, and + // errno is set to indicate the error. + auto is_regular_file_result = isRegularFile(path, nullptr); + if (is_regular_file_result != 0) { + return ENOENT; + } + std::stringstream ss; int ret = 0; std::ostringstream oss; @@ -381,7 +398,7 @@ std::string removeString(const std::string origStr, // defaults to trim stdOut std::pair executeCommand(std::string command, bool stdOut) { char buffer[128]; - std::string stdoutAndErr = ""; + std::string stdoutAndErr; bool successfulRun = true; command = "stdbuf -i0 -o0 -e0 " + command; // remove stdOut and err buffering @@ -411,14 +428,10 @@ std::pair executeCommand(std::string command, bool stdOut) { return std::make_pair(successfulRun, stdoutAndErr); } -// originalstring - string to search for substring +// originalString - string to search for substring // substring - string looking to find bool containsString(std::string originalString, std::string substring) { - if (originalString.find(substring) != std::string::npos) { - return true; - } else { - return false; - } + return (originalString.find(substring) != std::string::npos); } // Creates and stores supplied data into a temporary file (within /tmp/). 
@@ -429,9 +442,9 @@ bool containsString(std::string originalString, std::string substring) { // https://man7.org/linux/man-pages/man3/mkstemp.3.html // // Temporary file name format: -// ___ +// ___ // - prefix for our application's identifier (see kTmpFilePrefix) -// - name of parameter being stored +// - name of parameter being stored // - state at which the stored value captures // - device identifier // @@ -466,9 +479,8 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, close(fd); if (rc_write == -1) { return RSMI_STATUS_FILE_ERROR; - } else { - return RSMI_STATUS_SUCCESS; } + return RSMI_STATUS_SUCCESS; } std::vector getListOfAppTmpFiles() { @@ -477,16 +489,18 @@ std::vector getListOfAppTmpFiles() { struct dirent *ent; std::vector tmpFiles; - if ((dir = opendir(path.c_str())) != nullptr) { - // captures all files & directories under specified path - while ((ent = readdir(dir)) != nullptr) { - std::string fileDirName = ent->d_name; - // we only want our app specific files - if (containsString(fileDirName, kTmpFilePrefix)) { - tmpFiles.emplace_back(path + "/" + fileDirName); - } else { - continue; - } + dir = opendir(path.c_str()); + if (dir == nullptr) { + return tmpFiles; + } + // captures all files & directories under specified path + while ((ent = readdir(dir)) != nullptr) { + std::string fileDirName = ent->d_name; + // we only want our app specific files + if (containsString(fileDirName, kTmpFilePrefix)) { + tmpFiles.emplace_back(path + "/" + fileDirName); + } else { + continue; } } return tmpFiles; @@ -515,7 +529,7 @@ std::vector readEntireFile(std::string path) { std::string line; while (std::getline(inFileStream, line)) { std::istringstream ss(line); - if(line.size() > 0) { + if (!line.empty()) { fileContent.push_back(line); } } @@ -527,7 +541,7 @@ std::vector readEntireFile(std::string path) { // and their content void displayAppTmpFilesContent() { std::vector tmpFiles = getListOfAppTmpFiles(); - if (tmpFiles.empty() == false) { + 
if (!tmpFiles.empty()) { for (auto &x: tmpFiles) { std::string out = readFile(x); std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x @@ -543,7 +557,7 @@ void displayAppTmpFilesContent() { std::string debugVectorContent(std::vector v) { std::ostringstream ss; ss << "Vector = {"; - if (v.size() > 0) { + if (!v.empty()) { for (auto it=v.begin(); it < v.end(); it++) { ss << *it; auto temp_it = it; @@ -561,7 +575,7 @@ std::string debugVectorContent(std::vector v) { std::string displayAllDevicePaths(std::vector> v) { std::ostringstream ss; ss << "Vector = {"; - if (v.size() > 0) { + if (!v.empty()) { for (auto it=v.begin(); it < v.end(); it++) { ss << (*it)->path(); auto temp_it = it; @@ -576,7 +590,7 @@ std::string displayAllDevicePaths(std::vector> v) { } // Attempts to read application specific temporary file -// This method is to be used for reading (or determing if it exists), +// This method is to be used for reading (or determining if it exists), // in order to keep file naming scheme consistent. 
// // dv_ind - device index @@ -594,7 +608,7 @@ std::tuple readTmpFile(uint32_t dv_ind, "_" + std::to_string(dv_ind); std::string fileContent; std::vector tmpFiles = getListOfAppTmpFiles(); - if (tmpFiles.empty() == false) { + if (!tmpFiles.empty()) { for (auto &x: tmpFiles) { if (containsString(x, tmpFileName)) { fileContent = readFile(x); @@ -629,15 +643,23 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) // Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void) { struct utsname buf; bool errorDetected = false; std::string temp_data; - std::string sysname, nodename, release, version, machine; + std::string sysname; + std::string nodename; + std::string release; + std::string version; + std::string machine; std::string domainName = ""; std::string os_distribution = ""; std::string endianness = ""; + std::string rocm_lib_path = ""; + std::string rocm_build_type = ""; + std::string rocm_env_variables = ""; if (uname(&buf) < 0) { errorDetected = true; @@ -654,7 +676,7 @@ std::tuple fileContent = readEntireFile(filePath); for (auto &line: fileContent) { if (line.find("PRETTY_NAME=") != std::string::npos) { @@ -672,9 +694,13 @@ std::tupled_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) { + continue; + } + + if (fstatat(dirfd(srcdir), dent->d_name, &st, 0) < 0) { + perror(dent->d_name); + continue; + } + + if (S_ISDIR(st.st_mode)) { + dir_count++; + } + } + closedir(srcdir); + return dir_count; +} } // namespace smi } // namespace amd diff --git a/src/rocm_smi_properties.cc b/src/rocm_smi_properties.cc index 0e606e6874..d73f974286 100644 --- a/src/rocm_smi_properties.cc +++ b/src/rocm_smi_properties.cc @@ -90,7 +90,6 @@ AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) { 
static_cast(AMDGpuPropertyTypesOffset_t::kClkTypes) | static_cast(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes); - auto property_type_offset = (static_cast(property_type_offset_mask) & (property_id)); auto property_type_id = (static_cast(property_id) & ~(property_type_offset_mask)); return property_type_id; @@ -167,6 +166,7 @@ const AMDGpuVerbList_t amdgpu_verb_check_list { { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" } }; +const uint16_t kDevIDAll(0xFFFF); const uint16_t kDevRevIDAll(0xFFFF); const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { // @@ -177,6 +177,14 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { // rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set; // + // AMD All Families + {kDevIDAll, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanCntrlEnable), + AMDGpuVerbTypes_t::kResetGpuFan, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + // AMD Instinct MI210 {0x740F, {0x02, make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, @@ -240,12 +248,6 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, AMDGpuPropertyOpModeTypes_t::kBoth, false } }, - {0x74A1, {kDevRevIDAll, - make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, - DevInfoTypes::kDevGpuReset), - AMDGpuVerbTypes_t::kResetGpu, - AMDGpuPropertyOpModeTypes_t::kSrIov, false } - }, {0x74A1, {kDevRevIDAll, make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM), @@ -351,7 +353,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT // likely the reinforcement table does not contain any entries/rules for the // dev_id in question. 
// - auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) { + auto amdgpu_property_query_result_hdlr = [&](const rsmi_status_t query_result) { switch (query_result) { case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR): case (rsmi_status_t::RSMI_STATUS_NO_DATA): @@ -364,7 +366,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT break; default: - return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + return actual_error_code; break; } }; @@ -416,7 +418,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx std::ostringstream osstream; auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); - AMDGpuPropertyQuery_t amdgpu_property_query = [&]() { + auto amdgpu_property_query = [&]() { AMDGpuPropertyQuery_t amdgpu_property_query_init{}; amdgpu_property_query_init.m_asic_id = 0; amdgpu_property_query_init.m_pci_rev_id = 0; @@ -435,7 +437,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id); } } - is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? true : false; + is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS); return tmp_amdgpu_query; }; @@ -446,6 +448,18 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx LOG_TRACE(osstream); bool is_proper_query(false); + + // Generic filter for checking properties for all asics and revisions. 
+ auto amdgpu_property_query_all_asics = amdgpu_property_query; + amdgpu_property_query_all_asics.m_asic_id = kDevIDAll; + amdgpu_property_query_all_asics.m_pci_rev_id = kDevRevIDAll; + auto amdgpu_property_query_result = run_amdgpu_property_reinforcement_query(amdgpu_property_query_all_asics); + // We found a generic entry for all asics and revisions + if (amdgpu_property_query_result != rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR) { + return amdgpu_property_query_result; + } + + // If no generic entry, then we query for specific asic and revision ids. amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query); if (!is_proper_query) { rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA; @@ -475,13 +489,6 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end()); }; - auto ends_with = [](const std::string& value, const std::string& ending) { - if (value.size() < ending.size()) { - return false; - } - return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); - }; - // Traverse through all values for a given key osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; LOG_TRACE(osstream); @@ -495,7 +502,7 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n"; // Pci_rev_id matches the filter or ALL Revisions if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) || - (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { + (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n"; // Do we have the property we are looking for? 
if (((amdgpu_property_query.m_property != 0) && diff --git a/tests/amd_smi_test/functional/frequencies_read.cc b/tests/amd_smi_test/functional/frequencies_read.cc index 0cb51149e5..eae1c7abeb 100755 --- a/tests/amd_smi_test/functional/frequencies_read.cc +++ b/tests/amd_smi_test/functional/frequencies_read.cc @@ -126,16 +126,20 @@ void TestFrequenciesRead::Run(void) { } else if (err == AMDSMI_STATUS_NOT_YET_IMPLEMENTED) { std::cout << "\t**Get " << name << ": Not implemented on this machine" << std::endl; + // special driver issue, shouldn't normally occur + } else if (err == AMDSMI_STATUS_UNEXPECTED_DATA) { + std::cerr << "WARN: Clock file [" << FreqEnumToStr(t) << "] exists on device [" << i << "] but empty!" << std::endl; + std::cerr << " Likely a driver issue!" << std::endl; } else { - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Supported " << name << " clock frequencies: "; - std::cout << f.num_supported << std::endl; - print_frequencies(&f); - // Verify api support checking functionality is working - err = amdsmi_get_clk_freq(processor_handles_[i], t, nullptr); - ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - } + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Supported " << name << " clock frequencies: "; + std::cout << f.num_supported << std::endl; + print_frequencies(&f); + // Verify api support checking functionality is working + err = amdsmi_get_clk_freq(processor_handles_[i], t, nullptr); + ASSERT_EQ(err, AMDSMI_STATUS_INVAL); + } } }; diff --git a/tests/amd_smi_test/functional/frequencies_read_write.cc b/tests/amd_smi_test/functional/frequencies_read_write.cc index 1274957b2c..b80f700fb1 100755 --- a/tests/amd_smi_test/functional/frequencies_read_write.cc +++ b/tests/amd_smi_test/functional/frequencies_read_write.cc @@ -104,8 +104,7 @@ void TestFrequenciesReadWrite::Run(void) { for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { PrintDeviceHeader(processor_handles_[dv_ind]); - for (uint32_t clk = 
(uint32_t)CLK_TYPE_FIRST; - clk <= CLK_TYPE__MAX; ++clk) { + for (uint32_t clk = CLK_TYPE_FIRST; clk <= CLK_TYPE__MAX; ++clk) { amdsmi_clk = (amdsmi_clk_type_t)clk; auto freq_read = [&]() -> bool { @@ -121,14 +120,20 @@ void TestFrequenciesReadWrite::Run(void) { std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) << ": Not supported on this machine" << std::endl; return false; - } else { - // CHK_ERR_ASRT(ret) - IF_VERB(STANDARD) { - std::cout << "Initial frequency for clock " << - FreqEnumToStr(amdsmi_clk) << " is " << f.current << std::endl; - } - return true; } + + // special driver issue, shouldn't normally occur + if (ret == AMDSMI_STATUS_UNEXPECTED_DATA) { + std::cerr << "WARN: Clock file [" << FreqEnumToStr(amdsmi_clk) << "] exists on device [" << dv_ind << "] but empty!" << std::endl; + std::cerr << " Likely a driver issue!" << std::endl; + } + + // CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "Initial frequency for clock " << + FreqEnumToStr(amdsmi_clk) << " is " << f.current << std::endl; + } + return true; }; auto freq_write = [&]() { @@ -152,19 +157,18 @@ void TestFrequenciesReadWrite::Run(void) { std::endl; } ret = amdsmi_set_clk_freq(processor_handles_[dv_ind], amdsmi_clk, freq_bitmask); - //Certain ASICs does not allow to set particular clocks. If set function for a clock returns - //permission error despite root access, manually set ret value to success and return - if (ret == AMDSMI_STATUS_NO_PERM && geteuid() == 0) { - std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) << - ": Not supported on this machine. Skipping..." << std::endl; - ret = AMDSMI_STATUS_SUCCESS; - return; - } else if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + // Certain ASICs does not allow to set particular clocks. 
If set function for a clock returns + // permission error despite root access, manually set ret value to success and return + // + // Sometimes setting clock frequencies is completely not supported + if ((ret == AMDSMI_STATUS_NO_PERM && geteuid() == 0) || + (ret == AMDSMI_STATUS_NOT_SUPPORTED)) { std::cout << "\t**Set " << FreqEnumToStr(amdsmi_clk) << ": Not supported on this machine. Skipping..." << std::endl; ret = AMDSMI_STATUS_SUCCESS; return; } + CHK_ERR_ASRT(ret) ret = amdsmi_get_clk_freq(processor_handles_[dv_ind], amdsmi_clk, &f); if (ret != AMDSMI_STATUS_SUCCESS) { @@ -187,7 +191,9 @@ void TestFrequenciesReadWrite::Run(void) { } ret = amdsmi_set_gpu_perf_level(processor_handles_[dv_ind], AMDSMI_DEV_PERF_LEVEL_AUTO); - if (ret != AMDSMI_STATUS_SUCCESS) { + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Setting performance level is not supported on this machine. Skipping..." << std::endl; + ret = AMDSMI_STATUS_SUCCESS; return; } }; @@ -199,44 +205,6 @@ void TestFrequenciesReadWrite::Run(void) { } freq_write(); CHK_ERR_ASRT(ret) -#if 0 - ret = amdsmi_get_clk_freq(dv_ind, amdsmi_clk, &f); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "Initial frequency for clock " << amdsmi_clk << " is " << - f.current << std::endl; - } - // Set clocks to something other than the usual default of the lowest - // frequency. - freq_bitmask = 0b01100; // Try the 3rd and 4th clocks - - std::string freq_bm_str = - std::bitset(freq_bitmask).to_string(); - - freq_bm_str.erase(0, std::min(freq_bm_str.find_first_not_of('0'), - freq_bm_str.size()-1)); - - IF_VERB(STANDARD) { - std::cout << "Setting frequency mask for clock " << amdsmi_clk << - " to 0b" << freq_bm_str << " ..." 
<< std::endl; - } - ret = amdsmi_set_clk_freq(dv_ind, amdsmi_clk, freq_bitmask); - CHK_ERR_ASRT(ret) - - ret = amdsmi_get_clk_freq(dv_ind, amdsmi_clk, &f); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "Frequency is now index " << f.current << std::endl; - std::cout << "Resetting mask to all frequencies." << std::endl; - } - ret = amdsmi_set_clk_freq(dv_ind, amdsmi_clk, 0xFFFFFFFF); - CHK_ERR_ASRT(ret) - - ret = amdsmi_set_gpu_perf_level(dv_ind, AMDSMI_DEV_PERF_LEVEL_AUTO); - CHK_ERR_ASRT(ret) -#endif } } } diff --git a/tests/amd_smi_test/functional/gpu_metrics_read.cc b/tests/amd_smi_test/functional/gpu_metrics_read.cc index 4bfadfca10..0c6ffa1a12 100644 --- a/tests/amd_smi_test/functional/gpu_metrics_read.cc +++ b/tests/amd_smi_test/functional/gpu_metrics_read.cc @@ -177,8 +177,8 @@ void TestGpuMetricsRead::Run(void) { << std::to_string(smu.pcie_link_speed) << '\n'; std::cout << "gfx_activity_acc=" << std::dec << smu.gfx_activity_acc << '\n'; - std::cout << "mem_actvity_acc=" - << std::dec << smu.mem_actvity_acc << '\n'; + std::cout << "mem_activity_acc=" + << std::dec << smu.mem_activity_acc << '\n'; for (int i = 0; i < AMDSMI_NUM_HBM_INSTANCES; ++i) { std::cout << "temperature_hbm[" << i << "]=" << std::dec << diff --git a/tests/amd_smi_test/functional/power_cap_read_write.cc b/tests/amd_smi_test/functional/power_cap_read_write.cc index f6fd038ba4..7672afb93e 100755 --- a/tests/amd_smi_test/functional/power_cap_read_write.cc +++ b/tests/amd_smi_test/functional/power_cap_read_write.cc @@ -112,6 +112,13 @@ void TestPowerCapReadWrite::Run(void) { max = info.max_power_cap; orig = info.default_power_cap; + // Check if power cap is within the range + // skip the test otherwise + if (orig < min || orig > max) { + std::cout << "Power cap is not within the range. 
Skipping test for " << dv_ind << std::endl; + continue; + } + new_cap = (max + min)/2; IF_VERB(STANDARD) { diff --git a/tests/amd_smi_test/functional/power_read_write.cc b/tests/amd_smi_test/functional/power_read_write.cc index f341c7c069..16e006a57c 100755 --- a/tests/amd_smi_test/functional/power_read_write.cc +++ b/tests/amd_smi_test/functional/power_read_write.cc @@ -126,6 +126,10 @@ void TestPowerReadWrite::Run(void) { if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { std::cout << "The power profile presets settings is not supported. " << std::endl; + + // Verify api support checking functionality is working + ret = amdsmi_get_gpu_power_profile_presets(processor_handles_[dv_ind], 0, nullptr); + ASSERT_EQ(ret, AMDSMI_STATUS_NOT_SUPPORTED); continue; } CHK_ERR_ASRT(ret) diff --git a/tests/amd_smi_test/main.cc b/tests/amd_smi_test/main.cc index 791f38120f..c4e1ced1d1 100644 --- a/tests/amd_smi_test/main.cc +++ b/tests/amd_smi_test/main.cc @@ -106,14 +106,12 @@ static void RunCustomTestProlog(TestBase *test) { } test->SetUp(); test->Run(); - return; } static void RunCustomTestEpilog(TestBase *tst) { if (sRSMIGlvalues->verbosity >= TestBase::VERBOSE_STANDARD) { tst->DisplayResults(); } tst->Close(); - return; } // If the test case one big test, you should use RunGenericTest() @@ -125,7 +123,6 @@ static void RunCustomTestEpilog(TestBase *tst) { static void RunGenericTest(TestBase *test) { RunCustomTestProlog(test); RunCustomTestEpilog(test); - return; } diff --git a/tests/amd_smi_test/rsmitst.exclude b/tests/amd_smi_test/rsmitst.exclude index 87a50ffaea..adc042312a 100644 --- a/tests/amd_smi_test/rsmitst.exclude +++ b/tests/amd_smi_test/rsmitst.exclude @@ -63,10 +63,7 @@ $BLACKLIST_ALL_ASICS\ # /sys/class/kfd/kfd/topology/nodes/*/properties FILTER[90400]=\ $BLACKLIST_ALL_ASICS\ -"rsmitstReadOnly.TestVoltCurvRead:"\ -"rsmitstReadOnly.TestFrequenciesRead:"\ -"rsmitstReadWrite.TestFrequenciesReadWrite:"\ -"rsmitstReadWrite.TestPowerReadWrite" 
+"rsmitstReadOnly.TestVoltCurvRead" FILTER[90401]=${FILTER[90400]} FILTER[90402]=${FILTER[90400]} diff --git a/tests/amd_smi_test/test_base.cc b/tests/amd_smi_test/test_base.cc index 3069c102dc..a242711e5e 100644 --- a/tests/amd_smi_test/test_base.cc +++ b/tests/amd_smi_test/test_base.cc @@ -43,7 +43,7 @@ * */ -#include +#include #include "amd_smi/amdsmi.h" #include "test_base.h" @@ -61,10 +61,9 @@ static const char kResultsLabel[] = "TEST RESULTS"; // This one is used outside this file const char kSetupLabel[] = "TEST SETUP"; -TestBase::TestBase() : setup_failed_(false), description_("") { -} -TestBase::~TestBase() { +TestBase::TestBase() : setup_failed_(false) { } +TestBase::~TestBase() = default; void TestBase::MakeHeaderStr(const char *inStr, std::string *outStr) const { @@ -155,8 +154,6 @@ void TestBase::SetUp(uint64_t init_flags) { std::cout << "No AMD SMI tests can be run." << std::endl; } } - - return; } void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { @@ -254,7 +251,7 @@ void TestBase::set_description(std::string d) { size_t endlptr; for (size_t i = le; i < description_.size(); i += le) { - endlptr = description_.find_last_of(" ", i); + endlptr = description_.find_last_of(' ', i); description_.replace(endlptr, 1, "\n"); i = endlptr; } diff --git a/tests/amd_smi_test/test_base.h b/tests/amd_smi_test/test_base.h index 104e450db4..ffd6a55116 100644 --- a/tests/amd_smi_test/test_base.h +++ b/tests/amd_smi_test/test_base.h @@ -45,6 +45,7 @@ #ifndef TESTS_AMD_SMI_TEST_TEST_BASE_H_ #define TESTS_AMD_SMI_TEST_TEST_BASE_H_ +#include #include #include #include "amd_smi/amdsmi.h" @@ -150,9 +151,8 @@ class TestBase { "\t===> Abort is over-ridden due to dont_fail command line option." 
\ << std::endl; \ return; \ - } else { \ - ASSERT_EQ(AMDSMI_STATUS_SUCCESS, (RET)); \ } \ + ASSERT_EQ(AMDSMI_STATUS_SUCCESS, (RET)); \ } void MakeHeaderStr(const char *inStr, std::string *outStr); diff --git a/tests/amd_smi_test/test_common.cc b/tests/amd_smi_test/test_common.cc index d880c109c6..7237e4cc89 100644 --- a/tests/amd_smi_test/test_common.cc +++ b/tests/amd_smi_test/test_common.cc @@ -43,13 +43,13 @@ * */ -#include -#include #include +#include +#include #include -#include #include +#include #include "test_base.h" #include "test_common.h" diff --git a/tests/amd_smi_test/test_common.h b/tests/amd_smi_test/test_common.h index 5a2aa6d9fc..ba00fad2ce 100644 --- a/tests/amd_smi_test/test_common.h +++ b/tests/amd_smi_test/test_common.h @@ -74,7 +74,7 @@ void DumpMonitorInfo(const TestBase *test); #endif #define DISPLAY_AMDSMI_ERR(RET) { \ - if (RET != AMDSMI_STATUS_SUCCESS) { \ + if ((RET) != AMDSMI_STATUS_SUCCESS) { \ const char *err_str; \ std::cout << "\t===> ERROR: AMDSMI call returned " << (RET) << std::endl; \ amdsmi_status_code_to_string((RET), &err_str); \ @@ -91,7 +91,7 @@ void DumpMonitorInfo(const TestBase *test); } \ } #define CHK_AMDSMI_PERM_ERR(RET) { \ - if (RET == AMDSMI_STATUS_NO_PERM) { \ + if ((RET) == AMDSMI_STATUS_NO_PERM) { \ std::cout << "This command requires root access." << std::endl; \ } else { \ DISPLAY_AMDSMI_ERR(RET) \