From 328ce0150bd496cb0549f4701bf006ba7e3e81aa Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Mon, 28 Aug 2023 20:08:13 -0500 Subject: [PATCH 01/19] rocm_smi_lib: Fix rocm-smi --showfan shows 'unable to detect fan' Code changes related to the following: * Reverts earlier fix for the same issue * Check for existence of files before reading Change-Id: I175b20c3343c414b12b79dc3fc404f53fbaabf3a Signed-off-by: Oliveira, Daniel --- python_smi_tools/rocm_smi.py | 8 ++------ src/rocm_smi_device.cc | 4 ++-- src/rocm_smi_utils.cc | 15 ++++++++++++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 4771a29f8f..d0bb5ab365 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -785,12 +785,8 @@ def resetFans(deviceList): for device in deviceList: sensor_ind = c_uint32(0) ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind) - if (ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED) or (ret == rsmi_status_t.RSMI_STATUS_PERMISSION): - if not rsmi_ret_ok(rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED, device, 'reset_fan'): - continue - else: - if rsmi_ret_ok(ret, device, 'reset_fan'): - printLog(device, 'Successfully reset fan speed to driver control', None) + if rsmi_ret_ok(ret, device, 'reset_fan'): + printLog(device, 'Successfully reset fan speed to driver control', None) printLogSpacer() diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index f859a9812e..3830d602aa 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -593,7 +593,6 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { bool reg_file; int ret = isRegularFile(sysfs_path, &reg_file); - if (ret != 0) { ss << "File did not exist - SYSFS file (" << sysfs_path << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) @@ -804,7 +803,8 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, FILE *ptr; sysfs_path += "/device/"; sysfs_path += 
kDevAttribNameMap.at(type); - ptr = fopen(sysfs_path.c_str(), "rb"); + + ptr = fopen(sysfs_path.c_str(), "rb"); if (!ptr) { ss << "Could not read DevInfoBinary for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")" diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 670d90faec..5009d3bd14 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -158,13 +158,15 @@ int isRegularFile(std::string fname, bool *is_reg) { struct stat file_stat; int ret; - assert(is_reg != nullptr); - ret = stat(fname.c_str(), &file_stat); if (ret) { return errno; } - *is_reg = S_ISREG(file_stat.st_mode); + + if (is_reg != nullptr) { + *is_reg = S_ISREG(file_stat.st_mode); + } + return 0; } @@ -192,6 +194,13 @@ int WriteSysfsStr(std::string path, std::string val) { } int ReadSysfsStr(std::string path, std::string *retStr) { + // On success, zero is returned. On error, -1 is returned, and + // errno is set to indicate the error. + auto is_regular_file_result = isRegularFile(path, nullptr); + if (is_regular_file_result != 0) { + return ENOENT; + } + std::stringstream ss; int ret = 0; std::ostringstream oss; From fab0542ab16b7378c523e1038e23ca1b22b9b91c Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 6 Sep 2023 13:16:52 -0500 Subject: [PATCH 02/19] Fix doxygen warning messages The Doxygen will enable warning as error message. Change-Id: Ie7a7c9a823388c4140f31489604d65ec43005772 --- include/rocm_smi/rocm_smi.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 9fa7de0b35..bf5db443e8 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -884,14 +884,28 @@ struct metrics_table_header_t { #define RSMI_GPU_METRICS_API_FORMAT_VER 1 // The content version increments when gpu_metrics is extended with new and/or // existing field sizes are changed. 
+ +/** + * @brief The GPU metrics version 1 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_1 1 +/** + * @brief The GPU metrics version 2 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_2 2 +/** + * @brief The GPU metrics version 3 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_3 3 -// This should match NUM_HBM_INSTANCES +/** + * @brief This should match NUM_HBM_INSTANCES + */ #define RSMI_NUM_HBM_INSTANCES 4 -// Unit conversion factor for HBM temperatures +/** + * @brief Unit conversion factor for HBM temperatures + */ #define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000 typedef struct { @@ -2230,7 +2244,7 @@ rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent); * If the function reutrns RSMI_STATUS_SUCCESS, the counter will be set in the value field of * the rsmi_utilization_counter_t. * - * @param[in] count The size of @utilization_counters array. + * @param[in] count The size of utilization_counters array. * * @param[inout] timestamp The timestamp when the counter is retreived. Resolution: 1 ns. * @retval ::RSMI_STATUS_SUCCESS call was successful @@ -3340,7 +3354,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, * @brief Get the info of a process on a specific device. * * @details Given a process id @p pid, a @p dv_ind, this function will - * write the process information for @p pid on the device, if available, to + * write the process information for pid on the device, if available, to * the memory pointed to by @p proc. * * @param[in] pid The process id of the process for which the gpu @@ -3348,7 +3362,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, * * @param[in] dv_ind a device index where the process running on. * - * @param[inout] procs a pointer to memory provided by the caller to which + * @param[inout] proc a pointer to memory provided by the caller to which * process information will be written. 
* * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call @@ -3540,7 +3554,7 @@ rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst, * * @details Given a source device index @p dv_ind_src and * a destination device index @p dv_ind_dst, and a pointer to a - * bool @accessible, this function will write the P2P connection status + * bool @p accessible, this function will write the P2P connection status * between the device @p dv_ind_src and @p dv_ind_dst to the memory * pointed to by @p accessible. * From 4aef7675962c9cbd4baf031aebc2a5bacabdd1ba Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 31 Aug 2023 11:34:58 -0500 Subject: [PATCH 03/19] Cleanup rocm_smi.cc Change-Id: Ia676c237222b0dd5d9e8a054a93776f3b11e2225 Signed-off-by: Galantsev, Dmitrii --- include/rocm_smi/rocm_smi.h | 6 +- include/rocm_smi/rocm_smi_common.h | 2 +- src/rocm_smi.cc | 588 ++++++++++++----------------- src/rocm_smi_counters.cc | 26 +- src/rocm_smi_device.cc | 89 +++-- src/rocm_smi_gpu_metrics.cc | 30 +- src/rocm_smi_io_link.cc | 40 +- src/rocm_smi_kfd.cc | 37 +- src/rocm_smi_logger.cc | 23 +- src/rocm_smi_main.cc | 65 ++-- src/rocm_smi_monitor.cc | 48 ++- src/rocm_smi_power_mon.cc | 14 +- src/rocm_smi_properties.cc | 10 +- src/rocm_smi_utils.cc | 99 ++--- tests/rocm_smi_test/main.cc | 3 - tests/rocm_smi_test/test_base.cc | 11 +- tests/rocm_smi_test/test_base.h | 4 +- tests/rocm_smi_test/test_common.cc | 6 +- tests/rocm_smi_test/test_common.h | 4 +- 19 files changed, 483 insertions(+), 622 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index bf5db443e8..92ac970841 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -2406,7 +2406,7 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * */ -rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind); +rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind); /** * @brief This function 
retrieves the voltage/frequency curve information @@ -2640,7 +2640,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind, * */ rsmi_status_t -rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); +rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); /** * @brief Set the PowerPlay performance level associated with the device with @@ -2706,7 +2706,7 @@ rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); * @retval ::RSMI_STATUS_PERMISSION function requires root access * */ -rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od); +rsmi_status_t rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od); /** * @brief Set the overdrive percent associated with the device with provided diff --git a/include/rocm_smi/rocm_smi_common.h b/include/rocm_smi/rocm_smi_common.h index bff8c8edc5..f29e427789 100755 --- a/include/rocm_smi/rocm_smi_common.h +++ b/include/rocm_smi/rocm_smi_common.h @@ -90,7 +90,7 @@ /* This group of macros is used to facilitate checking of support for rsmi_dev* * "getter" functions. When the return buffer is set to nullptr, the macro will * check the previously gathered device support data to see if the function, - * with possible variants (e.g., memory types, firware types,...) and + * with possible variants (e.g., memory types, firmware types,...) and * subvariants (e.g. monitors/sensors) are supported. 
*/ // This macro assumes dev already available diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 5c2e3f8fd5..506e784206 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -41,29 +41,29 @@ * */ -#include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include #include -#include +#include +#include -#include #include -#include #include +#include +#include +#include #include -#include -#include -#include +#include #include #include +#include +#include #include +#include +#include #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi.h" @@ -81,16 +81,24 @@ using namespace ROCmLogging; using namespace amd::smi; static const uint32_t kMaxOverdriveLevel = 20; -static const float kEnergyCounterResolution = 15.3f; +static const float kEnergyCounterResolution = 15.3F; -std::map ClkStateMap = { - {RSMI_CLK_TYPE_SYS, "SCLK"}, - {RSMI_CLK_TYPE_DF, "DFCLK"}, - {RSMI_CLK_TYPE_DCEF, "DCEFCLK"}, - {RSMI_CLK_TYPE_SOC, "SOCCLK"}, - {RSMI_CLK_TYPE_MEM, "MCLK"}, - {RSMI_CLK_TYPE_PCIE, "PCIECLK"}, - }; +static const std::map kClkStateMap = { + { RSMI_CLK_TYPE_SYS, "SCLK" }, + { RSMI_CLK_TYPE_DF, "DFCLK" }, + { RSMI_CLK_TYPE_DCEF, "DCEFCLK" }, + { RSMI_CLK_TYPE_SOC, "SOCCLK" }, + { RSMI_CLK_TYPE_MEM, "MCLK" }, + { RSMI_CLK_TYPE_PCIE, "PCIECLK" }, +}; + +static const std::map kClkTypeMap = { + { RSMI_CLK_TYPE_SYS, amd::smi::kDevGPUSClk }, + { RSMI_CLK_TYPE_MEM, amd::smi::kDevGPUMClk }, + { RSMI_CLK_TYPE_DF, amd::smi::kDevFClk }, + { RSMI_CLK_TYPE_DCEF, amd::smi::kDevDCEFClk }, + { RSMI_CLK_TYPE_SOC, amd::smi::kDevSOCClk }, +}; #define TRY try { #define CATCH } catch (...) 
{return amd::smi::handleException();} @@ -156,7 +164,7 @@ static uint64_t freq_string_to_int(const std::vector &freq_lines, } if (is_curr != nullptr) { - if (freq_lines[i].find("*") != std::string::npos) { + if (freq_lines[i].find('*') != std::string::npos) { *is_curr = true; } else { *is_curr = false; @@ -167,7 +175,7 @@ static uint64_t freq_string_to_int(const std::vector &freq_lines, if (star_str[0] == 'x') { assert(lanes != nullptr && "Lanes are provided but null lanes pointer"); if (lanes) { - if (star_str.substr(1) == "") { + if (star_str.substr(1).empty()) { throw amd::smi::rsmi_exception(RSMI_STATUS_NO_DATA, __FUNCTION__); } @@ -209,8 +217,6 @@ static void freq_volt_string_to_point(std::string in_line, multiplier = get_multiplier_from_str(volts_units_str[0]); pt->voltage = static_cast(volts*multiplier); - - return; } static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) { @@ -237,8 +243,6 @@ static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) { multiplier = get_multiplier_from_str(hi_units_str[0]); rg->upper_bound = static_cast(hi*multiplier); - - return; } /** @@ -258,7 +262,7 @@ power_prof_string_to_int(std::string pow_prof_line, bool *is_curr, fs >> *prof_ind; fs >> mode; - while (1) { + while (true) { tmp = mode.find_last_of("* :"); if (tmp == std::string::npos) { break; @@ -267,7 +271,7 @@ power_prof_string_to_int(std::string pow_prof_line, bool *is_curr, } if (is_curr != nullptr) { - if (pow_prof_line.find("*") != std::string::npos) { + if (pow_prof_line.find('*') != std::string::npos) { *is_curr = true; } else { *is_curr = false; @@ -759,14 +763,13 @@ rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) { TRY rsmi_status_t ret; - uint64_t val = 0; CHK_SUPPORT_NAME_ONLY(numa_node) DEVICE_MUTEX std::string str_val; ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val); - *numa_node = std::stol(str_val, 0); + *numa_node = std::stoi(str_val, nullptr); return ret; 
CATCH @@ -1005,13 +1008,10 @@ rsmi_dev_mem_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { } rsmi_status_t -rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od) { +rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - if (dv_ind < 0) { - return RSMI_STATUS_INVALID_ARGS; - } return rsmi_dev_overdrive_level_set_v1(static_cast(dv_ind), od); } @@ -1032,11 +1032,11 @@ rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od) { } rsmi_status_t -rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_level) { +rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_level) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - return rsmi_dev_perf_level_set_v1(static_cast(dv_ind), perf_level); + return rsmi_dev_perf_level_set_v1(dv_ind, perf_level); } rsmi_status_t @@ -1075,7 +1075,7 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ } assert(val_vec.size() <= RSMI_MAX_NUM_FREQUENCIES); - if (val_vec.size() == 0) { + if (val_vec.empty()) { return RSMI_STATUS_NOT_YET_IMPLEMENTED; } @@ -1090,7 +1090,8 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ // Check that that is true. 
if (i > 0) { if (f->frequency[i] < f->frequency[i-1]) { - std::string sysvalue = ClkStateMap[clk_type]; + std::string sysvalue; + sysvalue += kClkStateMap.find(clk_type)->second; sysvalue += " Current Value"; sysvalue += ' ' + std::to_string(f->frequency[i]); sysvalue += " Previous Value"; @@ -1101,7 +1102,8 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ if (current) { // set the current frequency if (f->current != RSMI_MAX_NUM_FREQUENCIES + 1) { - std::string sysvalue = ClkStateMap[clk_type]; + std::string sysvalue; + sysvalue += kClkStateMap.find(clk_type)->second; sysvalue += " Current Value"; sysvalue += ' ' + std::to_string(f->frequency[i]); sysvalue += " Previous Value"; @@ -1140,7 +1142,7 @@ static rsmi_status_t get_power_profiles(uint32_t dv_ind, return ret; } assert(val_vec.size() <= RSMI_MAX_NUM_POWER_PROFILES); - if (val_vec.size() > RSMI_MAX_NUM_POWER_PROFILES + 1 || val_vec.size() < 1) { + if (val_vec.size() > RSMI_MAX_NUM_POWER_PROFILES + 1 || val_vec.empty()) { return RSMI_STATUS_UNEXPECTED_SIZE; } // -1 for the header line, below @@ -1326,8 +1328,9 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, LOG_TRACE(ss); assert(minclkvalue < maxclkvalue); - std::string min_sysvalue, max_sysvalue; - std::map ClkStateMap = { + std::string min_sysvalue; + std::string max_sysvalue; + std::map clk_char_map = { {RSMI_CLK_TYPE_SYS, "s"}, {RSMI_CLK_TYPE_MEM, "m"}, }; @@ -1345,11 +1348,11 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, // minimum clock. And 1 if to set maximum clock. E.g., "s 0 500" will update // minimum sclk to be 500 MHz. "m 1 800" will update maximum mclk to 800Mhz. 
- min_sysvalue = ClkStateMap[clkType]; + min_sysvalue = clk_char_map[clkType]; min_sysvalue += ' ' + std::to_string(RSMI_FREQ_IND_MIN); min_sysvalue += ' ' + std::to_string(minclkvalue); min_sysvalue += '\n'; - max_sysvalue = ClkStateMap[clkType]; + max_sysvalue = clk_char_map[clkType]; max_sysvalue += ' ' + std::to_string(RSMI_FREQ_IND_MAX); max_sysvalue += ' ' + std::to_string(maxclkvalue); max_sysvalue += '\n'; @@ -1381,7 +1384,7 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, LOG_TRACE(ss); std::string sysvalue; - std::map ClkStateMap = { + std::map clk_char_map = { {RSMI_CLK_TYPE_SYS, "s"}, {RSMI_CLK_TYPE_MEM, "m"}, }; @@ -1400,14 +1403,8 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, switch (clkType) { case RSMI_CLK_TYPE_SYS: - sysvalue = ClkStateMap[clkType]; - sysvalue += ' ' + std::to_string(level); - sysvalue += ' ' + std::to_string(clkvalue); - sysvalue += '\n'; - break; - case RSMI_CLK_TYPE_MEM: - sysvalue = ClkStateMap[clkType]; + sysvalue = clk_char_map[clkType]; sysvalue += ' ' + std::to_string(level); sysvalue += ' ' + std::to_string(clkvalue); sysvalue += '\n'; @@ -1487,7 +1484,6 @@ static void get_vc_region(uint32_t start_ind, } od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); od_value_pair_str_to_range((*val_vec)[start_ind + 1], &p->volt_range); - return; } /* @@ -1614,24 +1610,11 @@ rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, CHK_SUPPORT_VAR(f, clk_type) - switch (clk_type) { - case RSMI_CLK_TYPE_SYS: - dev_type = amd::smi::kDevGPUSClk; - break; - case RSMI_CLK_TYPE_MEM: - dev_type = amd::smi::kDevGPUMClk; - break; - case RSMI_CLK_TYPE_DF: - dev_type = amd::smi::kDevFClk; - break; - case RSMI_CLK_TYPE_DCEF: - dev_type = amd::smi::kDevDCEFClk; - break; - case RSMI_CLK_TYPE_SOC: - dev_type = amd::smi::kDevSOCClk; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + const auto & clk_type_it = kClkTypeMap.find(clk_type); + if 
(clk_type_it != kClkTypeMap.end()) { + dev_type = clk_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } DEVICE_MUTEX @@ -1653,72 +1636,35 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, std::string val_str; amd::smi::DevInfoTypes dev_type; - switch (block) { - case RSMI_FW_BLOCK_ASD: - dev_type = amd::smi::kDevFwVersionAsd; - break; - case RSMI_FW_BLOCK_CE: - dev_type = amd::smi::kDevFwVersionCe; - break; - case RSMI_FW_BLOCK_DMCU: - dev_type = amd::smi::kDevFwVersionDmcu; - break; - case RSMI_FW_BLOCK_MC: - dev_type = amd::smi::kDevFwVersionMc; - break; - case RSMI_FW_BLOCK_ME: - dev_type = amd::smi::kDevFwVersionMe; - break; - case RSMI_FW_BLOCK_MEC: - dev_type = amd::smi::kDevFwVersionMec; - break; - case RSMI_FW_BLOCK_MEC2: - dev_type = amd::smi::kDevFwVersionMec2; - break; - case RSMI_FW_BLOCK_PFP: - dev_type = amd::smi::kDevFwVersionPfp; - break; - case RSMI_FW_BLOCK_RLC: - dev_type = amd::smi::kDevFwVersionRlc; - break; - case RSMI_FW_BLOCK_RLC_SRLC: - dev_type = amd::smi::kDevFwVersionRlcSrlc; - break; - case RSMI_FW_BLOCK_RLC_SRLG: - dev_type = amd::smi::kDevFwVersionRlcSrlg; - break; - case RSMI_FW_BLOCK_RLC_SRLS: - dev_type = amd::smi::kDevFwVersionRlcSrls; - break; - case RSMI_FW_BLOCK_SDMA: - dev_type = amd::smi::kDevFwVersionSdma; - break; - case RSMI_FW_BLOCK_SDMA2: - dev_type = amd::smi::kDevFwVersionSdma2; - break; - case RSMI_FW_BLOCK_SMC: - dev_type = amd::smi::kDevFwVersionSmc; - break; - case RSMI_FW_BLOCK_SOS: - dev_type = amd::smi::kDevFwVersionSos; - break; - case RSMI_FW_BLOCK_TA_RAS: - dev_type = amd::smi::kDevFwVersionTaRas; - break; - case RSMI_FW_BLOCK_TA_XGMI: - dev_type = amd::smi::kDevFwVersionTaXgmi; - break; - case RSMI_FW_BLOCK_UVD: - dev_type = amd::smi::kDevFwVersionUvd; - break; - case RSMI_FW_BLOCK_VCE: - dev_type = amd::smi::kDevFwVersionVce; - break; - case RSMI_FW_BLOCK_VCN: - dev_type = amd::smi::kDevFwVersionVcn; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + static const 
std::map kFWBlockTypeMap = { + { RSMI_FW_BLOCK_ASD, amd::smi::kDevFwVersionAsd }, + { RSMI_FW_BLOCK_CE, amd::smi::kDevFwVersionCe }, + { RSMI_FW_BLOCK_DMCU, amd::smi::kDevFwVersionDmcu }, + { RSMI_FW_BLOCK_MC, amd::smi::kDevFwVersionMc }, + { RSMI_FW_BLOCK_ME, amd::smi::kDevFwVersionMe }, + { RSMI_FW_BLOCK_MEC, amd::smi::kDevFwVersionMec }, + { RSMI_FW_BLOCK_MEC2, amd::smi::kDevFwVersionMec2 }, + { RSMI_FW_BLOCK_PFP, amd::smi::kDevFwVersionPfp }, + { RSMI_FW_BLOCK_RLC, amd::smi::kDevFwVersionRlc }, + { RSMI_FW_BLOCK_RLC_SRLC, amd::smi::kDevFwVersionRlcSrlc }, + { RSMI_FW_BLOCK_RLC_SRLG, amd::smi::kDevFwVersionRlcSrlg }, + { RSMI_FW_BLOCK_RLC_SRLS, amd::smi::kDevFwVersionRlcSrls }, + { RSMI_FW_BLOCK_SDMA, amd::smi::kDevFwVersionSdma }, + { RSMI_FW_BLOCK_SDMA2, amd::smi::kDevFwVersionSdma2 }, + { RSMI_FW_BLOCK_SMC, amd::smi::kDevFwVersionSmc }, + { RSMI_FW_BLOCK_SOS, amd::smi::kDevFwVersionSos }, + { RSMI_FW_BLOCK_TA_RAS, amd::smi::kDevFwVersionTaRas }, + { RSMI_FW_BLOCK_TA_XGMI, amd::smi::kDevFwVersionTaXgmi }, + { RSMI_FW_BLOCK_UVD, amd::smi::kDevFwVersionUvd }, + { RSMI_FW_BLOCK_VCE, amd::smi::kDevFwVersionVce }, + { RSMI_FW_BLOCK_VCN, amd::smi::kDevFwVersionVcn }, + }; + + const auto & dev_type_it = kFWBlockTypeMap.find(block); + if (dev_type_it != kFWBlockTypeMap.end()) { + dev_type = dev_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } DEVICE_MUTEX @@ -1728,7 +1674,7 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, static std::string bitfield_to_freq_string(uint64_t bitf, uint32_t num_supported) { - std::string bf_str(""); + std::string bf_str; std::bitset bs(bitf); if (num_supported > RSMI_MAX_NUM_FREQUENCIES) { @@ -1793,24 +1739,11 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, int ret_i; amd::smi::DevInfoTypes dev_type; - switch (clk_type) { - case RSMI_CLK_TYPE_SYS: - dev_type = amd::smi::kDevGPUSClk; - break; - case RSMI_CLK_TYPE_MEM: - dev_type = amd::smi::kDevGPUMClk; - break; - case RSMI_CLK_TYPE_DF: - dev_type = 
amd::smi::kDevFClk; - break; - case RSMI_CLK_TYPE_SOC: - dev_type = amd::smi::kDevSOCClk; - break; - case RSMI_CLK_TYPE_DCEF: - dev_type = amd::smi::kDevDCEFClk; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + const auto & clk_type_it = kClkTypeMap.find(clk_type); + if (clk_type_it != kClkTypeMap.end()) { + dev_type = clk_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } ret_i = dev->writeDevInfo(dev_type, freq_enable_str); @@ -1878,7 +1811,7 @@ get_id_name_str_from_line(uint64_t id, std::string ln, *ln_str >> token1; - if (token1 == "") { + if (token1.empty()) { throw amd::smi::rsmi_exception(RSMI_STATUS_NO_DATA, __FUNCTION__); } @@ -1991,13 +1924,13 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, } } - for (auto fl : pci_name_files) { + for (const auto& fl : pci_name_files) { std::ifstream id_file_strm(fl); while (std::getline(id_file_strm, ln)) { std::istringstream ln_str(ln); // parse line - if (ln[0] == '#' || ln.size() == 0) { + if (ln[0] == '#' || ln.empty()) { continue; } @@ -2008,29 +1941,28 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, if (typ == NAME_STR_SUBSYS && found_device_id_for_subsys) { val_str = get_id_name_str_from_line(subsys_vend_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { // We've chopped the subsys_vend ID, now we need to get the // subsys description val_str = get_id_name_str_from_line(subsys_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { break; - } else { - val_str.clear(); } + val_str.clear(); } } } else if (typ == NAME_STR_DEVICE) { // ln[1] != '\t' // This is a device line val_str = get_id_name_str_from_line(device_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { break; } } else if (typ == NAME_STR_SUBSYS) { // match the device id line val_str = get_id_name_str_from_line(device_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { found_device_id_for_subsys = true; } 
} @@ -2048,22 +1980,21 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, val_str = get_id_name_str_from_line(vendor_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { if (typ == NAME_STR_VENDOR) { break; - } else { - val_str.clear(); - found_device_vendor = true; } + val_str.clear(); + found_device_vendor = true; } } } - if (val_str.size() > 0) { + if (!val_str.empty()) { break; } } - if (val_str.size() == 0) { + if (val_str.empty()) { return get_backup_name(vendor_id, name, len); } size_t ct = val_str.copy(name, len); @@ -2293,8 +2224,8 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t); // Calculate the index - int width_index = -1; - int speed_index = -1; + uint32_t width_index = -1; + uint32_t speed_index = -1; uint32_t cur_index = 0; for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) { if (link_width[cur_index] == gpu_metrics.pcie_link_width) { @@ -2302,8 +2233,7 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { break; } } - for (cur_index = 0; - cur_index < SPEED_DATA_LENGTH; cur_index++) { + for (cur_index = 0; cur_index < SPEED_DATA_LENGTH; cur_index++) { if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) { speed_index = cur_index; break; @@ -2315,11 +2245,10 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { // Set possible lanes and frequencies b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index; - for (cur_index = 0; - cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { - b->transfer_rate.frequency[cur_index] - = link_speed[cur_index/WIDTH_DATA_LENGTH] * 100 * 1000000L; - b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH]; + for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { + 
b->transfer_rate.frequency[cur_index] = + static_cast(link_speed[cur_index/WIDTH_DATA_LENGTH]) * 100 * 1000000L; + b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH]; } /* frequency = {2500, 2500, 2500, 2500, 2500, 2500, @@ -2429,54 +2358,29 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, LOG_TRACE(ss); rsmi_status_t ret; - amd::smi::MonitorTypes mon_type; + amd::smi::MonitorTypes mon_type = amd::smi::kMonInvalid; uint16_t val_ui16; - switch (metric) { - case RSMI_TEMP_CURRENT: - mon_type = amd::smi::kMonTemp; - break; - case RSMI_TEMP_MAX: - mon_type = amd::smi::kMonTempMax; - break; - case RSMI_TEMP_MIN: - mon_type = amd::smi::kMonTempMin; - break; - case RSMI_TEMP_MAX_HYST: - mon_type = amd::smi::kMonTempMaxHyst; - break; - case RSMI_TEMP_MIN_HYST: - mon_type = amd::smi::kMonTempMinHyst; - break; - case RSMI_TEMP_CRITICAL: - mon_type = amd::smi::kMonTempCritical; - break; - case RSMI_TEMP_CRITICAL_HYST: - mon_type = amd::smi::kMonTempCriticalHyst; - break; - case RSMI_TEMP_EMERGENCY: - mon_type = amd::smi::kMonTempEmergency; - break; - case RSMI_TEMP_EMERGENCY_HYST: - mon_type = amd::smi::kMonTempEmergencyHyst; - break; - case RSMI_TEMP_CRIT_MIN: - mon_type = amd::smi::kMonTempCritMin; - break; - case RSMI_TEMP_CRIT_MIN_HYST: - mon_type = amd::smi::kMonTempCritMinHyst; - break; - case RSMI_TEMP_OFFSET: - mon_type = amd::smi::kMonTempOffset; - break; - case RSMI_TEMP_LOWEST: - mon_type = amd::smi::kMonTempLowest; - break; - case RSMI_TEMP_HIGHEST: - mon_type = amd::smi::kMonTempHighest; - break; - default: - mon_type = amd::smi::kMonInvalid; + static const std::map kMetricTypeMap = { + { RSMI_TEMP_CURRENT, amd::smi::kMonTemp }, + { RSMI_TEMP_MAX, amd::smi::kMonTempMax }, + { RSMI_TEMP_MIN, amd::smi::kMonTempMin }, + { RSMI_TEMP_MAX_HYST, amd::smi::kMonTempMaxHyst }, + { RSMI_TEMP_MIN_HYST, amd::smi::kMonTempMinHyst }, + { RSMI_TEMP_CRITICAL, amd::smi::kMonTempCritical }, + { RSMI_TEMP_CRITICAL_HYST, amd::smi::kMonTempCriticalHyst 
}, + { RSMI_TEMP_EMERGENCY, amd::smi::kMonTempEmergency }, + { RSMI_TEMP_EMERGENCY_HYST, amd::smi::kMonTempEmergencyHyst }, + { RSMI_TEMP_CRIT_MIN, amd::smi::kMonTempCritMin }, + { RSMI_TEMP_CRIT_MIN_HYST, amd::smi::kMonTempCritMinHyst }, + { RSMI_TEMP_OFFSET, amd::smi::kMonTempOffset }, + { RSMI_TEMP_LOWEST, amd::smi::kMonTempLowest }, + { RSMI_TEMP_HIGHEST, amd::smi::kMonTempHighest }, + }; + + const auto mon_type_it = kMetricTypeMap.find(metric); + if (mon_type_it != kMetricTypeMap.end()) { + mon_type = mon_type_it->second; } if (temperature == nullptr) { @@ -2492,80 +2396,81 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, return RSMI_STATUS_INVALID_ARGS; } - // The HBM temperature is retreived from the gpu_metrics - if (sensor_type == RSMI_TEMP_TYPE_HBM_0 - || sensor_type == RSMI_TEMP_TYPE_HBM_1 - || sensor_type == RSMI_TEMP_TYPE_HBM_2 - || sensor_type == RSMI_TEMP_TYPE_HBM_3) { - if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: To retreive HBM temp, we only support metric = " - << "RSMI_TEMP_CURRENT" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } + // The HBM temperature is retrieved from the gpu_metrics + if (sensor_type == RSMI_TEMP_TYPE_HBM_0 || + sensor_type == RSMI_TEMP_TYPE_HBM_1 || + sensor_type == RSMI_TEMP_TYPE_HBM_2 || + sensor_type == RSMI_TEMP_TYPE_HBM_3) { + if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: To retrieve HBM temp, we only support metric = " + << "RSMI_TEMP_CURRENT" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) 
<< " |"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } - rsmi_gpu_metrics_t gpu_metrics; - ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); - if (ret != RSMI_STATUS_SUCCESS) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: rsmi_dev_gpu_metrics_info_get returned " - << getRSMIStatusString(ret) - << " | Returning = " - << getRSMIStatusString(ret) << " |"; - LOG_ERROR(ss); - return ret; - } + rsmi_gpu_metrics_t gpu_metrics; + ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: rsmi_dev_gpu_metrics_info_get returned " + << getRSMIStatusString(ret) + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); + return ret; + } - switch (sensor_type) { - case RSMI_TEMP_TYPE_HBM_0: - val_ui16 = gpu_metrics.temperature_hbm[0]; - break; - case RSMI_TEMP_TYPE_HBM_1: - val_ui16 = gpu_metrics.temperature_hbm[1]; - break; - case RSMI_TEMP_TYPE_HBM_2: - val_ui16 = gpu_metrics.temperature_hbm[2]; - break; - case RSMI_TEMP_TYPE_HBM_3: - val_ui16 = gpu_metrics.temperature_hbm[3]; - break; - default: - return RSMI_STATUS_INVALID_ARGS; - } - if (val_ui16 == UINT16_MAX) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: Reached UINT16 max value, overflow" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } else - *temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE; + switch (sensor_type) { + case RSMI_TEMP_TYPE_HBM_0: + val_ui16 = gpu_metrics.temperature_hbm[0]; + break; + case 
RSMI_TEMP_TYPE_HBM_1: + val_ui16 = gpu_metrics.temperature_hbm[1]; + break; + case RSMI_TEMP_TYPE_HBM_2: + val_ui16 = gpu_metrics.temperature_hbm[2]; + break; + case RSMI_TEMP_TYPE_HBM_3: + val_ui16 = gpu_metrics.temperature_hbm[3]; + break; + default: + return RSMI_STATUS_INVALID_ARGS; + } + if (val_ui16 == UINT16_MAX) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: Reached UINT16 max value, overflow" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } - ss << __PRETTY_FUNCTION__ << " | ======= end ======= " - << " | Success " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Data: " << *temperature - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; - LOG_INFO(ss); - return RSMI_STATUS_SUCCESS; + *temperature = static_cast(val_ui16) * CENTRIGRADE_TO_MILLI_CENTIGRADE; + + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Data: " << *temperature + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; + LOG_INFO(ss); + return RSMI_STATUS_SUCCESS; } // end HBM temperature DEVICE_MUTEX @@ -2811,7 +2716,7 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { } rsmi_status_t -rsmi_dev_gpu_reset(int32_t dv_ind) { +rsmi_dev_gpu_reset(uint32_t dv_ind) { TRY std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; @@ -2985,7 +2890,8 @@ rsmi_status_t rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap) { TRY rsmi_status_t ret; - uint64_t min, max; + uint64_t min; + uint64_t max; std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); @@ -3354,10 
+3260,10 @@ rsmi_utilization_count_get(uint32_t dv_ind, default: return RSMI_STATUS_INVALID_ARGS; } - if (val_ui32 == UINT32_MAX) + if (val_ui32 == UINT32_MAX) { return RSMI_STATUS_NOT_SUPPORTED; - else - utilization_counters[index].value = val_ui32; + } + utilization_counters[index].value = val_ui32; } *timestamp = gpu_metrics.system_clock_counter; @@ -3583,7 +3489,7 @@ rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle) { rsmi_status_t rsmi_counter_control(rsmi_event_handle_t evt_handle, - rsmi_counter_command_t cmd, void *) { + rsmi_counter_command_t cmd, void * /*unused*/) { TRY amd::smi::evt::Event *evt = @@ -3644,9 +3550,9 @@ rsmi_counter_read(rsmi_event_handle_t evt_handle, } if (ret == 0) { return RSMI_STATUS_SUCCESS; - } else { - return RSMI_STATUS_UNEXPECTED_SIZE; } + + return RSMI_STATUS_UNEXPECTED_SIZE; CATCH } @@ -3689,9 +3595,9 @@ rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group) { if (grp->find(group) == grp->end()) { return RSMI_STATUS_NOT_SUPPORTED; - } else { - return RSMI_STATUS_SUCCESS; } + + return RSMI_STATUS_SUCCESS; CATCH } @@ -4183,7 +4089,8 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, return RSMI_STATUS_INVALID_ARGS; } - uint32_t node_ind_src, node_ind_dst; + uint32_t node_ind_src; + uint32_t node_ind_dst; // Fetch the source and destination GPU node index if (smi.get_node_index(dv_ind_src, &node_ind_src) || smi.get_node_index(dv_ind_dst, &node_ind_dst)) { @@ -4241,19 +4148,13 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { } switch (mapStringToRSMIComputePartitionTypes[compute_partition_str]) { - case RSMI_COMPUTE_PARTITION_INVALID: - // Retrieved an unknown compute partition - return RSMI_STATUS_UNEXPECTED_DATA; case RSMI_COMPUTE_PARTITION_CPX: - break; case RSMI_COMPUTE_PARTITION_SPX: - break; case RSMI_COMPUTE_PARTITION_DPX: - break; case RSMI_COMPUTE_PARTITION_TPX: - break; case RSMI_COMPUTE_PARTITION_QPX: break; + case RSMI_COMPUTE_PARTITION_INVALID: 
default: // Retrieved an unknown compute partition return RSMI_STATUS_UNEXPECTED_DATA; @@ -4326,19 +4227,13 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, std::string currentComputePartition; switch (compute_partition) { - case RSMI_COMPUTE_PARTITION_INVALID: - // Retrieved an unknown compute partition - return RSMI_STATUS_INVALID_ARGS; case RSMI_COMPUTE_PARTITION_CPX: - break; case RSMI_COMPUTE_PARTITION_SPX: - break; case RSMI_COMPUTE_PARTITION_DPX: - break; case RSMI_COMPUTE_PARTITION_TPX: - break; case RSMI_COMPUTE_PARTITION_QPX: break; + case RSMI_COMPUTE_PARTITION_INVALID: default: return RSMI_STATUS_INVALID_ARGS; } @@ -4385,17 +4280,12 @@ static rsmi_status_t get_nps_mode(uint32_t dv_ind, std::string &nps_mode) { } switch (mapStringToNPSModeTypes[val_str]) { - case RSMI_MEMORY_PARTITION_UNKNOWN: - // Retrieved an unknown NPS mode - return RSMI_STATUS_UNEXPECTED_DATA; case RSMI_MEMORY_PARTITION_NPS1: - break; case RSMI_MEMORY_PARTITION_NPS2: - break; case RSMI_MEMORY_PARTITION_NPS4: - break; case RSMI_MEMORY_PARTITION_NPS8: break; + case RSMI_MEMORY_PARTITION_UNKNOWN: default: // Retrieved an unknown NPS mode return RSMI_STATUS_UNEXPECTED_DATA; @@ -4429,7 +4319,7 @@ rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode) { } } - if (isCorrectDevice == false) { + if (!isCorrectDevice) { return RSMI_STATUS_NOT_SUPPORTED; } @@ -4438,17 +4328,12 @@ rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode) { std::string currentNPSMode; switch (nps_mode) { - case RSMI_MEMORY_PARTITION_UNKNOWN: - // Retrieved an unknown NPS mode - return RSMI_STATUS_INVALID_ARGS; case RSMI_MEMORY_PARTITION_NPS1: - break; case RSMI_MEMORY_PARTITION_NPS2: - break; case RSMI_MEMORY_PARTITION_NPS4: - break; case RSMI_MEMORY_PARTITION_NPS8: break; + case RSMI_MEMORY_PARTITION_UNKNOWN: default: return RSMI_STATUS_INVALID_ARGS; } @@ -4584,19 +4469,19 @@ rsmi_dev_supported_func_iterator_open(uint32_t dv_ind, if (dev->supported_funcs()->begin() == 
dev->supported_funcs()->end()) { delete *handle; return RSMI_STATUS_NO_DATA; - } else { - SupportedFuncMapIt *supp_func_iter = new SupportedFuncMapIt; - - if (supp_func_iter == nullptr) { - return RSMI_STATUS_OUT_OF_RESOURCES; - } - *supp_func_iter = dev->supported_funcs()->begin(); - - (*handle)->func_id_iter = reinterpret_cast(supp_func_iter); - (*handle)->container_ptr = - reinterpret_cast(dev->supported_funcs()); } + SupportedFuncMapIt *supp_func_iter = new SupportedFuncMapIt; + + if (supp_func_iter == nullptr) { + return RSMI_STATUS_OUT_OF_RESOURCES; + } + *supp_func_iter = dev->supported_funcs()->begin(); + + (*handle)->func_id_iter = reinterpret_cast(supp_func_iter); + (*handle)->container_ptr = + reinterpret_cast(dev->supported_funcs()); + return RSMI_STATUS_SUCCESS; CATCH @@ -4963,7 +4848,8 @@ rsmi_event_notification_get(int timeout_ms, if (*num_elem < buffer_size && errno != EAGAIN) { return amd::smi::ErrnoToRsmiStatus(errno); - } else if (*num_elem >= buffer_size) { + } + if (*num_elem >= buffer_size) { return RSMI_STATUS_SUCCESS; } @@ -5039,7 +4925,7 @@ rsmi_test_refcount(uint64_t refcnt_type) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); std::lock_guard guard(*smi.bootstrap_mutex()); - if (smi.ref_count() == 0 && smi.devices().size() != 0) { + if (smi.ref_count() == 0 && !smi.devices().empty()) { return -1; } diff --git a/src/rocm_smi_counters.cc b/src/rocm_smi_counters.cc index 9f82798183..a08819568e 100755 --- a/src/rocm_smi_counters.cc +++ b/src/rocm_smi_counters.cc @@ -41,20 +41,20 @@ * */ -#include -#include -#include -#include #include +#include #include #include -#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include #include #include "rocm_smi/rocm_smi.h" @@ -164,8 +164,7 @@ GetSupportedEventGroups(uint32_t dev_num, dev_evt_grp_set_t *supported_grps) { } // /sys/bus/event_source/devices/_/type Event::Event(rsmi_event_type_t event, uint32_t dev_ind) : - 
event_type_(event), prev_cntr_val_(0) { - fd_ = -1; + event_type_(event), fd_(-1), prev_cntr_val_(0) { rsmi_event_group_t grp = EvtGrpFromEvtID(event); assert(grp != RSMI_EVNT_GRP_INVALID); // This should have failed before now @@ -398,10 +397,11 @@ readn(int fd, void *buf, size_t n) { return static_cast(n - left); } if (bytes < 0) { - if (errno == EINTR) /* read got interrupted */ + if (errno == EINTR) { + /* read got interrupted */ continue; - else - return -errno; + } + return -errno; } left -= static_cast(bytes); diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 3830d602aa..95f27d8e12 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -43,30 +43,28 @@ #include #include -#include -#include #include -#include +#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" -#include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi_logger.h" #include "shared_mutex.h" // NOLINT @@ -689,7 +687,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) { int ret; std::ostringstream ss; - fs.rdbuf()->pubsetbuf(0,0); + fs.rdbuf()->pubsetbuf(nullptr,0); ret = openSysfsFileStream(type, &fs, valStr.c_str()); if (ret != 0) { ss << "Could not write device info string (" << valStr @@ -856,7 +854,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { ss << "Read devInfoMultiLineStr for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")" << ", but contained no string lines"; @@ -864,13 +862,13 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, return 0; } // Remove any *trailing* empty 
(whitespace) lines - while (retVec->size() != 0 && + while (!retVec->empty() && retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) { retVec->pop_back(); } // allow logging output of multiline strings - for (auto l: *retVec) { + for (const auto& l: *retVec) { allLines += "\n" + l; } @@ -905,10 +903,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - tmp_val = std::stoi(tempStr, 0, 16); + tmp_val = std::stoi(tempStr, nullptr, 16); if (tmp_val < 0) { return EINVAL; } @@ -930,10 +928,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevXGMIError: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - *val = std::stoul(tempStr, 0); + *val = std::stoul(tempStr, nullptr); break; case kDevUniqueId: @@ -960,10 +958,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevFwVersionVcn: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - *val = std::stoul(tempStr, 0, 16); + *val = std::stoul(tempStr, nullptr, 16); break; case kDevGpuReset: @@ -1100,7 +1098,7 @@ void Device::DumpSupportedFunctions(void) { } void Device::fillSupportedFuncs(void) { - if (supported_funcs_.size() != 0) { + if (!supported_funcs_.empty()) { return; } if (monitor() == nullptr) { @@ -1140,7 +1138,7 @@ void Device::fillSupportedFuncs(void) { std::vector::const_iterator var = it->second.variants.begin(); - if (it->second.variants.size() == 0) { + if (it->second.variants.empty()) { supported_funcs_[it->first] = nullptr; it++; continue; @@ -1156,7 +1154,7 @@ void Device::fillSupportedFuncs(void) { (*supported_variants)[kDevInfoVarTypeToRSMIVariant.at(*var)] = nullptr; } - if ((*supported_variants).size() > 0) { + if (!(*supported_variants).empty()) { 
supported_funcs_[it->first] = supported_variants; } @@ -1202,35 +1200,32 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, if (sub_variant == RSMI_DEFAULT_VARIANT) { return true; - } else { // sub_variant != RSMI_DEFAULT_VARIANT - // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr - assert(var_it->second != nullptr); + } + // sub_variant != RSMI_DEFAULT_VARIANT + // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr + assert(var_it->second != nullptr); - return subvariant_match(&(var_it->second), sub_variant); - } - } else { // variant == RSMI_DEFAULT_VARIANT - if (func_it->second != nullptr) { - var_it = func_it->second->find(variant); - } - if (sub_variant == RSMI_DEFAULT_VARIANT) { - return true; - } else { // sub_variant != RSMI_DEFAULT_VARIANT - if (func_it->second == nullptr) { - return false; - } - return subvariant_match(&(var_it->second), sub_variant); - } + return subvariant_match(&(var_it->second), sub_variant); } - assert(false); // We should not reach here - - return false; + // variant == RSMI_DEFAULT_VARIANT + if (func_it->second != nullptr) { + var_it = func_it->second->find(variant); + } + if (sub_variant == RSMI_DEFAULT_VARIANT) { + return true; + } + // sub_variant != RSMI_DEFAULT_VARIANT + if (func_it->second == nullptr) { + return false; + } + return subvariant_match(&(var_it->second), sub_variant); } rsmi_status_t Device::restartAMDGpuDriver(void) { REQUIRE_ROOT_ACCESS bool restartSuccessful = true; bool success = false; - std::string out = ""; + std::string out; bool wasGdmServiceActive = false; // sudo systemctl is-active gdm diff --git a/src/rocm_smi_gpu_metrics.cc b/src/rocm_smi_gpu_metrics.cc index 885c36d7f6..c2ad2e2659 100755 --- a/src/rocm_smi_gpu_metrics.cc +++ b/src/rocm_smi_gpu_metrics.cc @@ -41,23 +41,22 @@ * */ -#include #include - -#include -#include -#include -#include -#include -#include -#include // NOLINT -#include #include -#include + +#include +#include +#include 
+#include +#include +#include +#include +#include // NOLINT +#include +#include #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_logger.h" @@ -150,7 +149,7 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2, const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, const rsmi_gpu_metrics_t *rsmi_gpu_metrics) { - if (RocmSMI::getInstance().isLoggingOn() == false) { + if (!RocmSMI::getInstance().isLoggingOn()) { return; } std::ostringstream ss; @@ -170,9 +169,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, } if (rsmi_gpu_metrics == nullptr) { return; - } else { - // do nothing - continue } + ss /* Common Header */ << print_unsigned_hex_and_int( @@ -365,7 +363,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, } #define ASSIGN_DATA_FIELD(FIELD, SRC) \ - data->FIELD = SRC->FIELD; + data->FIELD = (SRC)->FIELD; #define ASSIGN_COMMON_FORMATS(SRC) \ ASSIGN_DATA_FIELD(common_header, (SRC)) \ diff --git a/src/rocm_smi_io_link.cc b/src/rocm_smi_io_link.cc index 888f13fffa..218e520d84 100755 --- a/src/rocm_smi_io_link.cc +++ b/src/rocm_smi_io_link.cc @@ -41,20 +41,19 @@ * */ -#include -#include #include +#include #include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include "rocm_smi/rocm_smi.h" -#include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_io_link.h" @@ -161,7 +160,7 @@ static int ReadLinkProperties(uint32_t node_indx, uint32_t link_indx, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { fs.close(); return 0; } @@ -182,7 +181,7 @@ static int DiscoverLinks(std::map, if (links == nullptr) { return 
EINVAL; } - assert(links->size() == 0); + assert(links->empty()); links->clear(); @@ -229,8 +228,8 @@ static int DiscoverLinks(std::map, } link_indx = static_cast(std::stoi(dentry_io_link->d_name)); - link = std::shared_ptr(new IOLink(node_indx, link_indx, - directory)); + link = std::make_shared(node_indx, link_indx, + directory); link->Initialize(); @@ -273,7 +272,7 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::mapsize() == 0); + assert(links->empty()); links->clear(); @@ -297,8 +296,8 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map(std::stoi(dentry->d_name)); - link = std::shared_ptr(new IOLink(node_indx, link_indx, - directory)); + link = std::make_shared(node_indx, link_indx, + directory); link->Initialize(); @@ -323,16 +322,15 @@ int DiscoverP2PLinksPerNode(uint32_t node_indx, std::map propVec; - assert(properties_.size() == 0); - if (properties_.size() > 0) { + assert(properties_.empty()); + if (!properties_.empty()) { return 0; } @@ -347,8 +345,8 @@ int IOLink::ReadProperties(void) { uint64_t val_int; // Assume all properties are unsigned integers for now std::istringstream fs; - for (uint32_t i = 0; i < propVec.size(); ++i) { - fs.str(propVec[i]); + for (const auto & i : propVec) { + fs.str(i); fs >> key_str; fs >> val_int; diff --git a/src/rocm_smi_kfd.cc b/src/rocm_smi_kfd.cc index 092bcb3414..afe567d80d 100755 --- a/src/rocm_smi_kfd.cc +++ b/src/rocm_smi_kfd.cc @@ -41,27 +41,27 @@ * */ -#include -#include -#include -#include -#include #include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include #include "rocm_smi/rocm_smi_io_link.h" #include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" -#include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_main.h" namespace amd { @@ -195,7 +195,7 @@ int 
ReadKFDDeviceProperties(uint32_t kfd_node_id, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { fs.close(); return ENOENT; } @@ -517,7 +517,7 @@ int DiscoverKFDNodes(std::map> *nodes) { if (nodes == nullptr) { return EINVAL; } - assert(nodes->size() == 0); + assert(nodes->empty()); nodes->clear(); @@ -548,7 +548,7 @@ int DiscoverKFDNodes(std::map> *nodes) { continue; } - node = std::shared_ptr(new KFDNode(node_indx)); + node = std::make_shared(node_indx); node->Initialize(); @@ -596,16 +596,15 @@ int DiscoverKFDNodes(std::map> *nodes) { return 0; } -KFDNode::~KFDNode() { -} +KFDNode::~KFDNode() = default; int KFDNode::ReadProperties(void) { int ret; std::vector propVec; - assert(properties_.size() == 0); - if (properties_.size() > 0) { + assert(properties_.empty()); + if (!properties_.empty()) { return 0; } @@ -620,8 +619,8 @@ int KFDNode::ReadProperties(void) { uint64_t val_int; // Assume all properties are unsigned integers for now std::istringstream fs; - for (uint32_t i = 0; i < propVec.size(); ++i) { - fs.str(propVec[i]); + for (const auto & i : propVec) { + fs.str(i); fs >> key_str; fs >> val_int; diff --git a/src/rocm_smi_logger.cc b/src/rocm_smi_logger.cc index 0600654ef3..ccbb12c29a 100644 --- a/src/rocm_smi_logger.cc +++ b/src/rocm_smi_logger.cc @@ -55,7 +55,7 @@ * be printed, unless RSMI_LOGGING is enabled. * * BUFFER log type should be use while logging raw buffer or raw messages - * Having direct interface as well as C++ Singleton inface. Can use + * Having direct interface as well as C++ Singleton iface. Can use * whatever interface fits your needs. 
*/ @@ -70,7 +70,6 @@ // Code Specific Header Files(s) #include "rocm_smi/rocm_smi_logger.h" #include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_utils.h" using namespace ROCmLogging; @@ -117,7 +116,7 @@ void Logger::logIntoFile(std::string& data) { if(!m_File.is_open()) { initialize_resources(); if (!m_File.is_open()) { - std::cout << "WARNING: re-initializing resources was unsuccessfull." + std::cout << "WARNING: re-initializing resources was unsuccessful." <<" Unable to print the following message." << std::endl; logOnConsole(data); unlock(); @@ -164,7 +163,7 @@ void Logger::error(const char* text) throw() { // By default, logging is disabled // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -198,7 +197,7 @@ void Logger::alarm(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -232,7 +231,7 @@ void Logger::always(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -270,7 +269,7 @@ void Logger::buffer(const char* text) throw() { if(!m_File.is_open()) { initialize_resources(); if (!m_File.is_open()) { - std::cout << "WARNING: re-initializing resources was unsuccessfull." + std::cout << "WARNING: re-initializing resources was unsuccessful." <<" Unable to print the following message." << std::endl; std::string txtStr(text); std::cout << txtStr << std::endl; @@ -300,7 +299,7 @@ void Logger::info(const char* text) throw() { // By default, logging is disabled (ie. 
no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -334,7 +333,7 @@ void Logger::trace(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -368,7 +367,7 @@ void Logger::debug(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -426,7 +425,7 @@ void Logger::enableFileLogging() { // Returns a string of details on current log settings std::string Logger::getLogSettings() { - std::string logSettings = ""; + std::string logSettings; if (m_File.is_open()) { logSettings += "OpenStatus = File (" + logFileName + ") is open"; @@ -490,7 +489,7 @@ void Logger::initialize_resources() { // The check below allows us to toggle logging through RSMI_LOGGING // set or unset m_loggingIsOn = amd::smi::RocmSMI::getInstance().isLoggingOn(); - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } m_File.open(logFileName.c_str(), std::ios::out | std::ios::app); diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 8cb95fe7f2..1eb973c17e 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -39,25 +39,26 @@ * DEALINGS WITH THE SOFTWARE. 
* */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include + +#include #include -#include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include +#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_device.h" @@ -284,7 +285,8 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { // We are looking for the last element in the path that has the form // XXXX:XX:XX.X, where X is a hex integer (lower case is expected) - std::size_t slash_i, end_i; + std::size_t slash_i; + std::size_t end_i; std::string tmp; std::string tpath_str(tpath); @@ -333,7 +335,7 @@ RocmSMI::Initialize(uint64_t flags) { // To help debug env variable issues // printEnvVarInfo(); - while (std::string(kAMDMonitorTypes[i]) != "") { + while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); ++i; } @@ -347,12 +349,12 @@ RocmSMI::Initialize(uint64_t flags) { } uint64_t bdfid; - for (uint32_t i = 0; i < devices_.size(); ++i) { - if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) { + for (auto & device : devices_) { + if (ConstructBDFID(device->path(), &bdfid) != 0) { std::cerr << "Failed to construct BDFID." << std::endl; ret = 1; } else { - devices_[i]->set_bdfid(bdfid); + device->set_bdfid(bdfid); } } if (ret != 0) { @@ -443,8 +445,7 @@ RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags), kfd_notif_evt_fh_(-1), kfd_notif_evt_fh_refcnt_(0) { } -RocmSMI::~RocmSMI() { -} +RocmSMI::~RocmSMI() = default; RocmSMI& RocmSMI::getInstance(uint64_t flags) { // Assume c++11 or greater. 
static objects will be created by only 1 thread @@ -493,7 +494,7 @@ static inline std::unordered_set GetEnvVarUIntegerSets( if(ev_str == nullptr) { return returnSet; } std::string stringEnv = ev_str; - if (stringEnv.empty() == false) { + if (!stringEnv.empty()) { // parse out values by commas std::string parsedVal; std::istringstream ev_str_ss(stringEnv); @@ -571,7 +572,7 @@ void RocmSMI::printEnvVarInfo(void) { << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " << getLogSetting() << std::endl; - bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; + bool isLoggingOn = RocmSMI::isLoggingOn(); std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " << (isLoggingOn ? "true" : "false") << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; @@ -637,7 +638,7 @@ RocmSMI::FindMonitor(std::string monitor_path) { fs.close(); if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) { - m = std::shared_ptr(new Monitor(mon_name, &env_vars_)); + m = std::make_shared(mon_name, &env_vars_); m->setTempSensorLabelMap(); m->setVoltSensorLabelMap(); break; @@ -665,12 +666,12 @@ RocmSMI::AddToDeviceList(std::string dev_name) { dev_path += "/"; dev_path += dev_name; - auto dev = std::shared_ptr(new Device(dev_path, &env_vars_)); + auto dev = std::make_shared(dev_path, &env_vars_); std::shared_ptr m = FindMonitor(dev_path + "/device/hwmon"); dev->set_monitor(m); - std::string d_name = dev_name; + const std::string& d_name = dev_name; uint32_t card_indx = GetDeviceIndex(d_name); dev->set_drm_render_minor(GetDrmRenderMinor(dev_path)); dev->set_card_index(card_indx); @@ -681,8 +682,6 @@ RocmSMI::AddToDeviceList(std::string dev_name) { << dev_name << " | path = " << dev_path << " | card index = " << std::to_string(card_indx) << " | "; LOG_DEBUG(ss); - - return; } static const uint32_t kAmdGpuId = 0x1002; @@ -789,7 +788,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { power_mons_.clear(); } - if 
(power_mons_.size() != 0) { + if (!power_mons_.empty()) { return 0; } @@ -817,7 +816,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { if (FileExists(tmp.c_str())) { std::shared_ptr mon = - std::shared_ptr(new PowerMon(mon_name, &env_vars_)); + std::make_shared(mon_name, &env_vars_); power_mons_.push_back(mon); mon->set_dev_index(GetDeviceIndex(dentry->d_name)); } @@ -830,8 +829,8 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { return errno; } - for (auto m : power_mons_) { - for (auto d : devices_) { + for (const auto& m : power_mons_) { + for (const auto& d : devices_) { if (m->dev_index() == d->index()) { d->set_power_monitor(m); break; diff --git a/src/rocm_smi_monitor.cc b/src/rocm_smi_monitor.cc index 00035e6307..f0fe75fdbc 100755 --- a/src/rocm_smi_monitor.cc +++ b/src/rocm_smi_monitor.cc @@ -41,19 +41,18 @@ * */ -#include #include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include #include // NOLINT +#include #include -#include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" @@ -286,8 +285,7 @@ static const std::map kMonFuncDependsMap = { env_ = nullptr; #endif } -Monitor::~Monitor(void) { -} +Monitor::~Monitor(void) = default; std::string Monitor::MakeMonitorPath(MonitorTypes type, uint32_t sensor_id) { @@ -339,7 +337,7 @@ Monitor::setTempSensorLabelMap(void) { std::string type_str; int ret; - if (temp_type_index_map_.size() > 0) { + if (!temp_type_index_map_.empty()) { return 0; // We've already filled in the map } auto add_temp_sensor_entry = [&](uint32_t file_index) { @@ -377,7 +375,7 @@ Monitor::setVoltSensorLabelMap(void) { std::string type_str; int ret; - if (volt_type_index_map_.size() > 0) { + if (!volt_type_index_map_.empty()) { return 0; // We've already filled in the map } auto add_volt_sensor_entry = [&](uint32_t file_index) { @@ -510,10 +508,10 @@ typedef enum { 
static monitor_types getFuncType(std::string f_name) { monitor_types ret = eDefaultMonitor; - if (f_name.compare("rsmi_dev_temp_metric_get") == 0) { + if (f_name == "rsmi_dev_temp_metric_get") { ret = eTempMonitor; } - if (f_name.compare("rsmi_dev_volt_metric_get") == 0) { + if (f_name == "rsmi_dev_volt_metric_get") { ret = eVoltMonitor; } return ret; @@ -614,22 +612,22 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { } else { supported_monitors = intersect; } - if (supported_monitors.size() > 0) { - for (uint32_t i = 0; i < supported_monitors.size(); ++i) { + if (!supported_monitors.empty()) { + for (unsigned long & supported_monitor : supported_monitors) { if (m_type == eDefaultMonitor) { - assert(supported_monitors[i] > 0); - supported_monitors[i] |= - (supported_monitors[i] - 1) << MONITOR_TYPE_BIT_POSITION; + assert(supported_monitor > 0); + supported_monitor |= + (supported_monitor - 1) << MONITOR_TYPE_BIT_POSITION; } else if (m_type == eTempMonitor) { // Temp sensor file names are 1-based - assert(supported_monitors[i] > 0); - supported_monitors[i] |= - static_cast(getTempSensorEnum(supported_monitors[i])) + assert(supported_monitor > 0); + supported_monitor |= + static_cast(getTempSensorEnum(supported_monitor)) << MONITOR_TYPE_BIT_POSITION; } else if (m_type == eVoltMonitor) { // Voltage sensor file names are 0-based - supported_monitors[i] |= - static_cast(getVoltSensorEnum(supported_monitors[i])) + supported_monitor |= + static_cast(getVoltSensorEnum(supported_monitor)) << MONITOR_TYPE_BIT_POSITION; } else { assert(false); // Unexpected monitor type @@ -640,10 +638,10 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { } } - if (it->second.variants.size() == 0) { + if (it->second.variants.empty()) { (*supported_funcs)[it->first] = nullptr; supported_variants = nullptr; // Invoke destructor - } else if ((*supported_variants).size() > 0) { + } else if (!(*supported_variants).empty()) { 
(*supported_funcs)[it->first] = supported_variants; } diff --git a/src/rocm_smi_power_mon.cc b/src/rocm_smi_power_mon.cc index 3e1d7e0d45..454851651b 100755 --- a/src/rocm_smi_power_mon.cc +++ b/src/rocm_smi_power_mon.cc @@ -41,17 +41,14 @@ * */ -#include - -#include -#include +#include #include -#include +#include #include +#include #include +#include -#include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi_exception.h" @@ -70,8 +67,7 @@ static const std::map kMonitorNameMap = { PowerMon::PowerMon(std::string path, RocmSMI_env_vars const *e) : path_(path), env_(e) { } -PowerMon::~PowerMon(void) { -} +PowerMon::~PowerMon(void) = default; static int parse_power_str(std::string s, PowerMonTypes type, uint64_t *val) { std::stringstream ss(s); diff --git a/src/rocm_smi_properties.cc b/src/rocm_smi_properties.cc index 0e606e6874..b076991881 100644 --- a/src/rocm_smi_properties.cc +++ b/src/rocm_smi_properties.cc @@ -90,7 +90,6 @@ AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) { static_cast(AMDGpuPropertyTypesOffset_t::kClkTypes) | static_cast(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes); - auto property_type_offset = (static_cast(property_type_offset_mask) & (property_id)); auto property_type_id = (static_cast(property_id) & ~(property_type_offset_mask)); return property_type_id; @@ -435,7 +434,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id); } } - is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? 
true : false; + is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS); return tmp_amdgpu_query; }; @@ -475,13 +474,6 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end()); }; - auto ends_with = [](const std::string& value, const std::string& ending) { - if (value.size() < ending.size()) { - return false; - } - return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); - }; - // Traverse through all values for a given key osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; LOG_TRACE(osstream); diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 5009d3bd14..973d555d26 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -40,26 +40,26 @@ * DEALINGS WITH THE SOFTWARE. * */ -#include -#include -#include -#include + #include #include +#include #include +#include -#include -#include -#include +#include +#include +#include #include +#include +#include #include #include -#include -#include -#include #include -#include +#include +#include #include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -137,7 +137,7 @@ std::vector globFilesExist(const std::string& filePattern) { glob_t result_glob; memset(&result_glob, 0, sizeof(result_glob)); - if (glob(filePattern.c_str(), GLOB_TILDE, NULL, &result_glob) != 0) { + if (glob(filePattern.c_str(), GLOB_TILDE, nullptr, &result_glob) != 0) { globfree(&result_glob); // Leaving below to help debug issues discovering future glob file searches // debugFilesDiscovered(fileNames); @@ -145,7 +145,7 @@ std::vector globFilesExist(const std::string& filePattern) { } for(size_t i = 0; i < result_glob.gl_pathc; ++i) { - fileNames.push_back(std::string(result_glob.gl_pathv[i])); + fileNames.emplace_back(result_glob.gl_pathv[i]); } globfree(&result_glob); @@ -367,7 +367,7 @@ std::string removeString(const std::string 
origStr, // defaults to trim stdOut std::pair executeCommand(std::string command, bool stdOut) { char buffer[128]; - std::string stdoutAndErr = ""; + std::string stdoutAndErr; bool successfulRun = true; command = "stdbuf -i0 -o0 -e0 " + command; // remove stdOut and err buffering @@ -397,14 +397,10 @@ std::pair executeCommand(std::string command, bool stdOut) { return std::make_pair(successfulRun, stdoutAndErr); } -// originalstring - string to search for substring +// originalString - string to search for substring // substring - string looking to find bool containsString(std::string originalString, std::string substring) { - if (originalString.find(substring) != std::string::npos) { - return true; - } else { - return false; - } + return (originalString.find(substring) != std::string::npos); } // Creates and stores supplied data into a temporary file (within /tmp/). @@ -415,9 +411,9 @@ bool containsString(std::string originalString, std::string substring) { // https://man7.org/linux/man-pages/man3/mkstemp.3.html // // Temporary file name format: -// ___ +// ___ // - prefix for our application's identifier (see kTmpFilePrefix) -// - name of parameter being stored +// - name of parameter being stored // - state at which the stored value captures // - device identifier // @@ -452,9 +448,8 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, close(fd); if (rc_write == -1) { return RSMI_STATUS_FILE_ERROR; - } else { - return RSMI_STATUS_SUCCESS; } + return RSMI_STATUS_SUCCESS; } std::vector getListOfAppTmpFiles() { @@ -463,16 +458,18 @@ std::vector getListOfAppTmpFiles() { struct dirent *ent; std::vector tmpFiles; - if ((dir = opendir(path.c_str())) != nullptr) { - // captures all files & directories under specified path - while ((ent = readdir(dir)) != nullptr) { - std::string fileDirName = ent->d_name; - // we only want our app specific files - if (containsString(fileDirName, kTmpFilePrefix)) { - tmpFiles.emplace_back(path + "/" + fileDirName); - 
} else { - continue; - } + dir = opendir(path.c_str()); + if (dir == nullptr) { + return tmpFiles; + } + // captures all files & directories under specified path + while ((ent = readdir(dir)) != nullptr) { + std::string fileDirName = ent->d_name; + // we only want our app specific files + if (containsString(fileDirName, kTmpFilePrefix)) { + tmpFiles.emplace_back(path + "/" + fileDirName); + } else { + continue; } } return tmpFiles; @@ -501,7 +498,7 @@ std::vector readEntireFile(std::string path) { std::string line; while (std::getline(inFileStream, line)) { std::istringstream ss(line); - if(line.size() > 0) { + if (!line.empty()) { fileContent.push_back(line); } } @@ -513,7 +510,7 @@ std::vector readEntireFile(std::string path) { // and their content void displayAppTmpFilesContent() { std::vector tmpFiles = getListOfAppTmpFiles(); - if (tmpFiles.empty() == false) { + if (!tmpFiles.empty()) { for (auto &x: tmpFiles) { std::string out = readFile(x); std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x @@ -529,7 +526,7 @@ void displayAppTmpFilesContent() { std::string debugVectorContent(std::vector v) { std::ostringstream ss; ss << "Vector = {"; - if (v.size() > 0) { + if (!v.empty()) { for (auto it=v.begin(); it < v.end(); it++) { ss << *it; auto temp_it = it; @@ -547,7 +544,7 @@ std::string debugVectorContent(std::vector v) { std::string displayAllDevicePaths(std::vector> v) { std::ostringstream ss; ss << "Vector = {"; - if (v.size() > 0) { + if (!v.empty()) { for (auto it=v.begin(); it < v.end(); it++) { ss << (*it)->path(); auto temp_it = it; @@ -562,7 +559,7 @@ std::string displayAllDevicePaths(std::vector> v) { } // Attempts to read application specific temporary file -// This method is to be used for reading (or determing if it exists), +// This method is to be used for reading (or determining if it exists), // in order to keep file naming scheme consistent. 
// // dv_ind - device index @@ -580,7 +577,7 @@ std::tuple readTmpFile(uint32_t dv_ind, "_" + std::to_string(dv_ind); std::string fileContent; std::vector tmpFiles = getListOfAppTmpFiles(); - if (tmpFiles.empty() == false) { + if (!tmpFiles.empty()) { for (auto &x: tmpFiles) { if (containsString(x, tmpFileName)) { fileContent = readFile(x); @@ -620,7 +617,11 @@ std::tuple fileContent = readEntireFile(filePath); for (auto &line: fileContent) { if (line.find("PRETTY_NAME=") != std::string::npos) { @@ -668,11 +669,17 @@ std::tupleSetUp(); test->Run(); - return; } static void RunCustomTestEpilog(TestBase *tst) { if (sRSMIGlvalues->verbosity >= TestBase::VERBOSE_STANDARD) { tst->DisplayResults(); } tst->Close(); - return; } // If the test case one big test, you should use RunGenericTest() @@ -127,7 +125,6 @@ static void RunCustomTestEpilog(TestBase *tst) { static void RunGenericTest(TestBase *test) { RunCustomTestProlog(test); RunCustomTestEpilog(test); - return; } // TEST ENTRY TEMPLATE: diff --git a/tests/rocm_smi_test/test_base.cc b/tests/rocm_smi_test/test_base.cc index a406868c63..0acce3a150 100755 --- a/tests/rocm_smi_test/test_base.cc +++ b/tests/rocm_smi_test/test_base.cc @@ -43,7 +43,7 @@ * */ -#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi_test/test_base.h" @@ -61,10 +61,9 @@ static const char kResultsLabel[] = "TEST RESULTS"; // This one is used outside this file const char kSetupLabel[] = "TEST SETUP"; -TestBase::TestBase() : setup_failed_(false), description_("") { -} -TestBase::~TestBase() { +TestBase::TestBase() : setup_failed_(false) { } +TestBase::~TestBase() = default; void TestBase::MakeHeaderStr(const char *inStr, std::string *outStr) const { @@ -116,8 +115,6 @@ void TestBase::SetUp(uint64_t init_flags) { std::cout << "No ROCm SMI tests can be run." 
<< std::endl; } } - - return; } void TestBase::PrintDeviceHeader(uint32_t dv_ind) { @@ -213,7 +210,7 @@ void TestBase::set_description(std::string d) { size_t endlptr; for (size_t i = le; i < description_.size(); i += le) { - endlptr = description_.find_last_of(" ", i); + endlptr = description_.find_last_of(' ', i); description_.replace(endlptr, 1, "\n"); i = endlptr; } diff --git a/tests/rocm_smi_test/test_base.h b/tests/rocm_smi_test/test_base.h index d2ced349bc..c175f55c9b 100755 --- a/tests/rocm_smi_test/test_base.h +++ b/tests/rocm_smi_test/test_base.h @@ -45,6 +45,7 @@ #ifndef TESTS_ROCM_SMI_TEST_TEST_BASE_H_ #define TESTS_ROCM_SMI_TEST_TEST_BASE_H_ +#include #include class TestBase { @@ -142,9 +143,8 @@ class TestBase { "\t===> Abort is over-ridden due to dont_fail command line option." \ << std::endl; \ return; \ - } else { \ - ASSERT_EQ(RSMI_STATUS_SUCCESS, (RET)); \ } \ + ASSERT_EQ(RSMI_STATUS_SUCCESS, (RET)); \ } void MakeHeaderStr(const char *inStr, std::string *outStr); diff --git a/tests/rocm_smi_test/test_common.cc b/tests/rocm_smi_test/test_common.cc index d7a8a34d86..db6fa24098 100755 --- a/tests/rocm_smi_test/test_common.cc +++ b/tests/rocm_smi_test/test_common.cc @@ -43,13 +43,13 @@ * */ -#include -#include #include +#include +#include #include -#include #include +#include #include "rocm_smi_test/test_base.h" #include "rocm_smi_test/test_common.h" diff --git a/tests/rocm_smi_test/test_common.h b/tests/rocm_smi_test/test_common.h index e601fb0e8e..ba425cc462 100755 --- a/tests/rocm_smi_test/test_common.h +++ b/tests/rocm_smi_test/test_common.h @@ -74,7 +74,7 @@ void DumpMonitorInfo(const TestBase *test); #endif #define DISPLAY_RSMI_ERR(RET) { \ - if (RET != RSMI_STATUS_SUCCESS) { \ + if ((RET) != RSMI_STATUS_SUCCESS) { \ const char *err_str; \ std::cout << "\t===> ERROR: RSMI call returned " << (RET) << std::endl; \ rsmi_status_string((RET), &err_str); \ @@ -91,7 +91,7 @@ void DumpMonitorInfo(const TestBase *test); } \ } #define 
CHK_RSMI_PERM_ERR(RET) { \ - if (RET == RSMI_STATUS_PERMISSION) { \ + if ((RET) == RSMI_STATUS_PERMISSION) { \ std::cout << "This command requires root access." << std::endl; \ } else { \ DISPLAY_RSMI_ERR(RET) \ From ed6777a8e7f94c0f82f433284d3c115454481074 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Fri, 25 Aug 2023 22:25:25 -0500 Subject: [PATCH 04/19] Add GPU partition nodes * Updates: - Fixed infinite loop on systems which did not have VRAM files - Fixed concise info from throwing exception with no amdgpu driver loaded - Fix for ability to see all nodes after switching partitions (mirrors original card display/settings) - Added to logs build type, lib path, and set env. variables Change-Id: Ic0333df355144ce2242cecea93fe4ce51caf311c Signed-off-by: Charis Poag --- include/rocm_smi/rocm_smi_kfd.h | 4 + include/rocm_smi/rocm_smi_main.h | 3 +- include/rocm_smi/rocm_smi_utils.h | 6 +- oam/CMakeLists.txt | 2 +- python_smi_tools/rocm_smi.py | 11 +- rocm_smi/CMakeLists.txt | 2 +- src/rocm_smi.cc | 41 +++++- src/rocm_smi_kfd.cc | 91 +++++++++++++- src/rocm_smi_main.cc | 194 +++++++++++++++++++++++------ src/rocm_smi_utils.cc | 100 ++++++++++++--- tests/rocm_smi_test/CMakeLists.txt | 3 +- 11 files changed, 390 insertions(+), 67 deletions(-) diff --git a/include/rocm_smi/rocm_smi_kfd.h b/include/rocm_smi/rocm_smi_kfd.h index 9cf8fd8e40..90c7f6ff3b 100755 --- a/include/rocm_smi/rocm_smi_kfd.h +++ b/include/rocm_smi/rocm_smi_kfd.h @@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_count); int ReadKFDDeviceProperties(uint32_t dev_id, std::vector *retVec); +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val); +int get_gpu_id(uint32_t node, uint64_t *gpu_id); + } // namespace smi } // namespace amd diff --git a/include/rocm_smi/rocm_smi_main.h b/include/rocm_smi/rocm_smi_main.h index f276bd85bb..8b60324988 100755 --- a/include/rocm_smi/rocm_smi_main.h +++ b/include/rocm_smi/rocm_smi_main.h @@ -113,7 +113,8 @@ class
RocmSMI { uint64_t *weight); int get_node_index(uint32_t dv_ind, uint32_t *node_ind); const RocmSMI_env_vars& getEnv(void); - void printEnvVarInfo(void); + std::string getRSMIEnvVarInfo(void); + void debugRSMIEnvVarInfo(); bool isLoggingOn(void); uint32_t getLogSetting(void); static const std::map devInfoTypesStrings; diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index 5ba813a273..49a3521dc1 100755 --- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -99,13 +99,17 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type, rsmi_status_t ErrnoToRsmiStatus(int err); std::string getRSMIStatusString(rsmi_status_t ret); std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void); void logSystemDetails(void); rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); void logHexDump(const char *desc, const void *addr, const size_t len, size_t perLine); bool isSystemBigEndian(); +std::string getBuildType(); +std::string getMyLibPath(); +int subDirectoryCountInPath(const std::string path); template std::string print_int_as_hex(T i, bool showHexNotation=true) { std::stringstream ss; diff --git a/oam/CMakeLists.txt b/oam/CMakeLists.txt index dc674b0bb0..6927d245e6 100644 --- a/oam/CMakeLists.txt +++ b/oam/CMakeLists.txt @@ -72,7 +72,7 @@ target_include_directories(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_INC_LIST}) target_link_libraries(${OAM_EXAMPLE_EXE} ${OAM_TARGET}) add_library(${OAM_TARGET} ${CMN_SRC_LIST} ${OAM_SRC_LIST} ${CMN_INC_LIST} ${OAM_INC_LIST}) -target_link_libraries(${OAM_TARGET} pthread rt) +target_link_libraries(${OAM_TARGET} pthread rt dl) target_include_directories(${OAM_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${COMMON_PROJ_ROOT}/common/shared_mutex) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index d0bb5ab365..1d4c7e69a1 100755 --- a/python_smi_tools/rocm_smi.py +++ 
b/python_smi_tools/rocm_smi.py @@ -1594,7 +1594,9 @@ def showAllConcise(deviceList): printLogSpacer(' Concise Info ') deviceList.sort() - (temp_type, _) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) available_temp_type = temp_type.lower() available_temp_type = available_temp_type.replace('(', '') available_temp_type = available_temp_type.replace(')', '') @@ -1843,7 +1845,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) else: logging.debug('PCIe clock is unsupported on device[{}]'.format(device)) - printLogSpacer() + if not concise: + printLogSpacer() def showCurrentFans(deviceList): @@ -2786,7 +2789,9 @@ def getGraphColor(percentage): def showTempGraph(deviceList): deviceList.sort() - (temp_type, temp_value) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) printLogSpacer(' Temperature Graph ' + temp_type + ' ') temp_type = temp_type.lower() temp_type = temp_type.replace('(', '') diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt index ae8c017933..08b2599542 100755 --- a/rocm_smi/CMakeLists.txt +++ b/rocm_smi/CMakeLists.txt @@ -80,7 +80,7 @@ add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc") target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) add_library(${ROCM_SMI_TARGET} ${CMN_SRC_LIST} ${SMI_SRC_LIST} ${CMN_INC_LIST} ${SMI_INC_LIST}) -target_link_libraries(${ROCM_SMI_TARGET} pthread rt) +target_link_libraries(${ROCM_SMI_TARGET} pthread rt dl) target_include_directories(${ROCM_SMI_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${COMMON_PROJ_ROOT}/common/shared_mutex) diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 506e784206..d8bd892ac8 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -2991,10 
+2991,24 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) { GET_DEV_AND_KFDNODE_FROM_INDX if (kfd_node->get_total_memory(total) == 0 && *total > 0) { + ss << __PRETTY_FUNCTION__ + << " | inside success fallback... " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | after fallback... " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH } @@ -3036,11 +3050,36 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, GET_DEV_AND_KFDNODE_FROM_INDX uint64_t total = 0; ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total); - if (total != 0) return ret; // do not need to fallback + if (total != 0) { + ss << __PRETTY_FUNCTION__ + << " no fallback needed! - " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); + return ret; // do not need to fallback + } if ( kfd_node->get_used_memory(used) == 0 ) { + ss << __PRETTY_FUNCTION__ + << " | in fallback == success ..." 
+ << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | at end!!!! after fallback ..." + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH diff --git a/src/rocm_smi_kfd.cc b/src/rocm_smi_kfd.cc index afe567d80d..7fe9004cc3 100755 --- a/src/rocm_smi_kfd.cc +++ b/src/rocm_smi_kfd.cc @@ -63,6 +63,7 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_logger.h" namespace amd { namespace smi { @@ -775,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, // /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties // size_in_bytes 68702699520 int KFDNode::get_total_memory(uint64_t* total) { - if (total == nullptr) return EINVAL; + std::ostringstream ss; + if (total == nullptr) { + return EINVAL; + } *total = 0; std::string f_path = kKFDNodesPathRoot; f_path += "/"; f_path += std::to_string(node_indx_); f_path += "/mem_banks"; + int subDirCount = subDirectoryCountInPath(f_path); + ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path + << " has subdirectory count = " << std::to_string(subDirCount); + LOG_DEBUG(ss); auto kfd_node_dir = opendir(f_path.c_str()); if (kfd_node_dir == nullptr) { return errno; } auto dentry = readdir(kfd_node_dir); - while (dentry != nullptr) { + while (dentry != nullptr && subDirCount > 0) { + ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path + << " has subdirectory count = " << 
std::to_string(subDirCount); + LOG_DEBUG(ss); if (dentry->d_name[0] == '.') { dentry = readdir(kfd_node_dir); continue; @@ -822,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) { } } } // end loop for lines in property file + subDirCount--; } // end loop for mem_bank directory if (closedir(kfd_node_dir)) { @@ -862,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) { return 1; } +// /sys/class/kfd/kfd/topology/nodes/*/properties +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val) { + std::ostringstream ss; + int retVal = EINVAL; + if (property_name.empty() || val == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", property_name is empty or *val is nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if (KFDNodeSupported(node)) { + retVal = myNode->get_property_value(property_name, val); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for property_name = " << property_name + << " | Data (" << property_name << ") * val = " + << std::to_string(*val) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." 
+ << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + +// /sys/class/kfd/kfd/topology/nodes/*/gpu_id +int get_gpu_id(uint32_t node, uint64_t *gpu_id) { + std::ostringstream ss; + int retVal = EINVAL; + if (gpu_id == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", gpu_id is a nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if (KFDNodeSupported(node)) { + retVal = ReadKFDGpuId(node, gpu_id); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for gpu_id" + << " | Data (gpu_id) *gpu_id = " + << std::to_string(*gpu_id) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." 
+ << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + } // namespace smi } // namespace amd diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 1eb973c17e..831e382b93 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -333,7 +333,7 @@ RocmSMI::Initialize(uint64_t flags) { GetEnvVariables(); // To help debug env variable issues - // printEnvVarInfo(); + // debugRSMIEnvVarInfo(); while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); @@ -390,7 +390,7 @@ RocmSMI::Initialize(uint64_t flags) { uint64_t bdfid = (*dev_iter)->bdfid(); if (tmp_map.find(bdfid) == tmp_map.end()) { ss << __PRETTY_FUNCTION__ << " | removing device = " - << (*dev_iter)->path(); + << (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid); dev_iter = devices_.erase(dev_iter); LOG_DEBUG(ss); continue; @@ -549,48 +549,54 @@ uint32_t RocmSMI::getLogSetting() { return this->env_vars_.logging_on; } -void RocmSMI::printEnvVarInfo(void) { - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = " - << ((env_vars_.debug_output_bitfield == 0) ? "" - : std::to_string(env_vars_.debug_output_bitfield)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = " - << ((env_vars_.path_DRM_root_override == nullptr) - ? "" : env_vars_.path_DRM_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = " - << ((env_vars_.path_HWMon_root_override == nullptr) - ? "" : env_vars_.path_HWMon_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = " - << ((env_vars_.path_power_root_override == nullptr) - ? "" : env_vars_.path_power_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = " - << ((env_vars_.debug_inf_loop == 0) ? 
"" - : std::to_string(env_vars_.debug_inf_loop)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " +void RocmSMI::debugRSMIEnvVarInfo(void) { + std::cout << __PRETTY_FUNCTION__ + << RocmSMI::getInstance().getRSMIEnvVarInfo(); +} + +std::string RocmSMI::getRSMIEnvVarInfo(void) { + std::ostringstream ss; + ss << "\n\tRSMI_DEBUG_BITFIELD = " + << ((env_vars_.debug_output_bitfield == 0) ? "" + : std::to_string(env_vars_.debug_output_bitfield)) + << std::endl; + ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = " + << ((env_vars_.path_DRM_root_override == nullptr) + ? "" : env_vars_.path_DRM_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = " + << ((env_vars_.path_HWMon_root_override == nullptr) + ? "" : env_vars_.path_HWMon_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = " + << ((env_vars_.path_power_root_override == nullptr) + ? "" : env_vars_.path_power_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_INFINITE_LOOP = " + << ((env_vars_.debug_inf_loop == 0) ? "" + : std::to_string(env_vars_.debug_inf_loop)) + << std::endl; + ss << "\tRSMI_LOGGING = " << getLogSetting() << std::endl; - bool isLoggingOn = RocmSMI::isLoggingOn(); - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " - << (isLoggingOn ? "true" : "false") << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; + bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; + ss << "\tRSMI_LOGGING (are logs on) = " + << (isLoggingOn ? 
"TRUE" : "FALSE") << std::endl; + ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {"; if (env_vars_.enum_overrides.empty()) { - std::cout << "}" << std::endl; - return; + ss << "}" << std::endl; + return ss.str(); } for (auto it=env_vars_.enum_overrides.begin(); it != env_vars_.enum_overrides.end(); ++it) { DevInfoTypes type = static_cast(*it); - std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) - + ")"); + ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")"); auto temp_it = it; if(++temp_it != env_vars_.enum_overrides.end()) { - std::cout << ", "; + ss << ", "; } } - std::cout << "}" << std::endl; + ss << "}" << std::endl; + return ss.str(); } std::shared_ptr @@ -692,8 +698,7 @@ static bool isAMDGPU(std::string dev_path) { std::string vend_path = dev_path + "/device/vendor"; if (!FileExists(vend_path.c_str())) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -703,8 +708,7 @@ static bool isAMDGPU(std::string dev_path) { if (!fs.is_open()) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -719,8 +723,7 @@ static bool isAMDGPU(std::string dev_path) { isAmdGpu = true; } ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? 
"TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -728,6 +731,7 @@ static bool isAMDGPU(std::string dev_path) { uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { std::string err_msg; uint32_t count = 0; + std::ostringstream ss; // If this gets called more than once, clear previous findings. devices_.clear(); @@ -754,17 +758,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { } dentry = readdir(drm_dir); } + ss << __PRETTY_FUNCTION__ << " | Discovered a potential of " + << std::to_string(count) << " cards" << " | "; + LOG_DEBUG(ss); + struct systemNode { + uint32_t s_node_id = 0; + uint64_t s_gpu_id = 0; + uint64_t s_unique_id = 0; + }; + // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id} + std::multimap allSystemNodes; + uint32_t node_id = 0; + while (true) { + uint64_t gpu_id = 0, unique_id = 0; + int ret_gpu_id = get_gpu_id(node_id, &gpu_id); + int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); + if (ret_gpu_id == 0 || ret_unique_id == 0) { + systemNode myNode; + myNode.s_node_id = node_id; + myNode.s_gpu_id = gpu_id; + myNode.s_unique_id = unique_id; + if(gpu_id != 0) { // only add gpu nodes, 0 = CPU + allSystemNodes.emplace(unique_id, myNode); + } + } else { + break; + } + node_id++; + } + + ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {"; + for(auto i: allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], " + ; + } + ss << "}"; + LOG_DEBUG(ss); + + // Discover all root cards & gpu partitions associated with each for (uint32_t node_id = 0; node_id < count; node_id++) { std::string path = kPathDRMRoot; path += "/card"; path += std::to_string(node_id); + uint64_t primary_unique_id = 0; + + // each identified gpu card node is a primary node for + // potential matching unique ids if (isAMDGPU(path) || (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { 
std::string d_name = "card"; d_name += std::to_string(node_id); AddToDeviceList(d_name); - } + + ss << __PRETTY_FUNCTION__ + << " | Ordered system nodes seen in lookup = {"; + for (auto i : allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], "; + } + ss << "}"; + LOG_DEBUG(ss); + + uint64_t temp_primary_unique_id = 0; + if (allSystemNodes.empty()) { + continue; + } + + // get lowest key 1st to keep order of nodes matching card + uint32_t lowest_NodeId = 0; + uint32_t curr_NodeId = 0; + + for (auto it = allSystemNodes.begin(), end = allSystemNodes.end(); + it != end; it = allSystemNodes.upper_bound(it->first)) { + curr_NodeId = it->second.s_node_id; + if (it == allSystemNodes.begin()) { + lowest_NodeId = it->second.s_node_id; + } + if (curr_NodeId <= lowest_NodeId) { + lowest_NodeId = curr_NodeId; + temp_primary_unique_id = it->second.s_unique_id; + } + } + ss << __PRETTY_FUNCTION__ + << " | lowest_NodeId = " << std::to_string(lowest_NodeId) + << " | curr_NodeId = " << std::to_string(curr_NodeId) + << " | temp_primary_unique_id = " + << std::to_string(temp_primary_unique_id); + LOG_DEBUG(ss); + + if (temp_primary_unique_id != 0) { + primary_unique_id = temp_primary_unique_id; + } else { + allSystemNodes.erase(primary_unique_id); + continue; + } + + auto numb_nodes = allSystemNodes.count(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = " + << std::to_string(primary_unique_id) << " has " + << std::to_string(numb_nodes) << " known gpu nodes"; + LOG_DEBUG(ss); + while (numb_nodes > 1) { + std::string secNode = "card"; + secNode += std::to_string(node_id); // add the primary node id + AddToDeviceList(secNode); + numb_nodes--; + } + // remove already added nodes associated with current card + auto erasedNodes = allSystemNodes.erase(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " 
| After finding primary_unique_id = " + << std::to_string(primary_unique_id) << " erased " + << std::to_string(erasedNodes) << " nodes"; + LOG_DEBUG(ss); + } } if (closedir(drm_dir)) { diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 973d555d26..1e9a444320 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -40,12 +40,17 @@ * DEALINGS WITH THE SOFTWARE. * */ - +#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see + // _GNU_SOURCE functions which check +#include +#include +#include +#include #include #include #include #include -#include +#include #include #include @@ -612,7 +617,8 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) // Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void) { struct utsname buf; bool errorDetected = false; @@ -625,6 +631,9 @@ std::tupled_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) { + continue; + } + + if (fstatat(dirfd(srcdir), dent->d_name, &st, 0) < 0) { + perror(dent->d_name); + continue; + } + + if (S_ISDIR(st.st_mode)) { + dir_count++; + } + } + closedir(srcdir); + return dir_count; +} } // namespace smi } // namespace amd diff --git a/tests/rocm_smi_test/CMakeLists.txt b/tests/rocm_smi_test/CMakeLists.txt index 9c8ed197dc..b2347e0bff 100755 --- a/tests/rocm_smi_test/CMakeLists.txt +++ b/tests/rocm_smi_test/CMakeLists.txt @@ -67,7 +67,8 @@ target_link_libraries( PUBLIC GTest::gtest_main PUBLIC c PUBLIC stdc++ - PUBLIC pthread) + PUBLIC pthread + PUBLIC dl) install(TARGETS ${RSMITST} gtest gtest_main DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests From 41ade41d8467eadc37ab60c4a423a1ece9a65449 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 11 Sep 2023 19:23:46 -0500 Subject: [PATCH 05/19] SWDEV-409184 - Fix erroneous 'not 
supported' when HWMON is absent Change-Id: Ic5ff406977d962fadc709a03853dac61b5460a26 Signed-off-by: Galantsev, Dmitrii --- src/rocm_smi_device.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 95f27d8e12..b01734f0b1 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -1101,10 +1101,6 @@ void Device::fillSupportedFuncs(void) { if (!supported_funcs_.empty()) { return; } - if (monitor() == nullptr) { - return; - } - std::map::const_iterator it = kDevFuncDependsMap.begin(); std::string dev_rt = path_ + "/device"; @@ -1160,7 +1156,9 @@ void Device::fillSupportedFuncs(void) { it++; } - monitor()->fillSupportedFuncs(&supported_funcs_); + if (monitor() != nullptr) { + monitor()->fillSupportedFuncs(&supported_funcs_); + } // DumpSupportedFunctions(); } From ff992e9b56669fc2897b24567d8ae9c2f14dd5be Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 11 Sep 2023 19:43:25 -0500 Subject: [PATCH 06/19] TESTS - re-enable frequency tests on aqua_vanjaram Change-Id: I8fcd9418da5b973897ccfffc7d8a2f3ea833ea77 Signed-off-by: Galantsev, Dmitrii --- tests/rocm_smi_test/rsmitst.exclude | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/rocm_smi_test/rsmitst.exclude b/tests/rocm_smi_test/rsmitst.exclude index 43a738ab22..0632242f93 100644 --- a/tests/rocm_smi_test/rsmitst.exclude +++ b/tests/rocm_smi_test/rsmitst.exclude @@ -65,8 +65,6 @@ $BLACKLIST_ALL_ASICS\ FILTER[90400]=\ $BLACKLIST_ALL_ASICS\ "rsmitstReadOnly.TestVoltCurvRead:"\ -"rsmitstReadOnly.TestFrequenciesRead:"\ -"rsmitstReadWrite.TestFrequenciesReadWrite:"\ "rsmitstReadWrite.TestPowerReadWrite" FILTER[90401]=${FILTER[90400]} FILTER[90402]=${FILTER[90400]} From 4acfb00ad59782cb011e19a05c15e37730af1f7c Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 7 Sep 2023 16:20:30 -0500 Subject: [PATCH 07/19] PY: Silence error output when printing concise info Change-Id: I9ce4ad523b3fe2ec8afc5bea791810ec67558f11 
Signed-off-by: Galantsev, Dmitrii --- python_smi_tools/rocm_smi.py | 181 +++++++++++++++++++++-------------- 1 file changed, 107 insertions(+), 74 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 1d4c7e69a1..4a943a66a4 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -173,10 +173,12 @@ def formatMatrixToJSON(deviceList, matrix, metricName): printSysLog(metricName.format(deviceList[row_indx], deviceList[col_ind]), valueStr) -def getBus(device): +def getBus(device, silent=False): """ Return the bus identifier of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ bdfid = c_uint64(0) ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid)) @@ -188,16 +190,18 @@ def getBus(device): function = bdfid.value & 0x7 pic_id = '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, device, function) - if rsmi_ret_ok(ret, device, 'get_pci_id'): + if rsmi_ret_ok(ret, device, 'get_pci_id', silent): return pic_id -def getFanSpeed(device): +def getFanSpeed(device, silent=True): """ Return a tuple with the fan speed (value,%) for a specified device, or (None,None) if either current fan speed or max fan speed cannot be obtained @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. 
""" fanLevel = c_int64() fanMax = c_int64() @@ -209,7 +213,7 @@ def getFanSpeed(device): /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX """ ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel)) - if rsmi_ret_ok(ret, device, 'get_fan_speed', True): + if rsmi_ret_ok(ret, device, 'get_fan_speed', silent): fl = fanLevel.value last_ret = ret @@ -217,7 +221,7 @@ def getFanSpeed(device): /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX """ ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax)) - if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True): + if rsmi_ret_ok(ret, device, 'get_fan_max_speed', silent): fm = fanMax.value """ In case we had an error before, we don't overwrite it with a @@ -232,59 +236,67 @@ def getFanSpeed(device): return (last_ret, fl, round((float(fl) / float(fm)) * 100, 2)) -def getGpuUse(device): +def getGpuUse(device, silent=False): """ Return the current GPU usage as a percentage @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ percent = c_uint32() ret = rocmsmi.rsmi_dev_busy_percent_get(device, byref(percent)) - if rsmi_ret_ok(ret, device, 'GPU Utilization '): + if rsmi_ret_ok(ret, device, 'GPU Utilization ', silent): return percent.value return -1 -def getId(device): +def getId(device, silent=False): """ Return the hexadecimal value of a device's ID @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ dv_id = c_short() ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id)) - if rsmi_ret_ok(ret, device, 'get_device_id'): + if rsmi_ret_ok(ret, device, 'get_device_id', silent): return hex(dv_id.value) -def getRev(device): +def getRev(device, silent=False): """ Return the hexadecimal value of a device's Revision @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). 
Default is off. """ dv_rev = c_short() ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev)) - if rsmi_ret_ok(ret, device, 'get_device_rev'): + if rsmi_ret_ok(ret, device, 'get_device_rev', silent): return hex(dv_rev.value) -def getMaxPower(device): +def getMaxPower(device, silent=False): """ Return the maximum power cap of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ power_cap = c_uint64() ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap)) - if rsmi_ret_ok(ret, device, 'get_power_cap'): + if rsmi_ret_ok(ret, device, 'get_power_cap', silent): return power_cap.value / 1000000 return -1 -def getMemInfo(device, memType, quiet=False): +def getMemInfo(device, memType, silent=False): """ Returns a tuple of (memory_used, memory_total) of the requested memory type usage for the device specified @param device: DRM device identifier @param type: [vram|vis_vram|gtt] Memory type to return - @param quiet=Turn on to silience error output + @param silent=Turn on to silence error output (you plan to handle manually). Default is off, which exposes any issue accessing the different memory types. 
@@ -300,11 +312,11 @@ def getMemInfo(device, memType, quiet=False): memTotal = None ret = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(memoryUse)) - if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), quiet): + if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), silent): memUsed = memoryUse.value ret = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(memoryTot)) - if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), quiet): + if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), silent): memTotal = memoryTot.value return (memUsed, memTotal) @@ -334,14 +346,16 @@ def getProcessName(pid): return pName -def getPerfLevel(device): +def getPerfLevel(device, silent=False): """ Return the current performance level of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ perf = rsmi_dev_perf_level_t() ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf)) - if rsmi_ret_ok(ret, device, 'get_perf_level'): + if rsmi_ret_ok(ret, device, 'get_perf_level', silent): return perf_level_string(perf.value) return 'N/A' @@ -369,42 +383,48 @@ def getPidList(): return -def getPower(device): +def getPower(device, silent=False): """ Return the current power level of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. 
""" power = c_uint32() ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power)) - if rsmi_ret_ok(ret, device, 'get_power_avg'): + if rsmi_ret_ok(ret, device, 'get_power_avg', silent): return power.value / 1000000 return 'N/A' -def getRasEnablement(device, block): +def getRasEnablement(device, block, silent=True): """ Return RAS enablement state for a given device @param device: DRM device identifier @param block: RAS block identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ state = rsmi_ras_err_state_t() ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state)) - if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True): + if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), silent): return rsmi_ras_err_stale_machine[state.value].upper() return 'N/A' -def getTemp(device, sensor): +def getTemp(device, sensor, silent=True): """ Display the current temperature from a given device's sensor @param device: DRM device identifier @param sensor: Temperature sensor identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ temp = c_int64(0) metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(temp)) - if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), True): + if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), silent): return temp.value / 1000 return 'N/A' @@ -428,52 +448,60 @@ def findFirstAvailableTemp(device): continue return (ret_temp_type, ret_temp) -def getVbiosVersion(device): +def getVbiosVersion(device, silent=False): """ Returns the VBIOS version for a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. 
""" vbios = create_string_buffer(256) ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256) if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: return "Unsupported" - elif rsmi_ret_ok(ret, device): + elif rsmi_ret_ok(ret, device, silent=silent): return vbios.value.decode() -def getVersion(deviceList, component): +def getVersion(deviceList, component, silent=False): """ Return the software version for the specified component @param deviceList: List of DRM devices (can be a single-item list) @param component: Component (currently only driver) + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ ver_str = create_string_buffer(256) ret = rocmsmi.rsmi_version_str_get(component, ver_str, 256) - if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component)): + if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component), silent): return ver_str.value.decode() return None -def getComputePartition(device): +def getComputePartition(device, silent=True): """ Return the current compute partition of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ currentComputePartition = create_string_buffer(256) ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256) - if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode(): + if rsmi_ret_ok(ret, device, 'get_compute_partition', silent) and currentComputePartition.value.decode(): return str(currentComputePartition.value.decode()) return "N/A" -def getMemoryPartition(device): +def getMemoryPartition(device, silent=True): """ Return the current memory partition of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. 
""" currentNPSMode = create_string_buffer(256) ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256) - if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode(): + if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent) and currentNPSMode.value.decode(): return str(currentNPSMode.value.decode()) return "N/A" @@ -1591,6 +1619,7 @@ def showAllConcise(deviceList): MAX_ALL_CONCISE_WIDTH = 100 appWidth_temp = appWidth appWidth = MAX_ALL_CONCISE_WIDTH + silent = True printLogSpacer(' Concise Info ') deviceList.sort() @@ -1618,9 +1647,9 @@ def showAllConcise(deviceList): values = {} degree_sign = u'\N{DEGREE SIGN}' for device in deviceList: - gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info = getDevProductInfo(device, silent) gpu_dev_product_info_names = list(gpu_dev_product_info[device]) - temp_val = str(getTemp(device, available_temp_type)) + temp_val = str(getTemp(device, available_temp_type, silent)) if temp_val != 'N/A': temp_val += degree_sign + 'C' avgPwr = str(getPower(device)) @@ -1628,26 +1657,25 @@ def showAllConcise(deviceList): avgPwr += 'W' else: avgPwr = 'N/A' - combined_partition = (getMemoryPartition(device) + ", " - + getComputePartition(device)) - concise = True - sclk = showCurrentClocks([device], 'sclk', concise) - mclk = showCurrentClocks([device], 'mclk', concise) - (retCode, fanLevel, fanSpeed) = getFanSpeed(device) + combined_partition = (getMemoryPartition(device, silent) + ", " + + getComputePartition(device, silent)) + sclk = showCurrentClocks([device], 'sclk', concise=silent) + mclk = showCurrentClocks([device], 'mclk', concise=silent) + (retCode, fanLevel, fanSpeed) = getFanSpeed(device, silent) fan = str(fanSpeed) + '%' - if getPerfLevel(device) != -1: - perf = getPerfLevel(device) + if getPerfLevel(device, silent) != -1: + perf = getPerfLevel(device, silent) else: perf = 'Unsupported' - if getMaxPower(device) != -1: - pwrCap = str(getMaxPower(device)) + 'W' + if 
getMaxPower(device, silent) != -1: + pwrCap = str(getMaxPower(device, silent)) + 'W' else: pwrCap = 'Unsupported' - if getGpuUse(device) != -1: - gpu_busy = str(getGpuUse(device)) + '%' + if getGpuUse(device, silent) != -1: + gpu_busy = str(getGpuUse(device, silent)) + '%' else: gpu_busy = 'Unsupported' - vram_used, vram_total = getMemInfo(device, 'vram', True) + vram_used, vram_total = getMemInfo(device, 'vram', silent) mem_use_pct = 0 if vram_used is None: mem_use_pct='Unsupported' @@ -1681,7 +1709,7 @@ def showAllConcise(deviceList): for device in deviceList: printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), values['card%s' % (str(device))])), None) - gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info = getDevProductInfo(device, silent) gpu_dev_product_info_names = list(gpu_dev_product_info[device]) if (len(gpu_dev_product_info_names) > 1): printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in @@ -1705,19 +1733,20 @@ def showAllConciseHw(deviceList): header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS'] head_widths = [len(head) + 2 for head in header] values = {} + silent = True for device in deviceList: - gpuid = getId(device) + gpuid = getId(device, silent) if str(gpuid).startswith('0x'): gpuid = str(gpuid)[2:] - gpurev = getRev(device) + gpurev = getRev(device, silent) if str(gpurev).startswith('0x'): gpurev = str(gpurev)[2:] - gfxRas = getRasEnablement(device, 'GFX') - sdmaRas = getRasEnablement(device, 'SDMA') - umcRas = getRasEnablement(device, 'UMC') - vbios = getVbiosVersion(device) - bus = getBus(device) + gfxRas = getRasEnablement(device, 'GFX', silent) + sdmaRas = getRasEnablement(device, 'SDMA', silent) + umcRas = getRasEnablement(device, 'UMC', silent) + vbios = getVbiosVersion(device, silent) + bus = getBus(device, silent) values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus] val_widths = 
{} for device in deviceList: @@ -1812,8 +1841,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): if concise: # in case function is used for concise output, no need to print. return '{:.0f}Mhz'.format(fr) printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(levl, fr)) - else: - printErrLog(device, '%s clock is unsupported' % (clk_defined)) + elif not concise: + logging.debug('{} clock is unsupported on device[{}]'.format(clk_defined, device)) else: # if clk is not defined, will display all current clk for clk_type in sorted(rsmi_clk_names_dict): @@ -1830,7 +1859,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): printLog(device, '%s clock level:' % (clk_type), levl) else: printLog(device, '%s clock level: %s' % (clk_type, levl), '(%sMhz)' % (str(fr)[:-2])) - else: + elif not concise: logging.debug('{} clock is unsupported on device[{}]'.format(clk_type, device)) # pcie clocks if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1: @@ -1843,8 +1872,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000, bw.lanes[current_f]) printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) - else: - logging.debug('PCIe clock is unsupported on device[{}]'.format(device)) + elif not concise: + logging.debug('{} clock is unsupported on device[{}]'.format('PCIe', device)) if not concise: printLogSpacer() @@ -2403,47 +2432,51 @@ def showProductName(deviceList): printLogSpacer() -def getDevProductInfo(device): +def getDevProductInfo(device, silent=False): """ Show the requested product name for the device requested @param device: Device we want to get the info for + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. 
""" # Retrieve card vendor MAX_BUFF_SIZE = 256 MAX_DESC_SIZE = 20 - device_info = "N/A" + device_series = "N/A" + device_model = "N/A" + gpu_revision = "N/A" device_list = {} vendor = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, MAX_BUFF_SIZE) # Only continue if GPU vendor is AMD - if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device): + if rsmi_ret_ok(ret, device, 'get_vendor_name', silent) and isAmdDevice(device): # Retrieve the device series series = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_name_get(device, series, MAX_BUFF_SIZE) - if rsmi_ret_ok(ret, device, 'get_name'): + if rsmi_ret_ok(ret, device, 'get_name', silent): try: device_series = series.value.decode() except UnicodeDecodeError: - device_series = "N/A" - printErrLog(device, "Unable to read card series") + if not silent: + printErrLog(device, "Unable to read card series") # Retrieve the device model model = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, MAX_BUFF_SIZE) - if rsmi_ret_ok(ret, device, 'get_subsystem_name'): + if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent): try: device_model = model.value.decode() device_model = padHexValue(device_model, 4) except UnicodeDecodeError: - device_model = "N/A" - printErrLog(device, "Unable to read device model") + if not silent: + printErrLog(device, "Unable to read device model") try: gpu_revision = padHexValue(getRev(device), 2) except Exception as exc: - gpu_revision = "N/A" - printErrLog(device, "Unable to read card revision %s" % (exc)) + if not silent: + printErrLog(device, "Unable to read card revision %s" % (exc)) device_series_str = str(device_series[:MAX_DESC_SIZE]) device_series_str = device_series_str.ljust(MAX_DESC_SIZE, ' ') @@ -3382,7 +3415,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): @param my_ret: Return of RSMI call (rocm_smi_lib API) @param metric: Parameter of GPU currently being 
analyzed @param silent: Echo verbose error reponse. - True siliences err output, False does not silience err output (default). + True silences err output, False does not silence err output (default). """ global RETCODE global PRINT_JSON @@ -3399,8 +3432,8 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): if err_str.value is not None: returnString += '%s\t' % (err_str.value.decode()) if not PRINT_JSON: - logging.debug('%s', returnString) if not silent: + logging.debug('%s', returnString) if my_ret in rsmi_status_verbose_err_out: printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None) RETCODE = my_ret From d9381b6dae0090ae2724c9396ea68669385600aa Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 13 Sep 2023 19:49:46 -0500 Subject: [PATCH 08/19] Fix misspelling averge -> average Change-Id: I3546348560acadb1e775e10ad24115de4ccfc800 Signed-off-by: Galantsev, Dmitrii --- rocm_smi/example/rocm_smi_example.cc | 2 +- tests/rocm_smi_test/functional/power_read.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 0e78debb91..815b00382e 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -839,7 +839,7 @@ int main() { } CHK_RSMI_NOT_SUPPORTED_RET(ret) - std::cout << "\t**Averge Power Usage: "; + std::cout << "\t**Average Power Usage: "; ret = rsmi_dev_power_ave_get(i, 0, &val_ui64); if (ret == RSMI_STATUS_SUCCESS) { std::cout << static_cast(val_ui64)/1000 << " W" << std::endl; diff --git a/tests/rocm_smi_test/functional/power_read.cc b/tests/rocm_smi_test/functional/power_read.cc index 70f3b104d0..02ec355b46 100755 --- a/tests/rocm_smi_test/functional/power_read.cc +++ b/tests/rocm_smi_test/functional/power_read.cc @@ -118,7 +118,7 @@ void TestPowerRead::Run(void) { err = rsmi_dev_power_ave_get(i, 0, &val_ui64); IF_VERB(STANDARD) { - std::cout << "\t**Averge Power Usage: "; + std::cout 
<< "\t**Average Power Usage: "; CHK_RSMI_PERM_ERR(err) if (err == RSMI_STATUS_SUCCESS) { std::cout << static_cast(val_ui64)/1000 << " mW" << std::endl; From 12f395e592b781feb0df21ed4130bc6bbb28708a Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 12 Sep 2023 16:34:04 -0500 Subject: [PATCH 09/19] rocm_smi_lib: Fix rocm-smi --resetfans results in Permission Denied For operations related to: --resetfans --setfan We report 'Not supported' for these cases instead of 'Permission denied' Code changes related to the following: * rocm_smi_properties * rocm_smi related APIs Change-Id: I144646efc3804fabd45cc5a46351803950b4feb7 Signed-off-by: Oliveira, Daniel --- python_smi_tools/rocm_smi.py | 8 ++++++-- src/rocm_smi.cc | 9 +++++---- src/rocm_smi_properties.cc | 35 +++++++++++++++++++++++++---------- src/rocm_smi_utils.cc | 7 +++++++ 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 4a943a66a4..2d96cf3c73 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -813,8 +813,10 @@ def resetFans(deviceList): for device in deviceList: sensor_ind = c_uint32(0) ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind) - if rsmi_ret_ok(ret, device, 'reset_fan'): + if rsmi_ret_ok(ret, device, silent=True): printLog(device, 'Successfully reset fan speed to driver control', None) + else: + printLog(device, 'Not supported on the given system', None) printLogSpacer() @@ -1335,8 +1337,10 @@ def setFanSpeed(deviceList, fan): else: fanLevel = int(str(fan)) ret = rocmsmi.rsmi_dev_fan_speed_set(device, 0, int(fanLevel)) - if rsmi_ret_ok(ret, device, 'set_fan_speed'): + if rsmi_ret_ok(ret, device, silent=True): printLog(device, 'Successfully set fan speed to level %s' % (str(int(fanLevel))), None) + else: + printLog(device, 'Not supported on the given system', None) printLogSpacer() diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index d8bd892ac8..892f6664c7 100755 --- a/src/rocm_smi.cc 
+++ b/src/rocm_smi.cc @@ -407,6 +407,10 @@ static rsmi_status_t set_dev_mon_value(amd::smi::MonitorTypes type, } int ret = dev->monitor()->writeMonitor(type, sensor_ind, std::to_string(val)); + /// If the sysfs file doesn't exist, it is not supported. + if (ret == ENOENT) { + return rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + } return amd::smi::ErrnoToRsmiStatus(ret); } @@ -2631,9 +2635,8 @@ rsmi_dev_fan_reset(uint32_t dv_ind, uint32_t sensor_ind) { LOG_TRACE(ss); ++sensor_ind; // fan sysfs files have 1-based indices - + REQUIRE_ROOT_ACCESS DEVICE_MUTEX - ret = set_dev_mon_value(amd::smi::kMonFanCntrlEnable, dv_ind, sensor_ind, 2); return ret; @@ -2669,14 +2672,12 @@ rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed) { // First need to set fan mode (pwm1_enable) to 1 (aka, "manual") ret = set_dev_mon_value(amd::smi::kMonFanCntrlEnable, dv_ind, sensor_ind, 1); - if (ret != RSMI_STATUS_SUCCESS) { return ret; } ret = set_dev_mon_value(amd::smi::kMonFanSpeed, dv_ind, sensor_ind, speed); - return ret; CATCH diff --git a/src/rocm_smi_properties.cc b/src/rocm_smi_properties.cc index b076991881..d73f974286 100644 --- a/src/rocm_smi_properties.cc +++ b/src/rocm_smi_properties.cc @@ -166,6 +166,7 @@ const AMDGpuVerbList_t amdgpu_verb_check_list { { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" } }; +const uint16_t kDevIDAll(0xFFFF); const uint16_t kDevRevIDAll(0xFFFF); const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { // @@ -176,6 +177,14 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { // rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set; // + // AMD All Families + {kDevIDAll, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanCntrlEnable), + AMDGpuVerbTypes_t::kResetGpuFan, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + // AMD Instinct MI210 {0x740F, {0x02, 
make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, @@ -239,12 +248,6 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, AMDGpuPropertyOpModeTypes_t::kBoth, false } }, - {0x74A1, {kDevRevIDAll, - make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, - DevInfoTypes::kDevGpuReset), - AMDGpuVerbTypes_t::kResetGpu, - AMDGpuPropertyOpModeTypes_t::kSrIov, false } - }, {0x74A1, {kDevRevIDAll, make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM), @@ -350,7 +353,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT // likely the reinforcement table does not contain any entries/rules for the // dev_id in question. // - auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) { + auto amdgpu_property_query_result_hdlr = [&](const rsmi_status_t query_result) { switch (query_result) { case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR): case (rsmi_status_t::RSMI_STATUS_NO_DATA): @@ -363,7 +366,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT break; default: - return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + return actual_error_code; break; } }; @@ -415,7 +418,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx std::ostringstream osstream; auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); - AMDGpuPropertyQuery_t amdgpu_property_query = [&]() { + auto amdgpu_property_query = [&]() { AMDGpuPropertyQuery_t amdgpu_property_query_init{}; amdgpu_property_query_init.m_asic_id = 0; amdgpu_property_query_init.m_pci_rev_id = 0; @@ -445,6 +448,18 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx LOG_TRACE(osstream); bool is_proper_query(false); + + // Generic filter for checking properties for all asics and revisions. 
+ auto amdgpu_property_query_all_asics = amdgpu_property_query; + amdgpu_property_query_all_asics.m_asic_id = kDevIDAll; + amdgpu_property_query_all_asics.m_pci_rev_id = kDevRevIDAll; + auto amdgpu_property_query_result = run_amdgpu_property_reinforcement_query(amdgpu_property_query_all_asics); + // We found a generic entry for all asics and revisions + if (amdgpu_property_query_result != rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR) { + return amdgpu_property_query_result; + } + + // If no generic entry, then we query for specific asic and revision ids. amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query); if (!is_proper_query) { rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA; @@ -487,7 +502,7 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n"; // Pci_rev_id matches the filter or ALL Revisions if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) || - (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { + (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n"; // Do we have the property we are looking for? if (((amdgpu_property_query.m_property != 0) && diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 1e9a444320..4b8f61a842 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -176,6 +176,13 @@ int isRegularFile(std::string fname, bool *is_reg) { } int WriteSysfsStr(std::string path, std::string val) { + // On success, zero is returned. On error, -1 is returned, and + // errno is set to indicate the error. 
+ auto is_regular_file_result = isRegularFile(path, nullptr); + if (is_regular_file_result != 0) { + return ENOENT; + } + std::ofstream fs; int ret = 0; std::ostringstream ss; From a4b470fe71f723fe2c3b90480922820ae8102558 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Tue, 29 Aug 2023 19:33:10 -0500 Subject: [PATCH 10/19] Add errors for existing but empty dev files Change-Id: Iad9febc50f9b8e6085f8b605249ee884d2f134d6 Signed-off-by: Galantsev, Dmitrii --- python_smi_tools/rocm_smi.py | 22 ++++--- src/rocm_smi.cc | 4 ++ src/rocm_smi_device.cc | 5 +- .../functional/frequencies_read.cc | 26 +++++---- .../functional/frequencies_read_write.cc | 58 +++++-------------- 5 files changed, 49 insertions(+), 66 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 2d96cf3c73..8834e518b5 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -1791,15 +1791,19 @@ def showClocks(deviceList): for clk_type in sorted(rsmi_clk_names_dict): if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1: ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) - if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True): - printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None) - for x in range(freq.num_supported): - fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000) - if x == freq.current: - printLog(device, str(x), str(fr) + ' *') - else: - printLog(device, str(x), str(fr)) - printLog(device, '', None) + if ret == rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA: + printLog(device, 'Clock [%s] on device [%s] exists but EMPTY! Likely driver error!' 
% (clk_type, str(device))) + continue + if not rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True): + continue + printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None) + for x in range(freq.num_supported): + fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000) + if x == freq.current: + printLog(device, str(x), str(fr) + ' *') + else: + printLog(device, str(x), str(fr)) + printLog(device, '', None) else: logging.debug('{} frequency is unsupported on device[{}]'.format(clk_type, device)) printLog(device, '', None) diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 892f6664c7..ed96a7ee65 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -3740,6 +3740,10 @@ rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages, ret = GetDevValueVec(amd::smi::kDevMemPageBad, dv_ind, &val_vec); + // file is empty, which is valid for no errors + if (ret == RSMI_STATUS_UNEXPECTED_DATA) { + ret = RSMI_STATUS_SUCCESS; + } if (ret == RSMI_STATUS_FILE_ERROR) { return RSMI_STATUS_NOT_SUPPORTED; } diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index b01734f0b1..03a38dc749 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -858,8 +858,8 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, ss << "Read devInfoMultiLineStr for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")" << ", but contained no string lines"; - LOG_INFO(ss); - return 0; + LOG_ERROR(ss); + return ENXIO; } // Remove any *trailing* empty (whitespace) lines while (!retVec->empty() && @@ -882,6 +882,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, << RocmSMI::devInfoTypesStrings.at(type) << ")" << ", but lines were empty"; LOG_INFO(ss); + return ENXIO; } return 0; } diff --git a/tests/rocm_smi_test/functional/frequencies_read.cc b/tests/rocm_smi_test/functional/frequencies_read.cc index 2b5466ae93..37bb9ec0b2 100755 --- a/tests/rocm_smi_test/functional/frequencies_read.cc +++ 
b/tests/rocm_smi_test/functional/frequencies_read.cc @@ -123,16 +123,22 @@ void TestFrequenciesRead::Run(void) { // Verify api support checking functionality is working err = rsmi_dev_gpu_clk_freq_get(i, t, nullptr); ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); - } else { - CHK_ERR_ASRT(err) - IF_VERB(STANDARD) { - std::cout << "\t**Supported " << name << " clock frequencies: "; - std::cout << f.num_supported << std::endl; - print_frequencies(&f); - // Verify api support checking functionality is working - err = rsmi_dev_gpu_clk_freq_get(i, t, nullptr); - ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); - } + } + + // special driver issue, shouldn't normally occur + if (err == RSMI_STATUS_UNEXPECTED_DATA) { + std::cerr << "WARN: Clock file [" << FreqEnumToStr(t) << "] exists on device [" << i << "] but empty!" << std::endl; + std::cerr << " Likely a driver issue!" << std::endl; + } + + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Supported " << name << " clock frequencies: "; + std::cout << f.num_supported << std::endl; + print_frequencies(&f); + // Verify api support checking functionality is working + err = rsmi_dev_gpu_clk_freq_get(i, t, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); } }; diff --git a/tests/rocm_smi_test/functional/frequencies_read_write.cc b/tests/rocm_smi_test/functional/frequencies_read_write.cc index 9fce5c429e..5ad627cb5f 100755 --- a/tests/rocm_smi_test/functional/frequencies_read_write.cc +++ b/tests/rocm_smi_test/functional/frequencies_read_write.cc @@ -114,14 +114,20 @@ void TestFrequenciesReadWrite::Run(void) { std::cout << "\t**Set " << FreqEnumToStr(rsmi_clk) << ": Not supported on this machine" << std::endl; return false; - } else { - // CHK_ERR_ASRT(ret) - IF_VERB(STANDARD) { - std::cout << "Initial frequency for clock " << - FreqEnumToStr(rsmi_clk) << " is " << f.current << std::endl; - } - return true; } + + // special driver issue, shouldn't normally occur + if (ret == RSMI_STATUS_UNEXPECTED_DATA) { + std::cerr << 
"WARN: Clock file [" << FreqEnumToStr(rsmi_clk) << "] exists on device [" << dv_ind << "] but empty!" << std::endl; + std::cerr << " Likely a driver issue!" << std::endl; + } + + // CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "Initial frequency for clock " << + FreqEnumToStr(rsmi_clk) << " is " << f.current << std::endl; + } + return true; }; auto freq_write = [&]() { @@ -177,44 +183,6 @@ void TestFrequenciesReadWrite::Run(void) { } freq_write(); CHK_ERR_ASRT(ret) -#if 0 - ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "Initial frequency for clock " << rsmi_clk << " is " << - f.current << std::endl; - } - // Set clocks to something other than the usual default of the lowest - // frequency. - freq_bitmask = 0b01100; // Try the 3rd and 4th clocks - - std::string freq_bm_str = - std::bitset(freq_bitmask).to_string(); - - freq_bm_str.erase(0, std::min(freq_bm_str.find_first_not_of('0'), - freq_bm_str.size()-1)); - - IF_VERB(STANDARD) { - std::cout << "Setting frequency mask for clock " << rsmi_clk << - " to 0b" << freq_bm_str << " ..." << std::endl; - } - ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask); - CHK_ERR_ASRT(ret) - - ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); - CHK_ERR_ASRT(ret) - - IF_VERB(STANDARD) { - std::cout << "Frequency is now index " << f.current << std::endl; - std::cout << "Resetting mask to all frequencies." 
<< std::endl; - } - ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF); - CHK_ERR_ASRT(ret) - - ret = rsmi_dev_perf_level_set(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO); - CHK_ERR_ASRT(ret) -#endif } } } From 26c4578ee2d8caefb9209836320cdec25c9c9e9e Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 13 Sep 2023 19:16:49 -0500 Subject: [PATCH 11/19] README - Add a documentation link Change-Id: Ia56994825e99e72829283f07bed7379d95d24498 Signed-off-by: Galantsev, Dmitrii --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 948ef14988..9d7b0a019d 100755 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ The ROCm System Management Interface Library, or ROCm SMI library, is part of the Radeon Open Compute [ROCm](https://github.com/RadeonOpenCompute) software stack . It is a C library for Linux that provides a user space interface for applications to monitor and control GPU applications. +For additional information refer to [ROCm Documentation](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/) + ## DISCLAIMER The information contained herein is for informational purposes only, and is subject to change without notice. In addition, any stated support is planned and is also subject to change. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. 
@@ -56,7 +58,6 @@ python3 -m venv .venv In order to verify the build and capability of ROCm SMI on your system and to see an example of how ROCm SMI can be used, you may build and run the tests that are available in the repo. To build the tests, follow these steps: ```shell -# Set environment variables used in CMakeLists.txt file mkdir build cd build cmake -DBUILD_TESTS=ON .. From 238c7f6dcab244081ab5e47dea0fedef8da5e8af Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 13 Sep 2023 19:33:21 -0500 Subject: [PATCH 12/19] README - shell -> bash Change-Id: I3a50c38ae280747b4874cff443091f332980fe50 Signed-off-by: Galantsev, Dmitrii --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9d7b0a019d..12a12bd63c 100755 --- a/README.md +++ b/README.md @@ -27,24 +27,24 @@ In order to build the latest documentation, the following are required: The source code for ROCm SMI is available on [Github](https://github.com/RadeonOpenCompute/rocm_smi_lib). After the ROCm SMI library git repository has been cloned to a local Linux machine, building the library is achieved by following the typical CMake build sequence. Specifically, -```shell +```bash mkdir -p build cd build cmake .. make -j $(nproc) # Install library file and header; default location is /opt/rocm -$ make install +make install ``` The built library will appear in the `build` folder. To build the rpm and deb packages follow the above steps with: -```shell +```bash make package ``` #### Documentation The following is an example of how to build the docs: -```shell +```bash sudo apt install -y npm sudo npm install -g sass @@ -57,7 +57,7 @@ python3 -m venv .venv #### Building the Tests In order to verify the build and capability of ROCm SMI on your system and to see an example of how ROCm SMI can be used, you may build and run the tests that are available in the repo. 
To build the tests, follow these steps: -```shell +```bash mkdir build cd build cmake -DBUILD_TESTS=ON .. From 5c574ac79cbf75d89ff6f304b2c0e640c15765c6 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 13 Sep 2023 20:48:49 -0500 Subject: [PATCH 13/19] TESTS - Check power and frequency support It is not guaranteed that power can be read or set for some GPUs (MI300). It is also not guaranteed that frequencies can be set. As this is not a tool issue - we simply skip the failing test. Change-Id: I134e96a476040cef513cd924f00e30cd6dea42a5 Signed-off-by: Galantsev, Dmitrii --- .../functional/frequencies_read_write.cc | 23 ++++++++++++++----- .../functional/power_cap_read_write.cc | 7 ++++++ tests/rocm_smi_test/functional/power_read.cc | 10 ++++++++ .../functional/power_read_write.cc | 9 ++++++++ tests/rocm_smi_test/rsmitst.exclude | 3 +-- 5 files changed, 44 insertions(+), 8 deletions(-) diff --git a/tests/rocm_smi_test/functional/frequencies_read_write.cc b/tests/rocm_smi_test/functional/frequencies_read_write.cc index 5ad627cb5f..5060b59208 100755 --- a/tests/rocm_smi_test/functional/frequencies_read_write.cc +++ b/tests/rocm_smi_test/functional/frequencies_read_write.cc @@ -104,8 +104,7 @@ void TestFrequenciesReadWrite::Run(void) { for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { PrintDeviceHeader(dv_ind); - for (uint32_t clk = (uint32_t)RSMI_CLK_TYPE_FIRST; - clk <= RSMI_CLK_TYPE_LAST; ++clk) { + for (uint32_t clk = RSMI_CLK_TYPE_FIRST; clk <= RSMI_CLK_TYPE_LAST; ++clk) { rsmi_clk = (rsmi_clk_type)clk; auto freq_read = [&]() -> bool { @@ -147,14 +146,18 @@ void TestFrequenciesReadWrite::Run(void) { std::endl; } ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask); - //Certain ASICs does not allow to set particular clocks. 
If set function for a clock returns - //permission error despite root access, manually set ret value to success and return - if (ret == RSMI_STATUS_PERMISSION && geteuid() == 0) { + // Certain ASICs does not allow to set particular clocks. If set function for a clock returns + // permission error despite root access, manually set ret value to success and return + // + // Sometimes setting clock frequencies is completely not supported + if ((ret == RSMI_STATUS_PERMISSION && geteuid() == 0) || + (ret == RSMI_STATUS_NOT_SUPPORTED)) { std::cout << "\t**Set " << FreqEnumToStr(rsmi_clk) << ": Not supported on this machine. Skipping..." << std::endl; ret = RSMI_STATUS_SUCCESS; return; } + CHK_ERR_ASRT(ret) ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f); if (ret != RSMI_STATUS_SUCCESS) { @@ -166,12 +169,20 @@ void TestFrequenciesReadWrite::Run(void) { std::cout << "Resetting mask to all frequencies." << std::endl; } ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF); + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Set " << FreqEnumToStr(rsmi_clk) + << ": Not supported on this machine. Skipping..." << std::endl; + ret = RSMI_STATUS_SUCCESS; + return; + } if (ret != RSMI_STATUS_SUCCESS) { return; } ret = rsmi_dev_perf_level_set(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO); - if (ret != RSMI_STATUS_SUCCESS) { + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Setting performance level is not supported on this machine. Skipping..." 
<< std::endl; + ret = RSMI_STATUS_SUCCESS; return; } }; diff --git a/tests/rocm_smi_test/functional/power_cap_read_write.cc b/tests/rocm_smi_test/functional/power_cap_read_write.cc index 5d57ec0db2..5481996627 100755 --- a/tests/rocm_smi_test/functional/power_cap_read_write.cc +++ b/tests/rocm_smi_test/functional/power_cap_read_write.cc @@ -114,6 +114,13 @@ void TestPowerCapReadWrite::Run(void) { ret = rsmi_dev_power_cap_get(dv_ind, 0, nullptr); ASSERT_EQ(ret, RSMI_STATUS_INVALID_ARGS); + // Check if power cap is within the range + // skip the test otherwise + if (orig < min || orig > max) { + std::cout << "Power cap is not within the range. Skipping test for " << dv_ind << std::endl; + continue; + } + new_cap = (max + min)/2; IF_VERB(STANDARD) { diff --git a/tests/rocm_smi_test/functional/power_read.cc b/tests/rocm_smi_test/functional/power_read.cc index 02ec355b46..a18cd70676 100755 --- a/tests/rocm_smi_test/functional/power_read.cc +++ b/tests/rocm_smi_test/functional/power_read.cc @@ -117,6 +117,16 @@ void TestPowerRead::Run(void) { } err = rsmi_dev_power_ave_get(i, 0, &val_ui64); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**Power average information is not supported for this device" + << std::endl; + + // Verify api support checking functionality is working + err = rsmi_dev_power_ave_get(i, 0, nullptr); + ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); + continue; + } IF_VERB(STANDARD) { std::cout << "\t**Average Power Usage: "; CHK_RSMI_PERM_ERR(err) diff --git a/tests/rocm_smi_test/functional/power_read_write.cc b/tests/rocm_smi_test/functional/power_read_write.cc index dc18cb6de0..1040716ad0 100755 --- a/tests/rocm_smi_test/functional/power_read_write.cc +++ b/tests/rocm_smi_test/functional/power_read_write.cc @@ -123,6 +123,15 @@ void TestPowerReadWrite::Run(void) { PrintDeviceHeader(dv_ind); ret = rsmi_dev_power_profile_presets_get(dv_ind, 0, &status); + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**Power profile presets are 
not supported for this device" + << std::endl; + // Verify api support checking functionality is working + ret = rsmi_dev_power_profile_presets_get(dv_ind, 0, nullptr); + ASSERT_EQ(ret, RSMI_STATUS_NOT_SUPPORTED); + continue; + } CHK_ERR_ASRT(ret) // Verify api support checking functionality is working diff --git a/tests/rocm_smi_test/rsmitst.exclude b/tests/rocm_smi_test/rsmitst.exclude index 0632242f93..d87c409421 100644 --- a/tests/rocm_smi_test/rsmitst.exclude +++ b/tests/rocm_smi_test/rsmitst.exclude @@ -64,8 +64,7 @@ $BLACKLIST_ALL_ASICS\ # /sys/class/kfd/kfd/topology/nodes/*/properties FILTER[90400]=\ $BLACKLIST_ALL_ASICS\ -"rsmitstReadOnly.TestVoltCurvRead:"\ -"rsmitstReadWrite.TestPowerReadWrite" +"rsmitstReadOnly.TestVoltCurvRead" FILTER[90401]=${FILTER[90400]} FILTER[90402]=${FILTER[90400]} From 3b95214fffafe68978fb82dde033e9b764b5fb4b Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 14 Sep 2023 11:30:47 -0500 Subject: [PATCH 14/19] rsmiBindings.py - Add initRsmiBindings() Library path was printed at all times even with --json flag. This commit adds a mandatory initRsmiBindings function which is a core component of the rsmiBindings.py library. It **MUST** be called on import. 
Change-Id: Ic6ae1ec5d1fabba288910e6aed6c4706e53e5cd7 Signed-off-by: Galantsev, Dmitrii --- .gitignore | 5 ++- python_smi_tools/rocm_smi.py | 11 ++++-- python_smi_tools/rsmiBindings.py.in | 59 ++++++++++++++++------------- 3 files changed, 44 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 91cbeef563..1629ea81d3 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,7 @@ build/ .cache/ # Simulated SYSFS - for early development or debug -device/ \ No newline at end of file +device/ + +# Misc +__pycache__ diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 8834e518b5..a8b3e5a74a 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -3507,8 +3507,7 @@ def save(deviceList, savefilepath): # The code below is for when this script is run as an executable instead of when imported as a module if __name__ == '__main__': parser = argparse.ArgumentParser( - description='AMD ROCm System Management Interface | ROCM-SMI version: %s | Kernel version: %s' % ( - __version__, getVersion(None, rsmi_sw_component_t.RSMI_SW_COMP_DRIVER)), + description=f'AMD ROCm System Management Interface | ROCM-SMI version: {__version__}', formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=90, width=120)) groupDev = parser.add_argument_group() groupDisplayOpt = parser.add_argument_group('Display Options') @@ -3668,6 +3667,11 @@ if __name__ == '__main__': args = parser.parse_args() + # Must set PRINT_JSON early so the prints can be silenced + if args.json or args.csv: + PRINT_JSON = True + # Initialize rsmiBindings + rocmsmi = initRsmiBindings(silent=PRINT_JSON) # Initialize the rocm SMI library initializeRsmi() @@ -3703,8 +3707,7 @@ if __name__ == '__main__': sys.exit(1) # If we want JSON/CSV output, initialize the keys (devices) - if args.json or args.csv: - PRINT_JSON = True + if PRINT_JSON: for device in deviceList: JSON_DATA['card' + str(device)] = {} diff --git 
a/python_smi_tools/rsmiBindings.py.in b/python_smi_tools/rsmiBindings.py.in index 9ffcac138d..e6b141889f 100644 --- a/python_smi_tools/rsmiBindings.py.in +++ b/python_smi_tools/rsmiBindings.py.in @@ -1,5 +1,6 @@ #!/usr/bin/env python3 """ROCm_SMI_LIB CLI Tool Python Bindings""" +# NOTE: You MUST call rsmiBindings.initRsmiBindings() when using this library! # TODO: Get most (or all) of these from rocm_smi.h to avoid mismatches and redundancy from __future__ import print_function @@ -14,36 +15,42 @@ import os # relative path changed accordingly. # if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location # +# Library load is wrapped in a function so prints can be hidden for PRINT_JSON mode. path_librocm = str() -rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') -if (rocm_smi_lib_path != None): - path_librocm = rocm_smi_lib_path -else: - path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' +def initRsmiBindings(silent=False): + def print_silent(*args): + if not silent: + print(args) -if not os.path.isfile(path_librocm): - print('Unable to find %s . 
Trying /opt/rocm*' % path_librocm) - for root, dirs, files in os.walk('/opt', followlinks=True): - if 'librocm_smi64.so.@VERSION_MAJOR@' in files: - path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@') - if os.path.isfile(path_librocm): - print('Using lib from %s' % path_librocm) + rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') + if (rocm_smi_lib_path != None): + path_librocm = rocm_smi_lib_path else: - print('Unable to find librocm_smi64.so.@VERSION_MAJOR@') -else: - print('Library loaded from: %s ' % path_librocm) + path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' -# ----------> TODO: Support static libs as well as SO -try: - cdll.LoadLibrary(path_librocm) - rocmsmi = CDLL(path_librocm) -except OSError: - print('Unable to load the rocm_smi library.\n'\ - 'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\ - '{0}Please refer to https://github.com/'\ - 'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\ - .format('\33[33m', '\033[0m')) - exit() + if not os.path.isfile(path_librocm): + print_silent('Unable to find %s . 
Trying /opt/rocm*' % path_librocm) + for root, dirs, files in os.walk('/opt', followlinks=True): + if 'librocm_smi64.so.@VERSION_MAJOR@' in files: + path_librocm = os.path.join(os.path.realpath(root), 'librocm_smi64.so.@VERSION_MAJOR@') + if os.path.isfile(path_librocm): + print_silent('Using lib from %s' % path_librocm) + else: + print('Unable to find librocm_smi64.so.@VERSION_MAJOR@') + else: + print_silent('Library loaded from: %s ' % path_librocm) + + # ----------> TODO: Support static libs as well as SO + try: + cdll.LoadLibrary(path_librocm) + return CDLL(path_librocm) + except OSError: + print('Unable to load the rocm_smi library.\n'\ + 'Set LD_LIBRARY_PATH to the folder containing librocm_smi64.so.@VERSION_MAJOR@\n'\ + '{0}Please refer to https://github.com/'\ + 'RadeonOpenCompute/rocm_smi_lib for the installation guide.{1}'\ + .format('\33[33m', '\033[0m')) + exit() # Device ID dv_id = c_uint64() From 094c98a74fa9d770dc5f714de512eedd4ab78971 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 14 Sep 2023 11:58:49 -0500 Subject: [PATCH 15/19] rocm_smi.py: Fix pipe into head error When piping rocm_smi into 'head' it failed with "Broken pipe" error. The error can be safely ignored. head closes the pipe early, which causes a SIGPIPE signal to be raised.
https://docs.python.org/3/library/signal.html#note-on-sigpipe Change-Id: I4a589c6ed9a8c5b50de84b33e28115c6b510045f Signed-off-by: Galantsev, Dmitrii --- python_smi_tools/rocm_smi.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index a8b3e5a74a..4d1a5bad41 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -638,10 +638,21 @@ def printLog(device, metricName, value=None, extraSpace=False, useItalics=False) lock.acquire() if useItalics: logstr = italics + logstr + end - if extraSpace: - print('\n' + logstr + '\n', end='', flush=True) - else: - print(logstr + '\n', end='', flush=True) + try: + if extraSpace: + print('\n', end='') + print(logstr + '\n', end='') + sys.stdout.flush() + # when piped into programs like 'head' - print throws an error. + # silently ignore instead + except(BrokenPipeError, IOError): + # https://docs.python.org/3/library/signal.html#note-on-sigpipe + # Python flushes standard streams on exit; redirect remaining output + # to devnull to avoid another BrokenPipeError at shutdown + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + sys.exit(1) # Python exits with error code 1 on EPIPE + lock.release() From 7b32ea614b45d35a9b47f1db832edd8273f108b6 Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Wed, 6 Sep 2023 10:55:33 -0600 Subject: [PATCH 16/19] fix toc to point to correct doxysphinx output path update doc requirements; rocm-docs-core to 0.24.1 Change-Id: I78257d476a8bc47fd1a4ee03aa3db1a430ed116f --- README.md | 31 ++++++++++++++++++------------- docs/.doxygen/.gitignore | 1 + docs/.gitignore | 2 ++ docs/.sphinx/_toc.yml | 5 +++-- docs/.sphinx/_toc.yml.in | 5 +++-- docs/.sphinx/requirements.in | 2 +- docs/.sphinx/requirements.txt | 26 ++++++++++++++++---------- docs/conf.py | 2 -- 8 files changed, 44 insertions(+), 30 deletions(-) create mode 100644 docs/.doxygen/.gitignore diff --git 
a/README.md b/README.md index 12a12bd63c..c987c36a82 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ - - # ROCm System Management Interface (ROCm SMI) Library The ROCm System Management Interface Library, or ROCm SMI library, is part of the Radeon Open Compute [ROCm](https://github.com/RadeonOpenCompute) software stack . It is a C library for Linux that provides a user space interface for applications to monitor and control GPU applications. @@ -12,22 +10,25 @@ The information contained herein is for informational purposes only, and is subj © 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. +## Building ROCm SMI -# Building ROCm SMI +### Additional Required software for building -#### Additional Required software for building In order to build the ROCm SMI library, the following components are required. Note that the software versions listed are what was used in development. Earlier versions are not guaranteed to work: + * CMake (v3.5.0) * g++ (5.4.0) In order to build the latest documentation, the following are required: + * Python 3.8+ * NPM (sass) The source code for ROCm SMI is available on [Github](https://github.com/RadeonOpenCompute/rocm_smi_lib). After the ROCm SMI library git repository has been cloned to a local Linux machine, building the library is achieved by following the typical CMake build sequence. Specifically, -```bash + +```shell mkdir -p build cd build cmake .. @@ -35,26 +36,27 @@ make -j $(nproc) # Install library file and header; default location is /opt/rocm make install ``` + The built library will appear in the `build` folder. 
To build the rpm and deb packages follow the above steps with: -```bash + +```shell make package ``` #### Documentation + The following is an example of how to build the docs: -```bash -sudo apt install -y npm -sudo npm install -g sass +```shell python3 -m venv .venv - .venv/bin/python3 -m pip install -r docs/.sphinx/requirements.txt .venv/bin/python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html ``` #### Building the Tests + In order to verify the build and capability of ROCm SMI on your system and to see an example of how ROCm SMI can be used, you may build and run the tests that are available in the repo. To build the tests, follow these steps: ```bash @@ -66,11 +68,14 @@ make -j $(nproc) To run the test, execute the program `rsmitst` that is built from the steps above. -# Usage Basics -## Device Indices +## Usage Basics + +### Device Indices + Many of the functions in the library take a "device index". The device index is a number greater than or equal to 0, and less than the number of devices detected, as determined by `rsmi_num_monitor_devices()`. The index is used to distinguish the detected devices from one another. It is important to note that a device may end up with a different index after a reboot, so an index should not be relied upon to be constant over reboots. -# Hello ROCm SMI +## Hello ROCm SMI + The only required ROCm-SMI call for any program that wants to use ROCm-SMI is the `rsmi_init()` call. This call initializes some internal data structures that will be used by subsequent ROCm-SMI calls. When ROCm-SMI is no longer being used, `rsmi_shut_down()` should be called. This provides a way to do any releasing of resources that ROCm-SMI may have held. In many cases, this may have no effect, but may be necessary in future versions of the library. 
diff --git a/docs/.doxygen/.gitignore b/docs/.doxygen/.gitignore new file mode 100644 index 0000000000..5ebfac1dea --- /dev/null +++ b/docs/.doxygen/.gitignore @@ -0,0 +1 @@ +docBin/ diff --git a/docs/.gitignore b/docs/.gitignore index b8ea6fcbcd..b84233aed8 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -7,3 +7,5 @@ /_templates/ /html/ /latex/ +404.md +data/AMD-404.png diff --git a/docs/.sphinx/_toc.yml b/docs/.sphinx/_toc.yml index 84f11ded40..45624f4efe 100644 --- a/docs/.sphinx/_toc.yml +++ b/docs/.sphinx/_toc.yml @@ -2,5 +2,6 @@ defaults: numbered: False maxdepth: 6 root: index -entries: - - file: html/index \ No newline at end of file +subtrees: + - entries: + - file: .doxygen/docBin/html/index diff --git a/docs/.sphinx/_toc.yml.in b/docs/.sphinx/_toc.yml.in index ac4814ce11..a9a3dd8f89 100755 --- a/docs/.sphinx/_toc.yml.in +++ b/docs/.sphinx/_toc.yml.in @@ -4,5 +4,6 @@ defaults: numbered: False maxdepth: 6 root: index -entries: - - file: html/index \ No newline at end of file +subtrees: + - entries: + - file: .doxygen/docBin/html/index diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in index b98c8ab534..0bcf7dd930 100755 --- a/docs/.sphinx/requirements.in +++ b/docs/.sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==0.9.0 +rocm-docs-core[api_reference]==0.24.1 diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 5257eac6b3..297193c462 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile .sphinx/requirements.in +# pip-compile requirements.in # accessible-pygments==0.0.4 # via pydata-sphinx-theme @@ -41,7 +41,9 @@ docutils==0.16 # myst-parser # pydata-sphinx-theme # sphinx -doxysphinx==3.2.3 +doxysphinx==3.3.4 + # via rocm-docs-core +fastjsonschema==2.18.0 # via rocm-docs-core gitdb==4.0.10 # via gitpython @@ -59,10 +61,8 @@ jinja2==3.1.2 # 
via # myst-parser # sphinx -json5==0.9.11 +libsass==0.22.0 # via doxysphinx -linkify-it-py==1.0.3 - # via myst-parser lxml==4.9.2 # via doxysphinx markdown-it-py==2.2.0 @@ -75,7 +75,9 @@ mdit-py-plugins==0.3.5 # via myst-parser mdurl==0.1.2 # via markdown-it-py -myst-parser[linkify]==1.0.0 +mpire==2.8.0 + # via doxysphinx +myst-parser==1.0.0 # via rocm-docs-core packaging==23.1 # via @@ -92,8 +94,11 @@ pygithub==1.58.1 pygments==2.15.0 # via # accessible-pygments + # mpire # pydata-sphinx-theme # sphinx +pyjson5==1.6.4 + # via doxysphinx pyjwt[crypto]==2.6.0 # via pygithub pynacl==1.5.0 @@ -105,13 +110,14 @@ pytz==2023.3 pyyaml==6.0 # via # myst-parser + # rocm-docs-core # sphinx-external-toc requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core[api_reference]==0.9.0 - # via -r .sphinx/requirements.in +rocm-docs-core[api_reference]==0.24.1 + # via -r requirements.in smmap==5.0.0 # via gitdb snowballstemmer==2.2.0 @@ -151,10 +157,10 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx +tqdm==4.66.1 + # via mpire typing-extensions==4.5.0 # via pydata-sphinx-theme -uc-micro-py==1.0.1 - # via linkify-it-py urllib3==1.26.15 # via requests wrapt==1.15.0 diff --git a/docs/conf.py b/docs/conf.py index e2d485b9ce..733e8bcc24 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,5 +13,3 @@ docs_core.setup() for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) - -html_theme_options["show_navbar_depth"] = 2 From 431a7071a044409947da2e5320ec9c8b6e1a7e89 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Fri, 15 Sep 2023 02:59:28 -0500 Subject: [PATCH 17/19] PACKAGE - Cleanup packaging - Clean-up packaging scripts. More consistent with RDC. - Remove all 'sudo' calls. All these scripts are to be run by root. - Reduce scope of variables.
- Remove unnecessary functions Change-Id: Ib90f8e66ef4eae24f73e940fff44f515e12233f5 Signed-off-by: Galantsev, Dmitrii --- DEBIAN/postinst.in | 79 ++++++++++++++++++---------------------- DEBIAN/prerm.in | 48 +++++++++++-------------- RPM/post.in | 90 +++++++++++++++++++--------------------------- RPM/postun.in | 2 +- RPM/preun.in | 28 +++++++-------- 5 files changed, 106 insertions(+), 141 deletions(-) diff --git a/DEBIAN/postinst.in b/DEBIAN/postinst.in index ab2f640553..9e0fe4457c 100755 --- a/DEBIAN/postinst.in +++ b/DEBIAN/postinst.in @@ -1,21 +1,18 @@ #!/bin/bash -#set -x - -packageName="rocm-smi-lib" -logPath=/var/log/rocm_smi_lib -logName=ROCm-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/rocm_smi.conf - -do_addLogFolder() { - sudo mkdir -p "${logPath}" - sudo touch "${logFile}" - sudo chmod -R a+rw "${logPath}" - sudo chmod a+rw "${logFile}" -} do_configureLogrotate() { - logrotate --version &>/dev/null + local IS_SYSTEMD=0 + local packageName="rocm-smi-lib" + local logPath=/var/log/rocm_smi_lib + local logFile="${logPath}/ROCm-SMI-lib.log" + local logrotateConfFile=/etc/logrotate.d/rocm_smi.conf + + mkdir -p "${logPath}" + touch "${logFile}" + chmod -R a+rw "${logPath}" + chmod a+rw "${logFile}" + + command -v logrotate &>/dev/null if [ $? -ne 0 ]; then echo "[WARNING] Detected logrotate is not installed."\ "$packageName logs (when turned on) will not rotate properly." @@ -23,14 +20,14 @@ do_configureLogrotate() { fi if [ ! -f $logrotateConfFile ]; then - sudo touch "${logrotateConfFile}" - sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read + touch "${logrotateConfFile}" + chmod 644 "${logrotateConfFile}" # root r/w, all others read # ROCm SMI logging rotation, rotates files using root user/group # Hourly logrotation check # Only rotates if size grew larger than 1MB # Max of 4 rotation files, oldest will be removed # Rotated files use date extention of ex. 
ROCm-SMI-lib.log.2023-05-09_16:51:42 - cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null + cat << EOF > "${logrotateConfFile}" ${logFile} { su root root hourly @@ -47,43 +44,29 @@ EOF # issue was RPM build thought we were using macros # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 # https://rpm-software-management.github.io/rpm/manual/spec.html - sudo sed -i s/%%/%/g "${logrotateConfFile}" + sed -i s/%%/%/g "${logrotateConfFile}" # workaround: remove extra 'OURCE' text # from rocm_smi.conf. Unsure if CMAKE, # bash, or here document # issue (only seen on RHEL 8.7) - sudo sed -i s/OURCE//g "${logrotateConfFile}" + sed -i s/OURCE//g "${logrotateConfFile}" fi # check if logrotate uses system timers, Ubuntu/modern OS's do # Several older OS's like RHEL 8.7, do not. Instead defaults # to use daily cron jobs - see https://stackoverflow.com/a/69465677 - sudo systemctl list-timers|grep -iq logrotate - if [ $? -ne 0 ]; then - # confirm logrotate file exists in daily - if [ -f /etc/cron.daily/logrotate ]; then - # move logrotate daily to hourly - if [ -d /etc/cron.hourly ]; then - sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate - else - echo "[WARNING] Could find and configure hourly cron for $packageName's"\ - " logrotate. $packageName logs (when turned on) will not rotate properly." - fi - else - # confirm that it's already been moved to hourly - sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly - if [ $? -ne 0 ]; then - echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\ - "$packageName logs (when turned on) may not rotate properly." - fi + if [ -d /run/systemd/system ]; then + systemctl list-timers | grep -iq logrotate + if [ $? -ne 0 ]; then + IS_SYSTEMD=1 fi - return #done configuring for non-systemd timers - else + fi + if [ "$IS_SYSTEMD" -eq 1 ]; then # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then if [ ! 
-f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup fi - cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null + cat << EOF > /lib/systemd/system/logrotate.timer [Unit] Description=Hourly rotation of log files Documentation=man:logrotate(8) man:logrotate.conf(5) @@ -97,12 +80,19 @@ Persistent=true [Install] WantedBy=timers.target EOF - sudo systemctl reenable --now logrotate.timer + systemctl reenable --now logrotate.timer else echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." fi - return #done configuring for systemd timers + else + # $IS_SYSTEMD -eq 0 + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -d /etc/cron.hourly ]; then + mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + fi + fi fi } @@ -117,7 +107,6 @@ do_ldconfig() { case "$1" in ( configure ) do_ldconfig - do_addLogFolder do_configureLogrotate ;; ( abort-upgrade | abort-remove | abort-deconfigure ) diff --git a/DEBIAN/prerm.in b/DEBIAN/prerm.in index 9b4efa9899..7bf61f02d1 100755 --- a/DEBIAN/prerm.in +++ b/DEBIAN/prerm.in @@ -1,29 +1,4 @@ #!/bin/bash -set -e - -packageName="rocm-smi-lib" -logPath=/var/log/rocm_smi_lib -logName=ROCm-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/rocm_smi.conf - -rm_logFolder() { - sudo rm -rf "$logPath" -} - -return_logrotateToOrigConfig() { - if [ -f $logrotateConfFile ]; then - sudo rm -rf "${logrotateConfFile}" - fi - if [ -f /etc/cron.hourly/logrotate ]; then - sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate - fi - if [ -f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer - sudo rm -rf 
/lib/systemd/system/logrotate.timer.backup - sudo systemctl reenable --now logrotate.timer - fi -} rm_ldconfig() { # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build @@ -34,8 +9,27 @@ rm_ldconfig() { } rm_pyc() { - # remove pyc file generated by python - rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__ + # remove pyc file generated by python + rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__ +} + +rm_logFolder() { + rm -rf /var/log/rocm_smi_lib +} + +return_logrotateToOrigConfig() { + local logrotateConfFile=/etc/logrotate.d/rocm_smi.conf + if [ -f $logrotateConfFile ]; then + rm -rf "$logrotateConfFile" + fi + if [ -f /etc/cron.hourly/logrotate ]; then + mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + fi + if [ -f /lib/systemd/system/logrotate.timer.backup ]; then + cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + rm -rf /lib/systemd/system/logrotate.timer.backup + systemctl reenable --now logrotate.timer + fi } diff --git a/RPM/post.in b/RPM/post.in index b04e31c9f9..ccf1f814a5 100755 --- a/RPM/post.in +++ b/RPM/post.in @@ -1,21 +1,18 @@ #!/bin/bash -#set -x - -packageName="rocm-smi-lib" -logPath=/var/log/rocm_smi_lib -logName=ROCm-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/rocm_smi.conf - -do_addLogFolder() { - sudo mkdir -p "${logPath}" - sudo touch "${logFile}" - sudo chmod -R a+rw "${logPath}" - sudo chmod a+rw "${logFile}" -} do_configureLogrotate() { - logrotate --version &>/dev/null + local IS_SYSTEMD=0 + local packageName="rocm-smi-lib" + local logPath=/var/log/rocm_smi_lib + local logFile="${logPath}/ROCm-SMI-lib.log" + local logrotateConfFile=/etc/logrotate.d/rocm_smi.conf + + mkdir -p "${logPath}" + touch "${logFile}" + chmod -R a+rw "${logPath}" + chmod a+rw "${logFile}" + + command -v logrotate &>/dev/null if [ $? 
-ne 0 ]; then echo "[WARNING] Detected logrotate is not installed."\ "$packageName logs (when turned on) will not rotate properly." @@ -23,14 +20,14 @@ do_configureLogrotate() { fi if [ ! -f $logrotateConfFile ]; then - sudo touch "${logrotateConfFile}" - sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read + touch "${logrotateConfFile}" + chmod 644 "${logrotateConfFile}" # root r/w, all others read # ROCm SMI logging rotation, rotates files using root user/group # Hourly logrotation check # Only rotates if size grew larger than 1MB # Max of 4 rotation files, oldest will be removed # Rotated files use date extention of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42 - cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null + cat << EOF > "${logrotateConfFile}" ${logFile} { su root root hourly @@ -47,43 +44,29 @@ EOF # issue was RPM build thought we were using macros # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 # https://rpm-software-management.github.io/rpm/manual/spec.html - sudo sed -i s/%%/%/g "${logrotateConfFile}" + sed -i s/%%/%/g "${logrotateConfFile}" # workaround: remove extra 'OURCE' text # from rocm_smi.conf. Unsure if CMAKE, # bash, or here document # issue (only seen on RHEL 8.7) - sudo sed -i s/OURCE//g "${logrotateConfFile}" + sed -i s/OURCE//g "${logrotateConfFile}" fi # check if logrotate uses system timers, Ubuntu/modern OS's do # Several older OS's like RHEL 8.7, do not. Instead defaults # to use daily cron jobs - see https://stackoverflow.com/a/69465677 - sudo systemctl list-timers|grep -iq logrotate - if [ $? -ne 0 ]; then - # confirm logrotate file exists in daily - if [ -f /etc/cron.daily/logrotate ]; then - # move logrotate daily to hourly - if [ -d /etc/cron.hourly ]; then - sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate - else - echo "[WARNING] Could find and configure hourly cron for $packageName's"\ - " logrotate. $packageName logs (when turned on) will not rotate properly." 
- fi - else - # confirm that it's already been moved to hourly - sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly - if [ $? -ne 0 ]; then - echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\ - "$packageName logs (when turned on) may not rotate properly." - fi + if [ -d /run/systemd/system ]; then + systemctl list-timers | grep -iq logrotate + if [ $? -ne 0 ]; then + IS_SYSTEMD=1 fi - return #done configuring for non-systemd timers - else + fi + if [ "$IS_SYSTEMD" -eq 1 ]; then # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then if [ ! -f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup + cp /lib/systemd/system/logrotate.timer /lib/systemd/system/logrotate.timer.backup fi - cat <<'EOF' | sudo tee /lib/systemd/system/logrotate.timer >/dev/null + cat << EOF > /lib/systemd/system/logrotate.timer [Unit] Description=Hourly rotation of log files Documentation=man:logrotate(8) man:logrotate.conf(5) @@ -97,12 +80,19 @@ Persistent=true [Install] WantedBy=timers.target EOF - sudo systemctl reenable --now logrotate.timer + systemctl reenable --now logrotate.timer else echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." 
fi - return #done configuring for systemd timers + else + # $IS_SYSTEMD -eq 0 + if [ -f /etc/cron.daily/logrotate ]; then + # move logrotate daily to hourly + if [ -d /etc/cron.hourly ]; then + mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate + fi + fi fi } @@ -114,14 +104,8 @@ do_ldconfig() { fi } -# left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build -if [ "@ENABLE_LDCONFIG@" == "ON" ]; then - echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf - ldconfig -fi - # post install or upgrade, $i is 1 or 2 -> do these actions -if [ $1 -ge 1 ]; then - do_addLogFolder +if [ "$1" -ge 1 ]; then + do_ldconfig do_configureLogrotate -fi \ No newline at end of file +fi diff --git a/RPM/postun.in b/RPM/postun.in index 0dd41d82d8..b42b5b90b9 100755 --- a/RPM/postun.in +++ b/RPM/postun.in @@ -1,7 +1,7 @@ #!/bin/bash # second term originates from ENABLE_LDCONFIG = ON/OFF at package build -if [ $1 -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then +if [ "$1" -le 1 ] && [ "@ENABLE_LDCONFIG@" == "ON" ]; then # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations rm -f /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf ldconfig diff --git a/RPM/preun.in b/RPM/preun.in index b9c2eda6b3..b72c39465c 100755 --- a/RPM/preun.in +++ b/RPM/preun.in @@ -1,34 +1,32 @@ #!/bin/bash -#set -x -packageName="rocm-smi-lib" -logPath=/var/log/rocm_smi_lib -logName=ROCm-SMI-lib.log -logFile="${logPath}/${logName}" -logrotateConfFile=/etc/logrotate.d/rocm_smi.conf +rm_pyc() { + # remove pyc file generated by python + rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__ +} rm_logFolder() { - sudo rm -rf "$logPath" + rm -rf /var/log/rocm_smi_lib } return_logrotateToOrigConfig() { + local logrotateConfFile=/etc/logrotate.d/rocm_smi.conf if [ -f $logrotateConfFile ]; then - sudo rm -rf "${logrotateConfFile}" + rm -rf "$logrotateConfFile" fi if [ -f 
/etc/cron.hourly/logrotate ]; then - sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate + mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate fi if [ -f /lib/systemd/system/logrotate.timer.backup ]; then - sudo cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer - sudo rm -rf /lib/systemd/system/logrotate.timer.backup - sudo systemctl reenable --now logrotate.timer + cp /lib/systemd/system/logrotate.timer.backup /lib/systemd/system/logrotate.timer + rm -rf /lib/systemd/system/logrotate.timer.backup + systemctl reenable --now logrotate.timer fi } -if [ $1 -le 1 ]; then +if [ "$1" -le 1 ]; then # perform the below actions for rpm remove($1=0) or upgrade($1=1) operations - # remove pyc file generated by python - rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__ + rm_pyc rm_logFolder return_logrotateToOrigConfig fi From b99867eb80cc7b6b47ae4b3e89a9e0ca9df0e4a7 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 20 Sep 2023 10:05:34 -0500 Subject: [PATCH 18/19] PACKAGE - Fix packaging Allow for configureLogrotate to fail without failing configure In previous commit I forgot to invert the check when switching "IS_SYSTEMD" and "!IS_SYSTEMD" if-else statements. Change-Id: I8eb8e7981c6353a2e60064eb3a6e35821ea2a0d0 Signed-off-by: Galantsev, Dmitrii --- DEBIAN/postinst.in | 4 ++-- RPM/post.in | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/DEBIAN/postinst.in b/DEBIAN/postinst.in index 9e0fe4457c..641949f95a 100755 --- a/DEBIAN/postinst.in +++ b/DEBIAN/postinst.in @@ -56,7 +56,7 @@ EOF # to use daily cron jobs - see https://stackoverflow.com/a/69465677 if [ -d /run/systemd/system ]; then systemctl list-timers | grep -iq logrotate - if [ $? -ne 0 ]; then + if [ $? 
-eq 0 ]; then
             IS_SYSTEMD=1
         fi
     fi
@@ -107,7 +107,7 @@ do_ldconfig() {
 case "$1" in
   ( configure )
     do_ldconfig
-    do_configureLogrotate
+    do_configureLogrotate || true  # best-effort: 'return' is invalid outside a function and would itself fail
   ;;
   ( abort-upgrade | abort-remove | abort-deconfigure )
     echo "$1"
diff --git a/RPM/post.in b/RPM/post.in
index ccf1f814a5..d2bd016589 100755
--- a/RPM/post.in
+++ b/RPM/post.in
@@ -56,7 +56,7 @@ EOF
     # to use daily cron jobs - see https://stackoverflow.com/a/69465677
     if [ -d /run/systemd/system ]; then
         systemctl list-timers | grep -iq logrotate
-        if [ $? -ne 0 ]; then
+        if [ $? -eq 0 ]; then
             IS_SYSTEMD=1
         fi
     fi
@@ -107,5 +107,5 @@ do_ldconfig() {
 # post install or upgrade, $i is 1 or 2 -> do these actions
 if [ "$1" -ge 1 ]; then
     do_ldconfig
-    do_configureLogrotate
+    do_configureLogrotate || true  # best-effort: 'return' is invalid outside a function and would itself fail
 fi
From e0483f2ee292b2d8b3b15f3ee5cbf24656976a19 Mon Sep 17 00:00:00 2001
From: "Oliveira, Daniel"
Date: Tue, 19 Sep 2023 13:44:20 -0500
Subject: [PATCH 19/19] rocm_smi_lib: Fix [linux BM] [AMDSMI] Memory Bandwidth

Implements APIs for 'gpu_metrics_v1_3' utilization averages

Code changes related to the following:
  * rsmi_dev_activity_metric_get()
  * rsmi_dev_activity_avg_mm_get()
  * CLI shows "Avg.Memory Bandwidth" under "--showmemuse"

Change-Id: I8e4600f350a7c18499abf022534db2b875f09d5f
Signed-off-by: Oliveira, Daniel
---
 include/rocm_smi/rocm_smi.h                   |  77 +++++++++++-
 python_smi_tools/rocm_smi.py                  |   7 ++
 src/rocm_smi.cc                               | 115 +++++++++++++++++-
 src/rocm_smi_gpu_metrics.cc                   |   6 +-
 .../functional/gpu_metrics_read.cc            |   4 +-
 5 files changed, 202 insertions(+), 7 deletions(-)

diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h
index 92ac970841..14e4db0d58 100755
--- a/include/rocm_smi/rocm_smi.h
+++ b/include/rocm_smi/rocm_smi.h
@@ -466,6 +466,19 @@ typedef enum {
   RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF  //!< Invalid type
 } rsmi_temperature_type_t;
 
+/**
+ * @brief Activity (Utilization) Metrics. This enum is used to identify
+ * various activity metrics.
+ * + */ +typedef enum { + /* Utilization */ + RSMI_ACTIVITY_GFX = (0x1 << 0), + RSMI_ACTIVITY_UMC = (0x1 << 1), //!< memory controller + RSMI_ACTIVITY_MM = (0x1 << 2) //!< UVD or VCN +} rsmi_activity_metric_t; + + /** * @brief Voltage Metrics. This enum is used to identify various * Volatge metrics. Corresponding values will be in millivolt. @@ -774,6 +787,17 @@ typedef struct { typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth; /// \endcond +/** + * @brief This structure holds information about the possible activity + * averages. Specifically, the utilization counters. + */ +typedef struct { + /* Utilization */ + uint16_t average_gfx_activity; + uint16_t average_umc_activity; //!< memory controller + uint16_t average_mm_activity; //!< UVD or VCN +} rsmi_activity_metric_counter_t; + /** * @brief This structure holds version information. */ @@ -964,7 +988,7 @@ typedef struct { uint16_t padding; // new in v1 uint32_t gfx_activity_acc; // new in v1 - uint32_t mem_actvity_acc; // new in v1 + uint32_t mem_activity_acc; // new in v1 uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1 /// \endcond } rsmi_gpu_metrics_t; @@ -2259,6 +2283,57 @@ rsmi_utilization_count_get(uint32_t dv_ind, uint32_t count, uint64_t *timestamp); +/** + * @brief Get the average utilization counters of the specified device + * + * @details Given a device index @p dv_ind and a set of activity metric types, + * this function returns the requested average utilization counters + * + * @param[in] dv_ind a device index + * + * @param[in] activity_metric_type one or more ::rsmi_activity_metric_t flags combined with bitwise OR + * + * @param[inout] activity_metric_counter Multiple utilization counters can be retrieved with a single + * call. The caller must pass a pointer to an allocated rsmi_activity_metric_counter_t structure. + * + * If the function returns RSMI_STATUS_SUCCESS, each requested metric type has been written to the corresponding + * field of the rsmi_activity_metric_counter_t structure; fields of + * metric types that were not requested are left unmodified.
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t +rsmi_dev_activity_metric_get(uint32_t dv_ind, + rsmi_activity_metric_t activity_metric_type, + rsmi_activity_metric_counter_t* activity_metric_counter); + +/** + * @brief Get the average MM (UVD or VCN) utilization counter of the specified device + * + * @details Given a device index @p dv_ind, this function returns the + * average utilization counter for the ::RSMI_ACTIVITY_MM metric type + * + * @param[in] dv_ind a device index + * + * @param[inout] avg_activity pointer to caller-provided storage for the average MM activity + * + * If the function returns RSMI_STATUS_SUCCESS, the value is the one that + * rsmi_dev_activity_metric_get() reports in the average_mm_activity field + * of rsmi_activity_metric_counter_t. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t +rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity); + /** * @brief Get the performance level of the device with provided * device index.
diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py
index 4d1a5bad41..2a0a4655d7 100755
--- a/python_smi_tools/rocm_smi.py
+++ b/python_smi_tools/rocm_smi.py
@@ -2160,6 +2160,7 @@ def showMemUse(deviceList):
     @param deviceList: List of DRM devices (can be a single-item list)
     """
     memoryUse = c_uint64()
+    avgMemBandwidth = c_uint16()
     printLogSpacer(' Current Memory Use ')
     for device in deviceList:
         ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
@@ -2171,6 +2172,12 @@ def showMemUse(deviceList):
                 printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val)
         else:
             printLog(device, 'Memory Activity', 'N/A')
+
+        ret = rocmsmi.rsmi_dev_activity_avg_mm_get(device, byref(avgMemBandwidth))
+        if rsmi_ret_ok(ret, device, silent=True):
+            printLog(device, 'Avg. Memory Bandwidth', avgMemBandwidth.value)
+        else:
+            printLog(device, 'Not supported on the given system', None)
     printLogSpacer()
 
 
diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc
index ed96a7ee65..bfe427fcba 100755
--- a/src/rocm_smi.cc
+++ b/src/rocm_smi.cc
@@ -3295,7 +3295,7 @@ rsmi_utilization_count_get(uint32_t dv_ind,
         val_ui32 = gpu_metrics.gfx_activity_acc;
         break;
       case RSMI_COARSE_GRAIN_MEM_ACTIVITY:
-        val_ui32 = gpu_metrics.mem_actvity_acc;
+        val_ui32 = gpu_metrics.mem_activity_acc;
         break;
       default:
         return RSMI_STATUS_INVALID_ARGS;
@@ -3312,6 +3312,119 @@ rsmi_utilization_count_get(uint32_t dv_ind,
   CATCH
 }
 
+rsmi_status_t
+rsmi_dev_activity_metric_get(uint32_t dv_ind,
+                             rsmi_activity_metric_t activity_metric_type,
+                             rsmi_activity_metric_counter_t* activity_metric_counter) {
+
+  TRY
+  std::ostringstream ostrstream;
+  ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======";
+  LOG_TRACE(ostrstream);
+
+  if (!activity_metric_counter) {
+    ostrstream << __PRETTY_FUNCTION__
+                << " | ======= end ======= "
+                << " | Fail "
+                << " | Device #: " << dv_ind
+                << " | Metric Type: " << activity_metric_type
+                << " | Cause: rsmi_activity_metric_counter_t was a null ptr reference"
+                << " | Returning = "
+                << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
+    LOG_ERROR(ostrstream);
+    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
+  }
+
+  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
+  rsmi_gpu_metrics_t gpu_metrics;
+  status_code = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics);
+  if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
+    ostrstream << __PRETTY_FUNCTION__
+                << " | ======= end ======= "
+                << " | Fail "
+                << " | Device #: " << dv_ind
+                << " | Metric Type: " << activity_metric_type
+                << " | Cause: rsmi_dev_gpu_metrics_info_get returned "
+                << getRSMIStatusString(status_code)
+                << " | Returning = "
+                << getRSMIStatusString(status_code) << " |";
+    LOG_ERROR(ostrstream);
+    return status_code;
+  }
+
+  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_GFX) {
+    activity_metric_counter->average_gfx_activity = gpu_metrics.average_gfx_activity;
+    ostrstream << __PRETTY_FUNCTION__
+                << " | For GFX: " << activity_metric_counter->average_gfx_activity;
+    LOG_INFO(ostrstream);
+  }
+  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_UMC) {
+    activity_metric_counter->average_umc_activity = gpu_metrics.average_umc_activity;
+    ostrstream << __PRETTY_FUNCTION__
+                << " | For UMC: " << activity_metric_counter->average_umc_activity;
+    LOG_INFO(ostrstream);
+  }
+  if (activity_metric_type & rsmi_activity_metric_t::RSMI_ACTIVITY_MM) {
+    activity_metric_counter->average_mm_activity = gpu_metrics.average_mm_activity;
+    ostrstream << __PRETTY_FUNCTION__
+                << " | For MM: " << activity_metric_counter->average_mm_activity;
+    LOG_INFO(ostrstream);
+  }
+
+  ostrstream << __PRETTY_FUNCTION__
+              << " | ======= end ======= "
+              << " | Success "
+              << " | Device #: " << dv_ind
+              << " | Metric Type: " << activity_metric_type
+              << " | Returning = "
+              << getRSMIStatusString(status_code) << " |";
+  LOG_INFO(ostrstream);
+
+  return status_code;
+  CATCH
+}
+
+rsmi_status_t
+rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity) {
+
+  TRY
+  std::ostringstream ostrstream;
+  ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======";
+  LOG_TRACE(ostrstream);
+
+  if (!avg_activity) {
+    ostrstream << __PRETTY_FUNCTION__
+                << " | ======= end ======= "
+                << " | Fail "
+                << " | Device #: " << dv_ind
+                << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
+                << " | Cause: avg_activity was a null ptr reference"
+                << " | Returning = "
+                << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
+    LOG_ERROR(ostrstream);
+    return rsmi_status_t::RSMI_STATUS_INVALID_ARGS;
+  }
+
+  // Zero-initialized so a failed query reports 0 instead of indeterminate stack data.
+  rsmi_activity_metric_counter_t activity_metric_counter = {};
+  const auto status_code = rsmi_dev_activity_metric_get(dv_ind, rsmi_activity_metric_t::RSMI_ACTIVITY_MM, &activity_metric_counter);
+  // Store through the caller's pointer; re-seating the parameter at a local member never reached the caller.
+  *avg_activity = activity_metric_counter.average_mm_activity;
+
+  ostrstream << __PRETTY_FUNCTION__
+              << " | ======= end ======= "
+              << (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS ? " | Success " : " | Fail ")
+              << " | Device #: " << dv_ind
+              << " | Metric Type: " << rsmi_activity_metric_t::RSMI_ACTIVITY_MM
+              << " | Returning = "
+              << getRSMIStatusString(status_code) << " |";
+  LOG_INFO(ostrstream);
+
+  return status_code;
+  CATCH
+}
+
+
 rsmi_status_t
 rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) {
   TRY
diff --git a/src/rocm_smi_gpu_metrics.cc b/src/rocm_smi_gpu_metrics.cc
index c2ad2e2659..d7aab133c3 100755
--- a/src/rocm_smi_gpu_metrics.cc
+++ b/src/rocm_smi_gpu_metrics.cc
@@ -288,8 +288,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header,
           rsmi_gpu_metrics->gfx_activity_acc,
           "rsmi_gpu_metrics->gfx_activity_acc")
   << print_unsigned_hex_and_int(
-          rsmi_gpu_metrics->mem_actvity_acc,
-          "rsmi_gpu_metrics->mem_actvity_acc");
+          rsmi_gpu_metrics->mem_activity_acc,
+          "rsmi_gpu_metrics->mem_activity_acc");
   for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) {
     ss << print_unsigned_hex_and_int(
           rsmi_gpu_metrics->temperature_hbm[i],
@@ -414,7 +414,7 @@
static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, // These fields didn't exist in v0 data->gfx_activity_acc = 0; - data->mem_actvity_acc = 0; + data->mem_activity_acc = 0; (void)memset(data->temperature_hbm, 0, RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t)); } // else handle other conversions to format 1 diff --git a/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/tests/rocm_smi_test/functional/gpu_metrics_read.cc index a1b362fc31..f7944ddcbf 100644 --- a/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -177,8 +177,8 @@ void TestGpuMetricsRead::Run(void) { << std::to_string(smu.pcie_link_speed) << '\n'; std::cout << "gfx_activity_acc=" << std::dec << smu.gfx_activity_acc << '\n'; - std::cout << "mem_actvity_acc=" - << std::dec << smu.mem_actvity_acc << '\n'; + std::cout << "mem_activity_acc=" + << std::dec << smu.mem_activity_acc << '\n'; for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) { std::cout << "temperature_hbm[" << i << "]=" << std::dec <<