From fab0542ab16b7378c523e1038e23ca1b22b9b91c Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 6 Sep 2023 13:16:52 -0500 Subject: [PATCH 1/6] Fix doxygen warning messages The Doxygen will enable warning as error message. Change-Id: Ie7a7c9a823388c4140f31489604d65ec43005772 --- include/rocm_smi/rocm_smi.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 9fa7de0b35..bf5db443e8 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -884,14 +884,28 @@ struct metrics_table_header_t { #define RSMI_GPU_METRICS_API_FORMAT_VER 1 // The content version increments when gpu_metrics is extended with new and/or // existing field sizes are changed. + +/** + * @brief The GPU metrics version 1 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_1 1 +/** + * @brief The GPU metrics version 2 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_2 2 +/** + * @brief The GPU metrics version 3 + */ #define RSMI_GPU_METRICS_API_CONTENT_VER_3 3 -// This should match NUM_HBM_INSTANCES +/** + * @brief This should match NUM_HBM_INSTANCES + */ #define RSMI_NUM_HBM_INSTANCES 4 -// Unit conversion factor for HBM temperatures +/** + * @brief Unit conversion factor for HBM temperatures + */ #define CENTRIGRADE_TO_MILLI_CENTIGRADE 1000 typedef struct { @@ -2230,7 +2244,7 @@ rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent); * If the function reutrns RSMI_STATUS_SUCCESS, the counter will be set in the value field of * the rsmi_utilization_counter_t. * - * @param[in] count The size of @utilization_counters array. + * @param[in] count The size of utilization_counters array. * * @param[inout] timestamp The timestamp when the counter is retreived. Resolution: 1 ns. * @retval ::RSMI_STATUS_SUCCESS call was successful @@ -3340,7 +3354,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, * @brief Get the info of a process on a specific device. * * @details Given a process id @p pid, a @p dv_ind, this function will - * write the process information for @p pid on the device, if available, to + * write the process information for pid on the device, if available, to * the memory pointed to by @p proc. * * @param[in] pid The process id of the process for which the gpu @@ -3348,7 +3362,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, * * @param[in] dv_ind a device index where the process running on. * - * @param[inout] procs a pointer to memory provided by the caller to which + * @param[inout] proc a pointer to memory provided by the caller to which * process information will be written. * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call @@ -3540,7 +3554,7 @@ rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst, * * @details Given a source device index @p dv_ind_src and * a destination device index @p dv_ind_dst, and a pointer to a - * bool @accessible, this function will write the P2P connection status + * bool @p accessible, this function will write the P2P connection status * between the device @p dv_ind_src and @p dv_ind_dst to the memory * pointed to by @p accessible. * From 4aef7675962c9cbd4baf031aebc2a5bacabdd1ba Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 31 Aug 2023 11:34:58 -0500 Subject: [PATCH 2/6] Cleanup rocm_smi.cc Change-Id: Ia676c237222b0dd5d9e8a054a93776f3b11e2225 Signed-off-by: Galantsev, Dmitrii --- include/rocm_smi/rocm_smi.h | 6 +- include/rocm_smi/rocm_smi_common.h | 2 +- src/rocm_smi.cc | 588 ++++++++++++----------------- src/rocm_smi_counters.cc | 26 +- src/rocm_smi_device.cc | 89 +++-- src/rocm_smi_gpu_metrics.cc | 30 +- src/rocm_smi_io_link.cc | 40 +- src/rocm_smi_kfd.cc | 37 +- src/rocm_smi_logger.cc | 23 +- src/rocm_smi_main.cc | 65 ++-- src/rocm_smi_monitor.cc | 48 ++- src/rocm_smi_power_mon.cc | 14 +- src/rocm_smi_properties.cc | 10 +- src/rocm_smi_utils.cc | 99 ++--- tests/rocm_smi_test/main.cc | 3 - tests/rocm_smi_test/test_base.cc | 11 +- tests/rocm_smi_test/test_base.h | 4 +- tests/rocm_smi_test/test_common.cc | 6 +- tests/rocm_smi_test/test_common.h | 4 +- 19 files changed, 483 insertions(+), 622 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index bf5db443e8..92ac970841 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -2406,7 +2406,7 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * */ -rsmi_status_t rsmi_dev_gpu_reset(int32_t dv_ind); +rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind); /** * @brief This function retrieves the voltage/frequency curve information @@ -2640,7 +2640,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind, * */ rsmi_status_t -rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); +rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); /** * @brief Set the PowerPlay performance level associated with the device with @@ -2706,7 +2706,7 @@ rsmi_dev_perf_level_set_v1(uint32_t dv_ind, rsmi_dev_perf_level_t perf_lvl); * @retval ::RSMI_STATUS_PERMISSION function requires root access * */ -rsmi_status_t rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od); +rsmi_status_t rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od); /** * @brief Set the overdrive percent associated with the device with provided diff --git a/include/rocm_smi/rocm_smi_common.h b/include/rocm_smi/rocm_smi_common.h index bff8c8edc5..f29e427789 100755 --- a/include/rocm_smi/rocm_smi_common.h +++ b/include/rocm_smi/rocm_smi_common.h @@ -90,7 +90,7 @@ /* This group of macros is used to facilitate checking of support for rsmi_dev* * "getter" functions. When the return buffer is set to nullptr, the macro will * check the previously gathered device support data to see if the function, - * with possible variants (e.g., memory types, firware types,...) and + * with possible variants (e.g., memory types, firmware types,...) and * subvariants (e.g. monitors/sensors) are supported. */ // This macro assumes dev already available diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 5c2e3f8fd5..506e784206 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -41,29 +41,29 @@ * */ -#include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include #include -#include +#include +#include -#include #include -#include #include +#include +#include +#include #include -#include -#include -#include +#include #include #include +#include +#include #include +#include +#include #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi.h" @@ -81,16 +81,24 @@ using namespace ROCmLogging; using namespace amd::smi; static const uint32_t kMaxOverdriveLevel = 20; -static const float kEnergyCounterResolution = 15.3f; +static const float kEnergyCounterResolution = 15.3F; -std::map ClkStateMap = { - {RSMI_CLK_TYPE_SYS, "SCLK"}, - {RSMI_CLK_TYPE_DF, "DFCLK"}, - {RSMI_CLK_TYPE_DCEF, "DCEFCLK"}, - {RSMI_CLK_TYPE_SOC, "SOCCLK"}, - {RSMI_CLK_TYPE_MEM, "MCLK"}, - {RSMI_CLK_TYPE_PCIE, "PCIECLK"}, - }; +static const std::map kClkStateMap = { + { RSMI_CLK_TYPE_SYS, "SCLK" }, + { RSMI_CLK_TYPE_DF, "DFCLK" }, + { RSMI_CLK_TYPE_DCEF, "DCEFCLK" }, + { RSMI_CLK_TYPE_SOC, "SOCCLK" }, + { RSMI_CLK_TYPE_MEM, "MCLK" }, + { RSMI_CLK_TYPE_PCIE, "PCIECLK" }, +}; + +static const std::map kClkTypeMap = { + { RSMI_CLK_TYPE_SYS, amd::smi::kDevGPUSClk }, + { RSMI_CLK_TYPE_MEM, amd::smi::kDevGPUMClk }, + { RSMI_CLK_TYPE_DF, amd::smi::kDevFClk }, + { RSMI_CLK_TYPE_DCEF, amd::smi::kDevDCEFClk }, + { RSMI_CLK_TYPE_SOC, amd::smi::kDevSOCClk }, +}; #define TRY try { #define CATCH } catch (...) {return amd::smi::handleException();} @@ -156,7 +164,7 @@ static uint64_t freq_string_to_int(const std::vector &freq_lines, } if (is_curr != nullptr) { - if (freq_lines[i].find("*") != std::string::npos) { + if (freq_lines[i].find('*') != std::string::npos) { *is_curr = true; } else { *is_curr = false; @@ -167,7 +175,7 @@ static uint64_t freq_string_to_int(const std::vector &freq_lines, if (star_str[0] == 'x') { assert(lanes != nullptr && "Lanes are provided but null lanes pointer"); if (lanes) { - if (star_str.substr(1) == "") { + if (star_str.substr(1).empty()) { throw amd::smi::rsmi_exception(RSMI_STATUS_NO_DATA, __FUNCTION__); } @@ -209,8 +217,6 @@ static void freq_volt_string_to_point(std::string in_line, multiplier = get_multiplier_from_str(volts_units_str[0]); pt->voltage = static_cast(volts*multiplier); - - return; } static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) { @@ -237,8 +243,6 @@ static void od_value_pair_str_to_range(std::string in_line, rsmi_range_t *rg) { multiplier = get_multiplier_from_str(hi_units_str[0]); rg->upper_bound = static_cast(hi*multiplier); - - return; } /** @@ -258,7 +262,7 @@ power_prof_string_to_int(std::string pow_prof_line, bool *is_curr, fs >> *prof_ind; fs >> mode; - while (1) { + while (true) { tmp = mode.find_last_of("* :"); if (tmp == std::string::npos) { break; @@ -267,7 +271,7 @@ power_prof_string_to_int(std::string pow_prof_line, bool *is_curr, } if (is_curr != nullptr) { - if (pow_prof_line.find("*") != std::string::npos) { + if (pow_prof_line.find('*') != std::string::npos) { *is_curr = true; } else { *is_curr = false; @@ -759,14 +763,13 @@ rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) { TRY rsmi_status_t ret; - uint64_t val = 0; CHK_SUPPORT_NAME_ONLY(numa_node) DEVICE_MUTEX std::string str_val; ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val); - *numa_node = std::stol(str_val, 0); + *numa_node = std::stoi(str_val, nullptr); return ret; CATCH @@ -1005,13 +1008,10 @@ rsmi_dev_mem_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { } rsmi_status_t -rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od) { +rsmi_dev_overdrive_level_set(uint32_t dv_ind, uint32_t od) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - if (dv_ind < 0) { - return RSMI_STATUS_INVALID_ARGS; - } return rsmi_dev_overdrive_level_set_v1(static_cast(dv_ind), od); } @@ -1032,11 +1032,11 @@ rsmi_dev_overdrive_level_set_v1(uint32_t dv_ind, uint32_t od) { } rsmi_status_t -rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_level) { +rsmi_dev_perf_level_set(uint32_t dv_ind, rsmi_dev_perf_level_t perf_level) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - return rsmi_dev_perf_level_set_v1(static_cast(dv_ind), perf_level); + return rsmi_dev_perf_level_set_v1(dv_ind, perf_level); } rsmi_status_t @@ -1075,7 +1075,7 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ } assert(val_vec.size() <= RSMI_MAX_NUM_FREQUENCIES); - if (val_vec.size() == 0) { + if (val_vec.empty()) { return RSMI_STATUS_NOT_YET_IMPLEMENTED; } @@ -1090,7 +1090,8 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ // Check that that is true. if (i > 0) { if (f->frequency[i] < f->frequency[i-1]) { - std::string sysvalue = ClkStateMap[clk_type]; + std::string sysvalue; + sysvalue += kClkStateMap.find(clk_type)->second; sysvalue += " Current Value"; sysvalue += ' ' + std::to_string(f->frequency[i]); sysvalue += " Previous Value"; @@ -1101,7 +1102,8 @@ static rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, rsmi_clk_type_ if (current) { // set the current frequency if (f->current != RSMI_MAX_NUM_FREQUENCIES + 1) { - std::string sysvalue = ClkStateMap[clk_type]; + std::string sysvalue; + sysvalue += kClkStateMap.find(clk_type)->second; sysvalue += " Current Value"; sysvalue += ' ' + std::to_string(f->frequency[i]); sysvalue += " Previous Value"; @@ -1140,7 +1142,7 @@ static rsmi_status_t get_power_profiles(uint32_t dv_ind, return ret; } assert(val_vec.size() <= RSMI_MAX_NUM_POWER_PROFILES); - if (val_vec.size() > RSMI_MAX_NUM_POWER_PROFILES + 1 || val_vec.size() < 1) { + if (val_vec.size() > RSMI_MAX_NUM_POWER_PROFILES + 1 || val_vec.empty()) { return RSMI_STATUS_UNEXPECTED_SIZE; } // -1 for the header line, below @@ -1326,8 +1328,9 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, LOG_TRACE(ss); assert(minclkvalue < maxclkvalue); - std::string min_sysvalue, max_sysvalue; - std::map ClkStateMap = { + std::string min_sysvalue; + std::string max_sysvalue; + std::map clk_char_map = { {RSMI_CLK_TYPE_SYS, "s"}, {RSMI_CLK_TYPE_MEM, "m"}, }; @@ -1345,11 +1348,11 @@ rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, // minimum clock. And 1 if to set maximum clock. E.g., "s 0 500" will update // minimum sclk to be 500 MHz. "m 1 800" will update maximum mclk to 800Mhz. - min_sysvalue = ClkStateMap[clkType]; + min_sysvalue = clk_char_map[clkType]; min_sysvalue += ' ' + std::to_string(RSMI_FREQ_IND_MIN); min_sysvalue += ' ' + std::to_string(minclkvalue); min_sysvalue += '\n'; - max_sysvalue = ClkStateMap[clkType]; + max_sysvalue = clk_char_map[clkType]; max_sysvalue += ' ' + std::to_string(RSMI_FREQ_IND_MAX); max_sysvalue += ' ' + std::to_string(maxclkvalue); max_sysvalue += '\n'; @@ -1381,7 +1384,7 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, LOG_TRACE(ss); std::string sysvalue; - std::map ClkStateMap = { + std::map clk_char_map = { {RSMI_CLK_TYPE_SYS, "s"}, {RSMI_CLK_TYPE_MEM, "m"}, }; @@ -1400,14 +1403,8 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, switch (clkType) { case RSMI_CLK_TYPE_SYS: - sysvalue = ClkStateMap[clkType]; - sysvalue += ' ' + std::to_string(level); - sysvalue += ' ' + std::to_string(clkvalue); - sysvalue += '\n'; - break; - case RSMI_CLK_TYPE_MEM: - sysvalue = ClkStateMap[clkType]; + sysvalue = clk_char_map[clkType]; sysvalue += ' ' + std::to_string(level); sysvalue += ' ' + std::to_string(clkvalue); sysvalue += '\n'; @@ -1487,7 +1484,6 @@ static void get_vc_region(uint32_t start_ind, } od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); od_value_pair_str_to_range((*val_vec)[start_ind + 1], &p->volt_range); - return; } /* @@ -1614,24 +1610,11 @@ rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, CHK_SUPPORT_VAR(f, clk_type) - switch (clk_type) { - case RSMI_CLK_TYPE_SYS: - dev_type = amd::smi::kDevGPUSClk; - break; - case RSMI_CLK_TYPE_MEM: - dev_type = amd::smi::kDevGPUMClk; - break; - case RSMI_CLK_TYPE_DF: - dev_type = amd::smi::kDevFClk; - break; - case RSMI_CLK_TYPE_DCEF: - dev_type = amd::smi::kDevDCEFClk; - break; - case RSMI_CLK_TYPE_SOC: - dev_type = amd::smi::kDevSOCClk; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + const auto & clk_type_it = kClkTypeMap.find(clk_type); + if (clk_type_it != kClkTypeMap.end()) { + dev_type = clk_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } DEVICE_MUTEX @@ -1653,72 +1636,35 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, std::string val_str; amd::smi::DevInfoTypes dev_type; - switch (block) { - case RSMI_FW_BLOCK_ASD: - dev_type = amd::smi::kDevFwVersionAsd; - break; - case RSMI_FW_BLOCK_CE: - dev_type = amd::smi::kDevFwVersionCe; - break; - case RSMI_FW_BLOCK_DMCU: - dev_type = amd::smi::kDevFwVersionDmcu; - break; - case RSMI_FW_BLOCK_MC: - dev_type = amd::smi::kDevFwVersionMc; - break; - case RSMI_FW_BLOCK_ME: - dev_type = amd::smi::kDevFwVersionMe; - break; - case RSMI_FW_BLOCK_MEC: - dev_type = amd::smi::kDevFwVersionMec; - break; - case RSMI_FW_BLOCK_MEC2: - dev_type = amd::smi::kDevFwVersionMec2; - break; - case RSMI_FW_BLOCK_PFP: - dev_type = amd::smi::kDevFwVersionPfp; - break; - case RSMI_FW_BLOCK_RLC: - dev_type = amd::smi::kDevFwVersionRlc; - break; - case RSMI_FW_BLOCK_RLC_SRLC: - dev_type = amd::smi::kDevFwVersionRlcSrlc; - break; - case RSMI_FW_BLOCK_RLC_SRLG: - dev_type = amd::smi::kDevFwVersionRlcSrlg; - break; - case RSMI_FW_BLOCK_RLC_SRLS: - dev_type = amd::smi::kDevFwVersionRlcSrls; - break; - case RSMI_FW_BLOCK_SDMA: - dev_type = amd::smi::kDevFwVersionSdma; - break; - case RSMI_FW_BLOCK_SDMA2: - dev_type = amd::smi::kDevFwVersionSdma2; - break; - case RSMI_FW_BLOCK_SMC: - dev_type = amd::smi::kDevFwVersionSmc; - break; - case RSMI_FW_BLOCK_SOS: - dev_type = amd::smi::kDevFwVersionSos; - break; - case RSMI_FW_BLOCK_TA_RAS: - dev_type = amd::smi::kDevFwVersionTaRas; - break; - case RSMI_FW_BLOCK_TA_XGMI: - dev_type = amd::smi::kDevFwVersionTaXgmi; - break; - case RSMI_FW_BLOCK_UVD: - dev_type = amd::smi::kDevFwVersionUvd; - break; - case RSMI_FW_BLOCK_VCE: - dev_type = amd::smi::kDevFwVersionVce; - break; - case RSMI_FW_BLOCK_VCN: - dev_type = amd::smi::kDevFwVersionVcn; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + static const std::map kFWBlockTypeMap = { + { RSMI_FW_BLOCK_ASD, amd::smi::kDevFwVersionAsd }, + { RSMI_FW_BLOCK_CE, amd::smi::kDevFwVersionCe }, + { RSMI_FW_BLOCK_DMCU, amd::smi::kDevFwVersionDmcu }, + { RSMI_FW_BLOCK_MC, amd::smi::kDevFwVersionMc }, + { RSMI_FW_BLOCK_ME, amd::smi::kDevFwVersionMe }, + { RSMI_FW_BLOCK_MEC, amd::smi::kDevFwVersionMec }, + { RSMI_FW_BLOCK_MEC2, amd::smi::kDevFwVersionMec2 }, + { RSMI_FW_BLOCK_PFP, amd::smi::kDevFwVersionPfp }, + { RSMI_FW_BLOCK_RLC, amd::smi::kDevFwVersionRlc }, + { RSMI_FW_BLOCK_RLC_SRLC, amd::smi::kDevFwVersionRlcSrlc }, + { RSMI_FW_BLOCK_RLC_SRLG, amd::smi::kDevFwVersionRlcSrlg }, + { RSMI_FW_BLOCK_RLC_SRLS, amd::smi::kDevFwVersionRlcSrls }, + { RSMI_FW_BLOCK_SDMA, amd::smi::kDevFwVersionSdma }, + { RSMI_FW_BLOCK_SDMA2, amd::smi::kDevFwVersionSdma2 }, + { RSMI_FW_BLOCK_SMC, amd::smi::kDevFwVersionSmc }, + { RSMI_FW_BLOCK_SOS, amd::smi::kDevFwVersionSos }, + { RSMI_FW_BLOCK_TA_RAS, amd::smi::kDevFwVersionTaRas }, + { RSMI_FW_BLOCK_TA_XGMI, amd::smi::kDevFwVersionTaXgmi }, + { RSMI_FW_BLOCK_UVD, amd::smi::kDevFwVersionUvd }, + { RSMI_FW_BLOCK_VCE, amd::smi::kDevFwVersionVce }, + { RSMI_FW_BLOCK_VCN, amd::smi::kDevFwVersionVcn }, + }; + + const auto & dev_type_it = kFWBlockTypeMap.find(block); + if (dev_type_it != kFWBlockTypeMap.end()) { + dev_type = dev_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } DEVICE_MUTEX @@ -1728,7 +1674,7 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, static std::string bitfield_to_freq_string(uint64_t bitf, uint32_t num_supported) { - std::string bf_str(""); + std::string bf_str; std::bitset bs(bitf); if (num_supported > RSMI_MAX_NUM_FREQUENCIES) { @@ -1793,24 +1739,11 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, int ret_i; amd::smi::DevInfoTypes dev_type; - switch (clk_type) { - case RSMI_CLK_TYPE_SYS: - dev_type = amd::smi::kDevGPUSClk; - break; - case RSMI_CLK_TYPE_MEM: - dev_type = amd::smi::kDevGPUMClk; - break; - case RSMI_CLK_TYPE_DF: - dev_type = amd::smi::kDevFClk; - break; - case RSMI_CLK_TYPE_SOC: - dev_type = amd::smi::kDevSOCClk; - break; - case RSMI_CLK_TYPE_DCEF: - dev_type = amd::smi::kDevDCEFClk; - break; - default: - return RSMI_STATUS_INVALID_ARGS; + const auto & clk_type_it = kClkTypeMap.find(clk_type); + if (clk_type_it != kClkTypeMap.end()) { + dev_type = clk_type_it->second; + } else { + return RSMI_STATUS_INVALID_ARGS; } ret_i = dev->writeDevInfo(dev_type, freq_enable_str); @@ -1878,7 +1811,7 @@ get_id_name_str_from_line(uint64_t id, std::string ln, *ln_str >> token1; - if (token1 == "") { + if (token1.empty()) { throw amd::smi::rsmi_exception(RSMI_STATUS_NO_DATA, __FUNCTION__); } @@ -1991,13 +1924,13 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, } } - for (auto fl : pci_name_files) { + for (const auto& fl : pci_name_files) { std::ifstream id_file_strm(fl); while (std::getline(id_file_strm, ln)) { std::istringstream ln_str(ln); // parse line - if (ln[0] == '#' || ln.size() == 0) { + if (ln[0] == '#' || ln.empty()) { continue; } @@ -2008,29 +1941,28 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, if (typ == NAME_STR_SUBSYS && found_device_id_for_subsys) { val_str = get_id_name_str_from_line(subsys_vend_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { // We've chopped the subsys_vend ID, now we need to get the // subsys description val_str = get_id_name_str_from_line(subsys_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { break; - } else { - val_str.clear(); } + val_str.clear(); } } } else if (typ == NAME_STR_DEVICE) { // ln[1] != '\t' // This is a device line val_str = get_id_name_str_from_line(device_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { break; } } else if (typ == NAME_STR_SUBSYS) { // match the device id line val_str = get_id_name_str_from_line(device_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { found_device_id_for_subsys = true; } } @@ -2048,22 +1980,21 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, val_str = get_id_name_str_from_line(vendor_id, ln, &ln_str); - if (val_str.size() > 0) { + if (!val_str.empty()) { if (typ == NAME_STR_VENDOR) { break; - } else { - val_str.clear(); - found_device_vendor = true; } + val_str.clear(); + found_device_vendor = true; } } } - if (val_str.size() > 0) { + if (!val_str.empty()) { break; } } - if (val_str.size() == 0) { + if (val_str.empty()) { return get_backup_name(vendor_id, name, len); } size_t ct = val_str.copy(name, len); @@ -2293,8 +2224,8 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t); // Calculate the index - int width_index = -1; - int speed_index = -1; + uint32_t width_index = -1; + uint32_t speed_index = -1; uint32_t cur_index = 0; for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) { if (link_width[cur_index] == gpu_metrics.pcie_link_width) { @@ -2302,8 +2233,7 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { break; } } - for (cur_index = 0; - cur_index < SPEED_DATA_LENGTH; cur_index++) { + for (cur_index = 0; cur_index < SPEED_DATA_LENGTH; cur_index++) { if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) { speed_index = cur_index; break; @@ -2315,11 +2245,10 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { // Set possible lanes and frequencies b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index; - for (cur_index = 0; - cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { - b->transfer_rate.frequency[cur_index] - = link_speed[cur_index/WIDTH_DATA_LENGTH] * 100 * 1000000L; - b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH]; + for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { + b->transfer_rate.frequency[cur_index] = + static_cast(link_speed[cur_index/WIDTH_DATA_LENGTH]) * 100 * 1000000L; + b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH]; } /* frequency = {2500, 2500, 2500, 2500, 2500, 2500, @@ -2429,54 +2358,29 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, LOG_TRACE(ss); rsmi_status_t ret; - amd::smi::MonitorTypes mon_type; + amd::smi::MonitorTypes mon_type = amd::smi::kMonInvalid; uint16_t val_ui16; - switch (metric) { - case RSMI_TEMP_CURRENT: - mon_type = amd::smi::kMonTemp; - break; - case RSMI_TEMP_MAX: - mon_type = amd::smi::kMonTempMax; - break; - case RSMI_TEMP_MIN: - mon_type = amd::smi::kMonTempMin; - break; - case RSMI_TEMP_MAX_HYST: - mon_type = amd::smi::kMonTempMaxHyst; - break; - case RSMI_TEMP_MIN_HYST: - mon_type = amd::smi::kMonTempMinHyst; - break; - case RSMI_TEMP_CRITICAL: - mon_type = amd::smi::kMonTempCritical; - break; - case RSMI_TEMP_CRITICAL_HYST: - mon_type = amd::smi::kMonTempCriticalHyst; - break; - case RSMI_TEMP_EMERGENCY: - mon_type = amd::smi::kMonTempEmergency; - break; - case RSMI_TEMP_EMERGENCY_HYST: - mon_type = amd::smi::kMonTempEmergencyHyst; - break; - case RSMI_TEMP_CRIT_MIN: - mon_type = amd::smi::kMonTempCritMin; - break; - case RSMI_TEMP_CRIT_MIN_HYST: - mon_type = amd::smi::kMonTempCritMinHyst; - break; - case RSMI_TEMP_OFFSET: - mon_type = amd::smi::kMonTempOffset; - break; - case RSMI_TEMP_LOWEST: - mon_type = amd::smi::kMonTempLowest; - break; - case RSMI_TEMP_HIGHEST: - mon_type = amd::smi::kMonTempHighest; - break; - default: - mon_type = amd::smi::kMonInvalid; + static const std::map kMetricTypeMap = { + { RSMI_TEMP_CURRENT, amd::smi::kMonTemp }, + { RSMI_TEMP_MAX, amd::smi::kMonTempMax }, + { RSMI_TEMP_MIN, amd::smi::kMonTempMin }, + { RSMI_TEMP_MAX_HYST, amd::smi::kMonTempMaxHyst }, + { RSMI_TEMP_MIN_HYST, amd::smi::kMonTempMinHyst }, + { RSMI_TEMP_CRITICAL, amd::smi::kMonTempCritical }, + { RSMI_TEMP_CRITICAL_HYST, amd::smi::kMonTempCriticalHyst }, + { RSMI_TEMP_EMERGENCY, amd::smi::kMonTempEmergency }, + { RSMI_TEMP_EMERGENCY_HYST, amd::smi::kMonTempEmergencyHyst }, + { RSMI_TEMP_CRIT_MIN, amd::smi::kMonTempCritMin }, + { RSMI_TEMP_CRIT_MIN_HYST, amd::smi::kMonTempCritMinHyst }, + { RSMI_TEMP_OFFSET, amd::smi::kMonTempOffset }, + { RSMI_TEMP_LOWEST, amd::smi::kMonTempLowest }, + { RSMI_TEMP_HIGHEST, amd::smi::kMonTempHighest }, + }; + + const auto mon_type_it = kMetricTypeMap.find(metric); + if (mon_type_it != kMetricTypeMap.end()) { + mon_type = mon_type_it->second; } if (temperature == nullptr) { @@ -2492,80 +2396,81 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, return RSMI_STATUS_INVALID_ARGS; } - // The HBM temperature is retreived from the gpu_metrics - if (sensor_type == RSMI_TEMP_TYPE_HBM_0 - || sensor_type == RSMI_TEMP_TYPE_HBM_1 - || sensor_type == RSMI_TEMP_TYPE_HBM_2 - || sensor_type == RSMI_TEMP_TYPE_HBM_3) { - if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: To retreive HBM temp, we only support metric = " - << "RSMI_TEMP_CURRENT" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } + // The HBM temperature is retrieved from the gpu_metrics + if (sensor_type == RSMI_TEMP_TYPE_HBM_0 || + sensor_type == RSMI_TEMP_TYPE_HBM_1 || + sensor_type == RSMI_TEMP_TYPE_HBM_2 || + sensor_type == RSMI_TEMP_TYPE_HBM_3) { + if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: To retrieve HBM temp, we only support metric = " + << "RSMI_TEMP_CURRENT" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } - rsmi_gpu_metrics_t gpu_metrics; - ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); - if (ret != RSMI_STATUS_SUCCESS) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: rsmi_dev_gpu_metrics_info_get returned " - << getRSMIStatusString(ret) - << " | Returning = " - << getRSMIStatusString(ret) << " |"; - LOG_ERROR(ss); - return ret; - } + rsmi_gpu_metrics_t gpu_metrics; + ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: rsmi_dev_gpu_metrics_info_get returned " + << getRSMIStatusString(ret) + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); + return ret; + } - switch (sensor_type) { - case RSMI_TEMP_TYPE_HBM_0: - val_ui16 = gpu_metrics.temperature_hbm[0]; - break; - case RSMI_TEMP_TYPE_HBM_1: - val_ui16 = gpu_metrics.temperature_hbm[1]; - break; - case RSMI_TEMP_TYPE_HBM_2: - val_ui16 = gpu_metrics.temperature_hbm[2]; - break; - case RSMI_TEMP_TYPE_HBM_3: - val_ui16 = gpu_metrics.temperature_hbm[3]; - break; - default: - return RSMI_STATUS_INVALID_ARGS; - } - if (val_ui16 == UINT16_MAX) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Cause: Reached UINT16 max value, overflow" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } else - *temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE; + switch (sensor_type) { + case RSMI_TEMP_TYPE_HBM_0: + val_ui16 = gpu_metrics.temperature_hbm[0]; + break; + case RSMI_TEMP_TYPE_HBM_1: + val_ui16 = gpu_metrics.temperature_hbm[1]; + break; + case RSMI_TEMP_TYPE_HBM_2: + val_ui16 = gpu_metrics.temperature_hbm[2]; + break; + case RSMI_TEMP_TYPE_HBM_3: + val_ui16 = gpu_metrics.temperature_hbm[3]; + break; + default: + return RSMI_STATUS_INVALID_ARGS; + } + if (val_ui16 == UINT16_MAX) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: Reached UINT16 max value, overflow" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_NOT_SUPPORTED; + } - ss << __PRETTY_FUNCTION__ << " | ======= end ======= " - << " | Success " - << " | Device #: " << dv_ind - << " | Type: " << monitorTypesToString.at(mon_type) - << " | Data: " << *temperature - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; - LOG_INFO(ss); - return RSMI_STATUS_SUCCESS; + *temperature = static_cast(val_ui16) * CENTRIGRADE_TO_MILLI_CENTIGRADE; + + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Data: " << *temperature + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; + LOG_INFO(ss); + return RSMI_STATUS_SUCCESS; } // end HBM temperature DEVICE_MUTEX @@ -2811,7 +2716,7 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { } rsmi_status_t -rsmi_dev_gpu_reset(int32_t dv_ind) { +rsmi_dev_gpu_reset(uint32_t dv_ind) { TRY std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; @@ -2985,7 +2890,8 @@ rsmi_status_t rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap) { TRY rsmi_status_t ret; - uint64_t min, max; + uint64_t min; + uint64_t max; std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); @@ -3354,10 +3260,10 @@ rsmi_utilization_count_get(uint32_t dv_ind, default: return RSMI_STATUS_INVALID_ARGS; } - if (val_ui32 == UINT32_MAX) + if (val_ui32 == UINT32_MAX) { return RSMI_STATUS_NOT_SUPPORTED; - else - utilization_counters[index].value = val_ui32; + } + utilization_counters[index].value = val_ui32; } *timestamp = gpu_metrics.system_clock_counter; @@ -3583,7 +3489,7 @@ rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle) { rsmi_status_t rsmi_counter_control(rsmi_event_handle_t evt_handle, - rsmi_counter_command_t cmd, void *) { + rsmi_counter_command_t cmd, void * /*unused*/) { TRY amd::smi::evt::Event *evt = @@ -3644,9 +3550,9 @@ rsmi_counter_read(rsmi_event_handle_t evt_handle, } if (ret == 0) { return RSMI_STATUS_SUCCESS; - } else { - return RSMI_STATUS_UNEXPECTED_SIZE; } + + return RSMI_STATUS_UNEXPECTED_SIZE; CATCH } @@ -3689,9 +3595,9 @@ rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group) { if (grp->find(group) == grp->end()) { return RSMI_STATUS_NOT_SUPPORTED; - } else { - return RSMI_STATUS_SUCCESS; } + + return RSMI_STATUS_SUCCESS; CATCH } @@ -4183,7 +4089,8 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, return RSMI_STATUS_INVALID_ARGS; } - uint32_t node_ind_src, node_ind_dst; + uint32_t node_ind_src; + uint32_t node_ind_dst; // Fetch the source and destination GPU node index if (smi.get_node_index(dv_ind_src, &node_ind_src) || smi.get_node_index(dv_ind_dst, &node_ind_dst)) { @@ -4241,19 +4148,13 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { } switch (mapStringToRSMIComputePartitionTypes[compute_partition_str]) { - case RSMI_COMPUTE_PARTITION_INVALID: - // Retrieved an unknown compute partition - return RSMI_STATUS_UNEXPECTED_DATA; case RSMI_COMPUTE_PARTITION_CPX: - break; case RSMI_COMPUTE_PARTITION_SPX: - break; case RSMI_COMPUTE_PARTITION_DPX: - break; case RSMI_COMPUTE_PARTITION_TPX: - break; case RSMI_COMPUTE_PARTITION_QPX: break; + case RSMI_COMPUTE_PARTITION_INVALID: default: // Retrieved an unknown compute partition return RSMI_STATUS_UNEXPECTED_DATA; @@ -4326,19 +4227,13 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, std::string currentComputePartition; switch (compute_partition) { - case RSMI_COMPUTE_PARTITION_INVALID: - // Retrieved an unknown compute partition - return RSMI_STATUS_INVALID_ARGS; case RSMI_COMPUTE_PARTITION_CPX: - break; case RSMI_COMPUTE_PARTITION_SPX: - break; case RSMI_COMPUTE_PARTITION_DPX: - break; case RSMI_COMPUTE_PARTITION_TPX: - break; case RSMI_COMPUTE_PARTITION_QPX: break; + case RSMI_COMPUTE_PARTITION_INVALID: default: return RSMI_STATUS_INVALID_ARGS; } @@ -4385,17 +4280,12 @@ static rsmi_status_t get_nps_mode(uint32_t dv_ind, std::string &nps_mode) { } switch (mapStringToNPSModeTypes[val_str]) { - case RSMI_MEMORY_PARTITION_UNKNOWN: - // Retrieved an unknown NPS mode - return RSMI_STATUS_UNEXPECTED_DATA; case RSMI_MEMORY_PARTITION_NPS1: - break; case RSMI_MEMORY_PARTITION_NPS2: - break; case RSMI_MEMORY_PARTITION_NPS4: - break; case RSMI_MEMORY_PARTITION_NPS8: break; + case RSMI_MEMORY_PARTITION_UNKNOWN: default: // Retrieved an unknown NPS mode return RSMI_STATUS_UNEXPECTED_DATA; @@ -4429,7 +4319,7 @@ rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode) { } } - if (isCorrectDevice == false) { + if (!isCorrectDevice) { return RSMI_STATUS_NOT_SUPPORTED; } @@ -4438,17 +4328,12 @@ rsmi_dev_nps_mode_set(uint32_t dv_ind, rsmi_nps_mode_type_t nps_mode) { std::string currentNPSMode; switch (nps_mode) { - case RSMI_MEMORY_PARTITION_UNKNOWN: - // Retrieved an unknown NPS mode - return RSMI_STATUS_INVALID_ARGS; case RSMI_MEMORY_PARTITION_NPS1: - break; case RSMI_MEMORY_PARTITION_NPS2: - break; case RSMI_MEMORY_PARTITION_NPS4: - break; case RSMI_MEMORY_PARTITION_NPS8: break; + case RSMI_MEMORY_PARTITION_UNKNOWN: default: return RSMI_STATUS_INVALID_ARGS; } @@ -4584,19 +4469,19 @@ rsmi_dev_supported_func_iterator_open(uint32_t dv_ind, if (dev->supported_funcs()->begin() == dev->supported_funcs()->end()) { delete *handle; return RSMI_STATUS_NO_DATA; - } else { - SupportedFuncMapIt *supp_func_iter = new SupportedFuncMapIt; - - if (supp_func_iter == nullptr) { - return RSMI_STATUS_OUT_OF_RESOURCES; - } - *supp_func_iter = dev->supported_funcs()->begin(); - - (*handle)->func_id_iter = reinterpret_cast(supp_func_iter); - (*handle)->container_ptr = - reinterpret_cast(dev->supported_funcs()); } + SupportedFuncMapIt *supp_func_iter = new SupportedFuncMapIt; + + if (supp_func_iter == nullptr) { + return RSMI_STATUS_OUT_OF_RESOURCES; + } + *supp_func_iter = dev->supported_funcs()->begin(); + + (*handle)->func_id_iter = reinterpret_cast(supp_func_iter); + (*handle)->container_ptr = + reinterpret_cast(dev->supported_funcs()); + return RSMI_STATUS_SUCCESS; CATCH @@ -4963,7 +4848,8 @@ rsmi_event_notification_get(int timeout_ms, if (*num_elem < buffer_size && errno != EAGAIN) { return amd::smi::ErrnoToRsmiStatus(errno); - } else if (*num_elem >= buffer_size) { + } + if (*num_elem >= buffer_size) { return RSMI_STATUS_SUCCESS; } @@ -5039,7 +4925,7 @@ rsmi_test_refcount(uint64_t refcnt_type) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); std::lock_guard guard(*smi.bootstrap_mutex()); - if (smi.ref_count() == 0 && smi.devices().size() != 0) { + if (smi.ref_count() == 0 && !smi.devices().empty()) { return -1; } diff --git a/src/rocm_smi_counters.cc b/src/rocm_smi_counters.cc index 9f82798183..a08819568e 100755 --- a/src/rocm_smi_counters.cc +++ b/src/rocm_smi_counters.cc @@ -41,20 +41,20 @@ * */ -#include -#include -#include -#include #include +#include #include #include -#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include #include #include "rocm_smi/rocm_smi.h" @@ -164,8 +164,7 @@ GetSupportedEventGroups(uint32_t dev_num, dev_evt_grp_set_t *supported_grps) { } // /sys/bus/event_source/devices/_/type Event::Event(rsmi_event_type_t event, uint32_t dev_ind) : - event_type_(event), prev_cntr_val_(0) { - fd_ = -1; + event_type_(event), fd_(-1), prev_cntr_val_(0) { rsmi_event_group_t grp = EvtGrpFromEvtID(event); assert(grp != RSMI_EVNT_GRP_INVALID); // This should have failed before now @@ -398,10 +397,11 @@ readn(int fd, void *buf, size_t n) { return static_cast(n - left); } if (bytes < 0) { - if (errno == EINTR) /* read got interrupted */ + if (errno == EINTR) { + /* read got interrupted */ continue; - else - return -errno; + } + return -errno; } left -= static_cast(bytes); diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 3830d602aa..95f27d8e12 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -43,30 +43,28 @@ #include #include -#include -#include #include -#include +#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" -#include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi_logger.h" #include "shared_mutex.h" // NOLINT @@ -689,7 +687,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) { int ret; std::ostringstream ss; - fs.rdbuf()->pubsetbuf(0,0); + fs.rdbuf()->pubsetbuf(nullptr,0); ret = openSysfsFileStream(type, &fs, valStr.c_str()); if (ret != 0) { ss << "Could not write device info string (" << valStr @@ -856,7 +854,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { ss << "Read devInfoMultiLineStr for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")" << ", but contained no string lines"; @@ -864,13 +862,13 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, return 0; } // Remove any *trailing* empty (whitespace) lines - while (retVec->size() != 0 && + while (!retVec->empty() && retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) { retVec->pop_back(); } // allow logging output of multiline strings - for (auto l: *retVec) { + for (const auto& l: *retVec) { allLines += "\n" + l; } @@ -905,10 +903,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - tmp_val = std::stoi(tempStr, 0, 16); + tmp_val = std::stoi(tempStr, nullptr, 16); if (tmp_val < 0) { return EINVAL; } @@ -930,10 +928,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevXGMIError: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - *val = std::stoul(tempStr, 0); + *val = std::stoul(tempStr, nullptr); break; case kDevUniqueId: @@ -960,10 +958,10 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevFwVersionVcn: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); - if (tempStr == "") { + if (tempStr.empty()) { return EINVAL; } - *val = std::stoul(tempStr, 0, 16); + *val = std::stoul(tempStr, nullptr, 16); break; case kDevGpuReset: @@ -1100,7 +1098,7 @@ void Device::DumpSupportedFunctions(void) { } void Device::fillSupportedFuncs(void) { - if (supported_funcs_.size() != 0) { + if (!supported_funcs_.empty()) { return; } if (monitor() == nullptr) { @@ -1140,7 +1138,7 @@ void Device::fillSupportedFuncs(void) { std::vector::const_iterator var = it->second.variants.begin(); - if (it->second.variants.size() == 0) { + if (it->second.variants.empty()) { supported_funcs_[it->first] = nullptr; it++; continue; @@ -1156,7 +1154,7 @@ void Device::fillSupportedFuncs(void) { (*supported_variants)[kDevInfoVarTypeToRSMIVariant.at(*var)] = nullptr; } - if ((*supported_variants).size() > 0) { + if (!(*supported_variants).empty()) { supported_funcs_[it->first] = supported_variants; } @@ -1202,35 +1200,32 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, if (sub_variant == RSMI_DEFAULT_VARIANT) { return true; - } else { // sub_variant != RSMI_DEFAULT_VARIANT - // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr - assert(var_it->second != nullptr); + } + // sub_variant != RSMI_DEFAULT_VARIANT + // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr + assert(var_it->second != nullptr); - return subvariant_match(&(var_it->second), sub_variant); - } - } else { // variant == RSMI_DEFAULT_VARIANT - if (func_it->second != nullptr) { - var_it = func_it->second->find(variant); - } - if (sub_variant == RSMI_DEFAULT_VARIANT) { - return true; - } else { // sub_variant != RSMI_DEFAULT_VARIANT - if (func_it->second == nullptr) { - return false; - } - return subvariant_match(&(var_it->second), sub_variant); - } + return subvariant_match(&(var_it->second), sub_variant); } - assert(false); // We should not reach here - - return false; + // variant == RSMI_DEFAULT_VARIANT + if (func_it->second != nullptr) { + var_it = func_it->second->find(variant); + } + if (sub_variant == RSMI_DEFAULT_VARIANT) { + return true; + } + // sub_variant != RSMI_DEFAULT_VARIANT + if (func_it->second == nullptr) { + return false; + } + return subvariant_match(&(var_it->second), sub_variant); } rsmi_status_t Device::restartAMDGpuDriver(void) { REQUIRE_ROOT_ACCESS bool restartSuccessful = true; bool success = false; - std::string out = ""; + std::string out; bool wasGdmServiceActive = false; // sudo systemctl is-active gdm diff --git a/src/rocm_smi_gpu_metrics.cc b/src/rocm_smi_gpu_metrics.cc index 885c36d7f6..c2ad2e2659 100755 --- a/src/rocm_smi_gpu_metrics.cc +++ b/src/rocm_smi_gpu_metrics.cc @@ -41,23 +41,22 @@ * */ -#include #include - -#include -#include -#include -#include -#include -#include -#include // NOLINT -#include #include -#include + +#include +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include #include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h #include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_logger.h" @@ -150,7 +149,7 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2, const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, const rsmi_gpu_metrics_t *rsmi_gpu_metrics) { - if (RocmSMI::getInstance().isLoggingOn() == false) { + if (!RocmSMI::getInstance().isLoggingOn()) { return; } std::ostringstream ss; @@ -170,9 +169,8 @@ void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, } if (rsmi_gpu_metrics == nullptr) { return; - } else { - // do nothing - continue } + ss /* Common Header */ << print_unsigned_hex_and_int( @@ -365,7 +363,7 @@ static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, } #define ASSIGN_DATA_FIELD(FIELD, SRC) \ - data->FIELD = SRC->FIELD; + data->FIELD = (SRC)->FIELD; #define ASSIGN_COMMON_FORMATS(SRC) \ ASSIGN_DATA_FIELD(common_header, (SRC)) \ diff --git a/src/rocm_smi_io_link.cc b/src/rocm_smi_io_link.cc index 888f13fffa..218e520d84 100755 --- a/src/rocm_smi_io_link.cc +++ b/src/rocm_smi_io_link.cc @@ -41,20 +41,19 @@ * */ -#include -#include #include +#include #include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include "rocm_smi/rocm_smi.h" -#include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_io_link.h" @@ -161,7 +160,7 @@ static int ReadLinkProperties(uint32_t node_indx, uint32_t link_indx, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { fs.close(); return 0; } @@ -182,7 +181,7 @@ static int DiscoverLinks(std::map, if (links == nullptr) { return EINVAL; } - assert(links->size() == 0); + assert(links->empty()); links->clear(); @@ -229,8 +228,8 @@ static int DiscoverLinks(std::map, } link_indx = static_cast(std::stoi(dentry_io_link->d_name)); - link = std::shared_ptr(new IOLink(node_indx, link_indx, - directory)); + link = std::make_shared(node_indx, link_indx, + directory); link->Initialize(); @@ -273,7 +272,7 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::mapsize() == 0); + assert(links->empty()); links->clear(); @@ -297,8 +296,8 @@ static int DiscoverLinksPerNode(uint32_t node_indx, std::map(std::stoi(dentry->d_name)); - link = std::shared_ptr(new IOLink(node_indx, link_indx, - directory)); + link = std::make_shared(node_indx, link_indx, + directory); link->Initialize(); @@ -323,16 +322,15 @@ int DiscoverP2PLinksPerNode(uint32_t node_indx, std::map propVec; - assert(properties_.size() == 0); - if (properties_.size() > 0) { + assert(properties_.empty()); + if (!properties_.empty()) { return 0; } @@ -347,8 +345,8 @@ int IOLink::ReadProperties(void) { uint64_t val_int; // Assume all properties are unsigned integers for now std::istringstream fs; - for (uint32_t i = 0; i < propVec.size(); ++i) { - fs.str(propVec[i]); + for (const auto & i : propVec) { + fs.str(i); fs >> key_str; fs >> val_int; diff --git a/src/rocm_smi_kfd.cc b/src/rocm_smi_kfd.cc index 092bcb3414..afe567d80d 100755 --- a/src/rocm_smi_kfd.cc +++ b/src/rocm_smi_kfd.cc @@ -41,27 +41,27 @@ * */ -#include -#include -#include -#include -#include #include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include #include "rocm_smi/rocm_smi_io_link.h" #include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" -#include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_main.h" namespace amd { @@ -195,7 +195,7 @@ int ReadKFDDeviceProperties(uint32_t kfd_node_id, retVec->push_back(line); } - if (retVec->size() == 0) { + if (retVec->empty()) { fs.close(); return ENOENT; } @@ -517,7 +517,7 @@ int DiscoverKFDNodes(std::map> *nodes) { if (nodes == nullptr) { return EINVAL; } - assert(nodes->size() == 0); + assert(nodes->empty()); nodes->clear(); @@ -548,7 +548,7 @@ int DiscoverKFDNodes(std::map> *nodes) { continue; } - node = std::shared_ptr(new KFDNode(node_indx)); + node = std::make_shared(node_indx); node->Initialize(); @@ -596,16 +596,15 @@ int DiscoverKFDNodes(std::map> *nodes) { return 0; } -KFDNode::~KFDNode() { -} +KFDNode::~KFDNode() = default; int KFDNode::ReadProperties(void) { int ret; std::vector propVec; - assert(properties_.size() == 0); - if (properties_.size() > 0) { + assert(properties_.empty()); + if (!properties_.empty()) { return 0; } @@ -620,8 +619,8 @@ int KFDNode::ReadProperties(void) { uint64_t val_int; // Assume all properties are unsigned integers for now std::istringstream fs; - for (uint32_t i = 0; i < propVec.size(); ++i) { - fs.str(propVec[i]); + for (const auto & i : propVec) { + fs.str(i); fs >> key_str; fs >> val_int; diff --git a/src/rocm_smi_logger.cc b/src/rocm_smi_logger.cc index 0600654ef3..ccbb12c29a 100644 --- a/src/rocm_smi_logger.cc +++ b/src/rocm_smi_logger.cc @@ -55,7 +55,7 @@ * be printed, unless RSMI_LOGGING is enabled. * * BUFFER log type should be use while logging raw buffer or raw messages - * Having direct interface as well as C++ Singleton inface. Can use + * Having direct interface as well as C++ Singleton iface. Can use * whatever interface fits your needs. */ @@ -70,7 +70,6 @@ // Code Specific Header Files(s) #include "rocm_smi/rocm_smi_logger.h" #include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_utils.h" using namespace ROCmLogging; @@ -117,7 +116,7 @@ void Logger::logIntoFile(std::string& data) { if(!m_File.is_open()) { initialize_resources(); if (!m_File.is_open()) { - std::cout << "WARNING: re-initializing resources was unsuccessfull." + std::cout << "WARNING: re-initializing resources was unsuccessful." <<" Unable to print the following message." << std::endl; logOnConsole(data); unlock(); @@ -164,7 +163,7 @@ void Logger::error(const char* text) throw() { // By default, logging is disabled // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -198,7 +197,7 @@ void Logger::alarm(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -232,7 +231,7 @@ void Logger::always(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -270,7 +269,7 @@ void Logger::buffer(const char* text) throw() { if(!m_File.is_open()) { initialize_resources(); if (!m_File.is_open()) { - std::cout << "WARNING: re-initializing resources was unsuccessfull." + std::cout << "WARNING: re-initializing resources was unsuccessful." <<" Unable to print the following message." << std::endl; std::string txtStr(text); std::cout << txtStr << std::endl; @@ -300,7 +299,7 @@ void Logger::info(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -334,7 +333,7 @@ void Logger::trace(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -368,7 +367,7 @@ void Logger::debug(const char* text) throw() { // By default, logging is disabled (ie. no RSMI_LOGGING) // The check below allows us to toggle logging through RSMI_LOGGING // set or unset - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } @@ -426,7 +425,7 @@ void Logger::enableFileLogging() { // Returns a string of details on current log settings std::string Logger::getLogSettings() { - std::string logSettings = ""; + std::string logSettings; if (m_File.is_open()) { logSettings += "OpenStatus = File (" + logFileName + ") is open"; @@ -490,7 +489,7 @@ void Logger::initialize_resources() { // The check below allows us to toggle logging through RSMI_LOGGING // set or unset m_loggingIsOn = amd::smi::RocmSMI::getInstance().isLoggingOn(); - if (m_loggingIsOn == false) { + if (!m_loggingIsOn) { return; } m_File.open(logFileName.c_str(), std::ios::out | std::ios::app); diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 8cb95fe7f2..1eb973c17e 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -39,25 +39,26 @@ * DEALINGS WITH THE SOFTWARE. * */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include + +#include #include -#include +#include +#include +#include +#include +#include #include +#include +#include #include +#include +#include +#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_device.h" @@ -284,7 +285,8 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { // We are looking for the last element in the path that has the form // XXXX:XX:XX.X, where X is a hex integer (lower case is expected) - std::size_t slash_i, end_i; + std::size_t slash_i; + std::size_t end_i; std::string tmp; std::string tpath_str(tpath); @@ -333,7 +335,7 @@ RocmSMI::Initialize(uint64_t flags) { // To help debug env variable issues // printEnvVarInfo(); - while (std::string(kAMDMonitorTypes[i]) != "") { + while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); ++i; } @@ -347,12 +349,12 @@ RocmSMI::Initialize(uint64_t flags) { } uint64_t bdfid; - for (uint32_t i = 0; i < devices_.size(); ++i) { - if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) { + for (auto & device : devices_) { + if (ConstructBDFID(device->path(), &bdfid) != 0) { std::cerr << "Failed to construct BDFID." << std::endl; ret = 1; } else { - devices_[i]->set_bdfid(bdfid); + device->set_bdfid(bdfid); } } if (ret != 0) { @@ -443,8 +445,7 @@ RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags), kfd_notif_evt_fh_(-1), kfd_notif_evt_fh_refcnt_(0) { } -RocmSMI::~RocmSMI() { -} +RocmSMI::~RocmSMI() = default; RocmSMI& RocmSMI::getInstance(uint64_t flags) { // Assume c++11 or greater. static objects will be created by only 1 thread @@ -493,7 +494,7 @@ static inline std::unordered_set GetEnvVarUIntegerSets( if(ev_str == nullptr) { return returnSet; } std::string stringEnv = ev_str; - if (stringEnv.empty() == false) { + if (!stringEnv.empty()) { // parse out values by commas std::string parsedVal; std::istringstream ev_str_ss(stringEnv); @@ -571,7 +572,7 @@ void RocmSMI::printEnvVarInfo(void) { << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " << getLogSetting() << std::endl; - bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; + bool isLoggingOn = RocmSMI::isLoggingOn(); std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " << (isLoggingOn ? "true" : "false") << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; @@ -637,7 +638,7 @@ RocmSMI::FindMonitor(std::string monitor_path) { fs.close(); if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) { - m = std::shared_ptr(new Monitor(mon_name, &env_vars_)); + m = std::make_shared(mon_name, &env_vars_); m->setTempSensorLabelMap(); m->setVoltSensorLabelMap(); break; @@ -665,12 +666,12 @@ RocmSMI::AddToDeviceList(std::string dev_name) { dev_path += "/"; dev_path += dev_name; - auto dev = std::shared_ptr(new Device(dev_path, &env_vars_)); + auto dev = std::make_shared(dev_path, &env_vars_); std::shared_ptr m = FindMonitor(dev_path + "/device/hwmon"); dev->set_monitor(m); - std::string d_name = dev_name; + const std::string& d_name = dev_name; uint32_t card_indx = GetDeviceIndex(d_name); dev->set_drm_render_minor(GetDrmRenderMinor(dev_path)); dev->set_card_index(card_indx); @@ -681,8 +682,6 @@ RocmSMI::AddToDeviceList(std::string dev_name) { << dev_name << " | path = " << dev_path << " | card index = " << std::to_string(card_indx) << " | "; LOG_DEBUG(ss); - - return; } static const uint32_t kAmdGpuId = 0x1002; @@ -789,7 +788,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { power_mons_.clear(); } - if (power_mons_.size() != 0) { + if (!power_mons_.empty()) { return 0; } @@ -817,7 +816,7 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { if (FileExists(tmp.c_str())) { std::shared_ptr mon = - std::shared_ptr(new PowerMon(mon_name, &env_vars_)); + std::make_shared(mon_name, &env_vars_); power_mons_.push_back(mon); mon->set_dev_index(GetDeviceIndex(dentry->d_name)); } @@ -830,8 +829,8 @@ int RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { return errno; } - for (auto m : power_mons_) { - for (auto d : devices_) { + for (const auto& m : power_mons_) { + for (const auto& d : devices_) { if (m->dev_index() == d->index()) { d->set_power_monitor(m); break; diff --git a/src/rocm_smi_monitor.cc b/src/rocm_smi_monitor.cc index 00035e6307..f0fe75fdbc 100755 --- a/src/rocm_smi_monitor.cc +++ b/src/rocm_smi_monitor.cc @@ -41,19 +41,18 @@ * */ -#include #include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include #include // NOLINT +#include #include -#include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" @@ -286,8 +285,7 @@ static const std::map kMonFuncDependsMap = { env_ = nullptr; #endif } -Monitor::~Monitor(void) { -} +Monitor::~Monitor(void) = default; std::string Monitor::MakeMonitorPath(MonitorTypes type, uint32_t sensor_id) { @@ -339,7 +337,7 @@ Monitor::setTempSensorLabelMap(void) { std::string type_str; int ret; - if (temp_type_index_map_.size() > 0) { + if (!temp_type_index_map_.empty()) { return 0; // We've already filled in the map } auto add_temp_sensor_entry = [&](uint32_t file_index) { @@ -377,7 +375,7 @@ Monitor::setVoltSensorLabelMap(void) { std::string type_str; int ret; - if (volt_type_index_map_.size() > 0) { + if (!volt_type_index_map_.empty()) { return 0; // We've already filled in the map } auto add_volt_sensor_entry = [&](uint32_t file_index) { @@ -510,10 +508,10 @@ typedef enum { static monitor_types getFuncType(std::string f_name) { monitor_types ret = eDefaultMonitor; - if (f_name.compare("rsmi_dev_temp_metric_get") == 0) { + if (f_name == "rsmi_dev_temp_metric_get") { ret = eTempMonitor; } - if (f_name.compare("rsmi_dev_volt_metric_get") == 0) { + if (f_name == "rsmi_dev_volt_metric_get") { ret = eVoltMonitor; } return ret; @@ -614,22 +612,22 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { } else { supported_monitors = intersect; } - if (supported_monitors.size() > 0) { - for (uint32_t i = 0; i < supported_monitors.size(); ++i) { + if (!supported_monitors.empty()) { + for (unsigned long & supported_monitor : supported_monitors) { if (m_type == eDefaultMonitor) { - assert(supported_monitors[i] > 0); - supported_monitors[i] |= - (supported_monitors[i] - 1) << MONITOR_TYPE_BIT_POSITION; + assert(supported_monitor > 0); + supported_monitor |= + (supported_monitor - 1) << MONITOR_TYPE_BIT_POSITION; } else if (m_type == eTempMonitor) { // Temp sensor file names are 1-based - assert(supported_monitors[i] > 0); - supported_monitors[i] |= - static_cast(getTempSensorEnum(supported_monitors[i])) + assert(supported_monitor > 0); + supported_monitor |= + static_cast(getTempSensorEnum(supported_monitor)) << MONITOR_TYPE_BIT_POSITION; } else if (m_type == eVoltMonitor) { // Voltage sensor file names are 0-based - supported_monitors[i] |= - static_cast(getVoltSensorEnum(supported_monitors[i])) + supported_monitor |= + static_cast(getVoltSensorEnum(supported_monitor)) << MONITOR_TYPE_BIT_POSITION; } else { assert(false); // Unexpected monitor type @@ -640,10 +638,10 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { } } - if (it->second.variants.size() == 0) { + if (it->second.variants.empty()) { (*supported_funcs)[it->first] = nullptr; supported_variants = nullptr; // Invoke destructor - } else if ((*supported_variants).size() > 0) { + } else if (!(*supported_variants).empty()) { (*supported_funcs)[it->first] = supported_variants; } diff --git a/src/rocm_smi_power_mon.cc b/src/rocm_smi_power_mon.cc index 3e1d7e0d45..454851651b 100755 --- a/src/rocm_smi_power_mon.cc +++ b/src/rocm_smi_power_mon.cc @@ -41,17 +41,14 @@ * */ -#include - -#include -#include +#include #include -#include +#include #include +#include #include +#include -#include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi_exception.h" @@ -70,8 +67,7 @@ static const std::map kMonitorNameMap = { PowerMon::PowerMon(std::string path, RocmSMI_env_vars const *e) : path_(path), env_(e) { } -PowerMon::~PowerMon(void) { -} +PowerMon::~PowerMon(void) = default; static int parse_power_str(std::string s, PowerMonTypes type, uint64_t *val) { std::stringstream ss(s); diff --git a/src/rocm_smi_properties.cc b/src/rocm_smi_properties.cc index 0e606e6874..b076991881 100644 --- a/src/rocm_smi_properties.cc +++ b/src/rocm_smi_properties.cc @@ -90,7 +90,6 @@ AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) { static_cast(AMDGpuPropertyTypesOffset_t::kClkTypes) | static_cast(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes); - auto property_type_offset = (static_cast(property_type_offset_mask) & (property_id)); auto property_type_id = (static_cast(property_id) & ~(property_type_offset_mask)); return property_type_id; @@ -435,7 +434,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id); } } - is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? true : false; + is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS); return tmp_amdgpu_query; }; @@ -475,13 +474,6 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end()); }; - auto ends_with = [](const std::string& value, const std::string& ending) { - if (value.size() < ending.size()) { - return false; - } - return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); - }; - // Traverse through all values for a given key osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; LOG_TRACE(osstream); diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 5009d3bd14..973d555d26 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -40,26 +40,26 @@ * DEALINGS WITH THE SOFTWARE. * */ -#include -#include -#include -#include + #include #include +#include #include +#include -#include -#include -#include +#include +#include +#include #include +#include +#include #include #include -#include -#include -#include #include -#include +#include +#include #include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -137,7 +137,7 @@ std::vector globFilesExist(const std::string& filePattern) { glob_t result_glob; memset(&result_glob, 0, sizeof(result_glob)); - if (glob(filePattern.c_str(), GLOB_TILDE, NULL, &result_glob) != 0) { + if (glob(filePattern.c_str(), GLOB_TILDE, nullptr, &result_glob) != 0) { globfree(&result_glob); // Leaving below to help debug issues discovering future glob file searches // debugFilesDiscovered(fileNames); @@ -145,7 +145,7 @@ std::vector globFilesExist(const std::string& filePattern) { } for(size_t i = 0; i < result_glob.gl_pathc; ++i) { - fileNames.push_back(std::string(result_glob.gl_pathv[i])); + fileNames.emplace_back(result_glob.gl_pathv[i]); } globfree(&result_glob); @@ -367,7 +367,7 @@ std::string removeString(const std::string origStr, // defaults to trim stdOut std::pair executeCommand(std::string command, bool stdOut) { char buffer[128]; - std::string stdoutAndErr = ""; + std::string stdoutAndErr; bool successfulRun = true; command = "stdbuf -i0 -o0 -e0 " + command; // remove stdOut and err buffering @@ -397,14 +397,10 @@ std::pair executeCommand(std::string command, bool stdOut) { return std::make_pair(successfulRun, stdoutAndErr); } -// originalstring - string to search for substring +// originalString - string to search for substring // substring - string looking to find bool containsString(std::string originalString, std::string substring) { - if (originalString.find(substring) != std::string::npos) { - return true; - } else { - return false; - } + return (originalString.find(substring) != std::string::npos); } // Creates and stores supplied data into a temporary file (within /tmp/). @@ -415,9 +411,9 @@ bool containsString(std::string originalString, std::string substring) { // https://man7.org/linux/man-pages/man3/mkstemp.3.html // // Temporary file name format: -// ___ +// ___ // - prefix for our application's identifier (see kTmpFilePrefix) -// - name of parameter being stored +// - name of parameter being stored // - state at which the stored value captures // - device identifier // @@ -452,9 +448,8 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, close(fd); if (rc_write == -1) { return RSMI_STATUS_FILE_ERROR; - } else { - return RSMI_STATUS_SUCCESS; } + return RSMI_STATUS_SUCCESS; } std::vector getListOfAppTmpFiles() { @@ -463,16 +458,18 @@ std::vector getListOfAppTmpFiles() { struct dirent *ent; std::vector tmpFiles; - if ((dir = opendir(path.c_str())) != nullptr) { - // captures all files & directories under specified path - while ((ent = readdir(dir)) != nullptr) { - std::string fileDirName = ent->d_name; - // we only want our app specific files - if (containsString(fileDirName, kTmpFilePrefix)) { - tmpFiles.emplace_back(path + "/" + fileDirName); - } else { - continue; - } + dir = opendir(path.c_str()); + if (dir == nullptr) { + return tmpFiles; + } + // captures all files & directories under specified path + while ((ent = readdir(dir)) != nullptr) { + std::string fileDirName = ent->d_name; + // we only want our app specific files + if (containsString(fileDirName, kTmpFilePrefix)) { + tmpFiles.emplace_back(path + "/" + fileDirName); + } else { + continue; } } return tmpFiles; @@ -501,7 +498,7 @@ std::vector readEntireFile(std::string path) { std::string line; while (std::getline(inFileStream, line)) { std::istringstream ss(line); - if(line.size() > 0) { + if (!line.empty()) { fileContent.push_back(line); } } @@ -513,7 +510,7 @@ std::vector readEntireFile(std::string path) { // and their content void displayAppTmpFilesContent() { std::vector tmpFiles = getListOfAppTmpFiles(); - if (tmpFiles.empty() == false) { + if (!tmpFiles.empty()) { for (auto &x: tmpFiles) { std::string out = readFile(x); std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x @@ -529,7 +526,7 @@ void displayAppTmpFilesContent() { std::string debugVectorContent(std::vector v) { std::ostringstream ss; ss << "Vector = {"; - if (v.size() > 0) { + if (!v.empty()) { for (auto it=v.begin(); it < v.end(); it++) { ss << *it; auto temp_it = it; @@ -547,7 +544,7 @@ std::string debugVectorContent(std::vector v) { std::string displayAllDevicePaths(std::vector> v) { std::ostringstream ss; ss << "Vector = {"; - if (v.size() > 0) { + if (!v.empty()) { for (auto it=v.begin(); it < v.end(); it++) { ss << (*it)->path(); auto temp_it = it; @@ -562,7 +559,7 @@ std::string displayAllDevicePaths(std::vector> v) { } // Attempts to read application specific temporary file -// This method is to be used for reading (or determing if it exists), +// This method is to be used for reading (or determining if it exists), // in order to keep file naming scheme consistent. // // dv_ind - device index @@ -580,7 +577,7 @@ std::tuple readTmpFile(uint32_t dv_ind, "_" + std::to_string(dv_ind); std::string fileContent; std::vector tmpFiles = getListOfAppTmpFiles(); - if (tmpFiles.empty() == false) { + if (!tmpFiles.empty()) { for (auto &x: tmpFiles) { if (containsString(x, tmpFileName)) { fileContent = readFile(x); @@ -620,7 +617,11 @@ std::tuple fileContent = readEntireFile(filePath); for (auto &line: fileContent) { if (line.find("PRETTY_NAME=") != std::string::npos) { @@ -668,11 +669,17 @@ std::tupleSetUp(); test->Run(); - return; } static void RunCustomTestEpilog(TestBase *tst) { if (sRSMIGlvalues->verbosity >= TestBase::VERBOSE_STANDARD) { tst->DisplayResults(); } tst->Close(); - return; } // If the test case one big test, you should use RunGenericTest() @@ -127,7 +125,6 @@ static void RunCustomTestEpilog(TestBase *tst) { static void RunGenericTest(TestBase *test) { RunCustomTestProlog(test); RunCustomTestEpilog(test); - return; } // TEST ENTRY TEMPLATE: diff --git a/tests/rocm_smi_test/test_base.cc b/tests/rocm_smi_test/test_base.cc index a406868c63..0acce3a150 100755 --- a/tests/rocm_smi_test/test_base.cc +++ b/tests/rocm_smi_test/test_base.cc @@ -43,7 +43,7 @@ * */ -#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi_test/test_base.h" @@ -61,10 +61,9 @@ static const char kResultsLabel[] = "TEST RESULTS"; // This one is used outside this file const char kSetupLabel[] = "TEST SETUP"; -TestBase::TestBase() : setup_failed_(false), description_("") { -} -TestBase::~TestBase() { +TestBase::TestBase() : setup_failed_(false) { } +TestBase::~TestBase() = default; void TestBase::MakeHeaderStr(const char *inStr, std::string *outStr) const { @@ -116,8 +115,6 @@ void TestBase::SetUp(uint64_t init_flags) { std::cout << "No ROCm SMI tests can be run." << std::endl; } } - - return; } void TestBase::PrintDeviceHeader(uint32_t dv_ind) { @@ -213,7 +210,7 @@ void TestBase::set_description(std::string d) { size_t endlptr; for (size_t i = le; i < description_.size(); i += le) { - endlptr = description_.find_last_of(" ", i); + endlptr = description_.find_last_of(' ', i); description_.replace(endlptr, 1, "\n"); i = endlptr; } diff --git a/tests/rocm_smi_test/test_base.h b/tests/rocm_smi_test/test_base.h index d2ced349bc..c175f55c9b 100755 --- a/tests/rocm_smi_test/test_base.h +++ b/tests/rocm_smi_test/test_base.h @@ -45,6 +45,7 @@ #ifndef TESTS_ROCM_SMI_TEST_TEST_BASE_H_ #define TESTS_ROCM_SMI_TEST_TEST_BASE_H_ +#include #include class TestBase { @@ -142,9 +143,8 @@ class TestBase { "\t===> Abort is over-ridden due to dont_fail command line option." \ << std::endl; \ return; \ - } else { \ - ASSERT_EQ(RSMI_STATUS_SUCCESS, (RET)); \ } \ + ASSERT_EQ(RSMI_STATUS_SUCCESS, (RET)); \ } void MakeHeaderStr(const char *inStr, std::string *outStr); diff --git a/tests/rocm_smi_test/test_common.cc b/tests/rocm_smi_test/test_common.cc index d7a8a34d86..db6fa24098 100755 --- a/tests/rocm_smi_test/test_common.cc +++ b/tests/rocm_smi_test/test_common.cc @@ -43,13 +43,13 @@ * */ -#include -#include #include +#include +#include #include -#include #include +#include #include "rocm_smi_test/test_base.h" #include "rocm_smi_test/test_common.h" diff --git a/tests/rocm_smi_test/test_common.h b/tests/rocm_smi_test/test_common.h index e601fb0e8e..ba425cc462 100755 --- a/tests/rocm_smi_test/test_common.h +++ b/tests/rocm_smi_test/test_common.h @@ -74,7 +74,7 @@ void DumpMonitorInfo(const TestBase *test); #endif #define DISPLAY_RSMI_ERR(RET) { \ - if (RET != RSMI_STATUS_SUCCESS) { \ + if ((RET) != RSMI_STATUS_SUCCESS) { \ const char *err_str; \ std::cout << "\t===> ERROR: RSMI call returned " << (RET) << std::endl; \ rsmi_status_string((RET), &err_str); \ @@ -91,7 +91,7 @@ void DumpMonitorInfo(const TestBase *test); } \ } #define CHK_RSMI_PERM_ERR(RET) { \ - if (RET == RSMI_STATUS_PERMISSION) { \ + if ((RET) == RSMI_STATUS_PERMISSION) { \ std::cout << "This command requires root access." << std::endl; \ } else { \ DISPLAY_RSMI_ERR(RET) \ From ed6777a8e7f94c0f82f433284d3c115454481074 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Fri, 25 Aug 2023 22:25:25 -0500 Subject: [PATCH 3/6] Add GPU partition nodes * Updates: - Fixed infinit loop on systems which did not have VRAM files - Fixed concise info from throwing exception with no amdgpu driver loaded - Fix for ability to see all nodes when after switching partitions (mirrors original card display/settings) - Added to logs build type, lib path, and set env. variables Change-Id: Ic0333df355144ce2242cecea93fe4ce51caf311c Signed-off-by: Charis Poag --- include/rocm_smi/rocm_smi_kfd.h | 4 + include/rocm_smi/rocm_smi_main.h | 3 +- include/rocm_smi/rocm_smi_utils.h | 6 +- oam/CMakeLists.txt | 2 +- python_smi_tools/rocm_smi.py | 11 +- rocm_smi/CMakeLists.txt | 2 +- src/rocm_smi.cc | 41 +++++- src/rocm_smi_kfd.cc | 91 +++++++++++++- src/rocm_smi_main.cc | 194 +++++++++++++++++++++++------ src/rocm_smi_utils.cc | 100 ++++++++++++--- tests/rocm_smi_test/CMakeLists.txt | 3 +- 11 files changed, 390 insertions(+), 67 deletions(-) diff --git a/include/rocm_smi/rocm_smi_kfd.h b/include/rocm_smi/rocm_smi_kfd.h index 9cf8fd8e40..90c7f6ff3b 100755 --- a/include/rocm_smi/rocm_smi_kfd.h +++ b/include/rocm_smi/rocm_smi_kfd.h @@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_count); int ReadKFDDeviceProperties(uint32_t dev_id, std::vector *retVec); +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val); +int get_gpu_id(uint32_t node, uint64_t *gpu_id); + } // namespace smi } // namespace amd diff --git a/include/rocm_smi/rocm_smi_main.h b/include/rocm_smi/rocm_smi_main.h index f276bd85bb..8b60324988 100755 --- a/include/rocm_smi/rocm_smi_main.h +++ b/include/rocm_smi/rocm_smi_main.h @@ -113,7 +113,8 @@ class RocmSMI { uint64_t *weight); int get_node_index(uint32_t dv_ind, uint32_t *node_ind); const RocmSMI_env_vars& getEnv(void); - void printEnvVarInfo(void); + std::string getRSMIEnvVarInfo(void); + void debugRSMIEnvVarInfo(); bool isLoggingOn(void); uint32_t getLogSetting(void); static const std::map devInfoTypesStrings; diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index 5ba813a273..49a3521dc1 100755 --- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -99,13 +99,17 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type, rsmi_status_t ErrnoToRsmiStatus(int err); std::string getRSMIStatusString(rsmi_status_t ret); std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void); void logSystemDetails(void); rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); void logHexDump(const char *desc, const void *addr, const size_t len, size_t perLine); bool isSystemBigEndian(); +std::string getBuildType(); +std::string getMyLibPath(); +int subDirectoryCountInPath(const std::string path); template std::string print_int_as_hex(T i, bool showHexNotation=true) { std::stringstream ss; diff --git a/oam/CMakeLists.txt b/oam/CMakeLists.txt index dc674b0bb0..6927d245e6 100644 --- a/oam/CMakeLists.txt +++ b/oam/CMakeLists.txt @@ -72,7 +72,7 @@ target_include_directories(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_INC_LIST}) target_link_libraries(${OAM_EXAMPLE_EXE} ${OAM_TARGET}) add_library(${OAM_TARGET} ${CMN_SRC_LIST} ${OAM_SRC_LIST} ${CMN_INC_LIST} ${OAM_INC_LIST}) -target_link_libraries(${OAM_TARGET} pthread rt) +target_link_libraries(${OAM_TARGET} pthread rt dl) target_include_directories(${OAM_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${COMMON_PROJ_ROOT}/common/shared_mutex) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index d0bb5ab365..1d4c7e69a1 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -1594,7 +1594,9 @@ def showAllConcise(deviceList): printLogSpacer(' Concise Info ') deviceList.sort() - (temp_type, _) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) available_temp_type = temp_type.lower() available_temp_type = available_temp_type.replace('(', '') available_temp_type = available_temp_type.replace(')', '') @@ -1843,7 +1845,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) else: logging.debug('PCIe clock is unsupported on device[{}]'.format(device)) - printLogSpacer() + if not concise: + printLogSpacer() def showCurrentFans(deviceList): @@ -2786,7 +2789,9 @@ def getGraphColor(percentage): def showTempGraph(deviceList): deviceList.sort() - (temp_type, temp_value) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) printLogSpacer(' Temperature Graph ' + temp_type + ' ') temp_type = temp_type.lower() temp_type = temp_type.replace('(', '') diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt index ae8c017933..08b2599542 100755 --- a/rocm_smi/CMakeLists.txt +++ b/rocm_smi/CMakeLists.txt @@ -80,7 +80,7 @@ add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc") target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) add_library(${ROCM_SMI_TARGET} ${CMN_SRC_LIST} ${SMI_SRC_LIST} ${CMN_INC_LIST} ${SMI_INC_LIST}) -target_link_libraries(${ROCM_SMI_TARGET} pthread rt) +target_link_libraries(${ROCM_SMI_TARGET} pthread rt dl) target_include_directories(${ROCM_SMI_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${COMMON_PROJ_ROOT}/common/shared_mutex) diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 506e784206..d8bd892ac8 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -2991,10 +2991,24 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) { GET_DEV_AND_KFDNODE_FROM_INDX if (kfd_node->get_total_memory(total) == 0 && *total > 0) { + ss << __PRETTY_FUNCTION__ + << " | inside success fallback... " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | after fallback... " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH } @@ -3036,11 +3050,36 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, GET_DEV_AND_KFDNODE_FROM_INDX uint64_t total = 0; ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total); - if (total != 0) return ret; // do not need to fallback + if (total != 0) { + ss << __PRETTY_FUNCTION__ + << " no fallback needed! - " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); + return ret; // do not need to fallback + } if ( kfd_node->get_used_memory(used) == 0 ) { + ss << __PRETTY_FUNCTION__ + << " | in fallback == success ..." + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | at end!!!! after fallback ..." + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH diff --git a/src/rocm_smi_kfd.cc b/src/rocm_smi_kfd.cc index afe567d80d..7fe9004cc3 100755 --- a/src/rocm_smi_kfd.cc +++ b/src/rocm_smi_kfd.cc @@ -63,6 +63,7 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_logger.h" namespace amd { namespace smi { @@ -775,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, // /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties // size_in_bytes 68702699520 int KFDNode::get_total_memory(uint64_t* total) { - if (total == nullptr) return EINVAL; + std::ostringstream ss; + if (total == nullptr) { + return EINVAL; + } *total = 0; std::string f_path = kKFDNodesPathRoot; f_path += "/"; f_path += std::to_string(node_indx_); f_path += "/mem_banks"; + int subDirCount = subDirectoryCountInPath(f_path); + ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path + << " has subdirectory count = " << std::to_string(subDirCount); + LOG_DEBUG(ss); auto kfd_node_dir = opendir(f_path.c_str()); if (kfd_node_dir == nullptr) { return errno; } auto dentry = readdir(kfd_node_dir); - while (dentry != nullptr) { + while (dentry != nullptr && subDirCount > 0) { + ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path + << " has subdirectory count = " << std::to_string(subDirCount); + LOG_DEBUG(ss); if (dentry->d_name[0] == '.') { dentry = readdir(kfd_node_dir); continue; @@ -822,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) { } } } // end loop for lines in property file + subDirCount--; } // end loop for mem_bank directory if (closedir(kfd_node_dir)) { @@ -862,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) { return 1; } +// /sys/class/kfd/kfd/topology/nodes/*/properties +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val) { + std::ostringstream ss; + int retVal = EINVAL; + if (property_name.empty() || val == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", property_name is empty or *val is nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if (KFDNodeSupported(node)) { + retVal = myNode->get_property_value(property_name, val); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for property_name = " << property_name + << " | Data (" << property_name << ") * val = " + << std::to_string(*val) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." + << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + +// /sys/class/kfd/kfd/topology/nodes/*/gpu_id +int get_gpu_id(uint32_t node, uint64_t *gpu_id) { + std::ostringstream ss; + int retVal = EINVAL; + if (gpu_id == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", gpu_id is a nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if (KFDNodeSupported(node)) { + retVal = ReadKFDGpuId(node, gpu_id); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for gpu_id" + << " | Data (gpu_id) *gpu_id = " + << std::to_string(*gpu_id) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." + << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + } // namespace smi } // namespace amd diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 1eb973c17e..831e382b93 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -333,7 +333,7 @@ RocmSMI::Initialize(uint64_t flags) { GetEnvVariables(); // To help debug env variable issues - // printEnvVarInfo(); + // debugRSMIEnvVarInfo(); while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); @@ -390,7 +390,7 @@ RocmSMI::Initialize(uint64_t flags) { uint64_t bdfid = (*dev_iter)->bdfid(); if (tmp_map.find(bdfid) == tmp_map.end()) { ss << __PRETTY_FUNCTION__ << " | removing device = " - << (*dev_iter)->path(); + << (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid); dev_iter = devices_.erase(dev_iter); LOG_DEBUG(ss); continue; @@ -549,48 +549,54 @@ uint32_t RocmSMI::getLogSetting() { return this->env_vars_.logging_on; } -void RocmSMI::printEnvVarInfo(void) { - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = " - << ((env_vars_.debug_output_bitfield == 0) ? "" - : std::to_string(env_vars_.debug_output_bitfield)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = " - << ((env_vars_.path_DRM_root_override == nullptr) - ? "" : env_vars_.path_DRM_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = " - << ((env_vars_.path_HWMon_root_override == nullptr) - ? "" : env_vars_.path_HWMon_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = " - << ((env_vars_.path_power_root_override == nullptr) - ? "" : env_vars_.path_power_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = " - << ((env_vars_.debug_inf_loop == 0) ? "" - : std::to_string(env_vars_.debug_inf_loop)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " +void RocmSMI::debugRSMIEnvVarInfo(void) { + std::cout << __PRETTY_FUNCTION__ + << RocmSMI::getInstance().getRSMIEnvVarInfo(); +} + +std::string RocmSMI::getRSMIEnvVarInfo(void) { + std::ostringstream ss; + ss << "\n\tRSMI_DEBUG_BITFIELD = " + << ((env_vars_.debug_output_bitfield == 0) ? "" + : std::to_string(env_vars_.debug_output_bitfield)) + << std::endl; + ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = " + << ((env_vars_.path_DRM_root_override == nullptr) + ? "" : env_vars_.path_DRM_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = " + << ((env_vars_.path_HWMon_root_override == nullptr) + ? "" : env_vars_.path_HWMon_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = " + << ((env_vars_.path_power_root_override == nullptr) + ? "" : env_vars_.path_power_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_INFINITE_LOOP = " + << ((env_vars_.debug_inf_loop == 0) ? "" + : std::to_string(env_vars_.debug_inf_loop)) + << std::endl; + ss << "\tRSMI_LOGGING = " << getLogSetting() << std::endl; - bool isLoggingOn = RocmSMI::isLoggingOn(); - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " - << (isLoggingOn ? "true" : "false") << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; + bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; + ss << "\tRSMI_LOGGING (are logs on) = " + << (isLoggingOn ? "TRUE" : "FALSE") << std::endl; + ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {"; if (env_vars_.enum_overrides.empty()) { - std::cout << "}" << std::endl; - return; + ss << "}" << std::endl; + return ss.str(); } for (auto it=env_vars_.enum_overrides.begin(); it != env_vars_.enum_overrides.end(); ++it) { DevInfoTypes type = static_cast(*it); - std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) - + ")"); + ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")"); auto temp_it = it; if(++temp_it != env_vars_.enum_overrides.end()) { - std::cout << ", "; + ss << ", "; } } - std::cout << "}" << std::endl; + ss << "}" << std::endl; + return ss.str(); } std::shared_ptr @@ -692,8 +698,7 @@ static bool isAMDGPU(std::string dev_path) { std::string vend_path = dev_path + "/device/vendor"; if (!FileExists(vend_path.c_str())) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -703,8 +708,7 @@ static bool isAMDGPU(std::string dev_path) { if (!fs.is_open()) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -719,8 +723,7 @@ static bool isAMDGPU(std::string dev_path) { isAmdGpu = true; } ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -728,6 +731,7 @@ static bool isAMDGPU(std::string dev_path) { uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { std::string err_msg; uint32_t count = 0; + std::ostringstream ss; // If this gets called more than once, clear previous findings. devices_.clear(); @@ -754,17 +758,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { } dentry = readdir(drm_dir); } + ss << __PRETTY_FUNCTION__ << " | Discovered a potential of " + << std::to_string(count) << " cards" << " | "; + LOG_DEBUG(ss); + struct systemNode { + uint32_t s_node_id = 0; + uint64_t s_gpu_id = 0; + uint64_t s_unique_id = 0; + }; + // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id} + std::multimap allSystemNodes; + uint32_t node_id = 0; + while (true) { + uint64_t gpu_id = 0, unique_id = 0; + int ret_gpu_id = get_gpu_id(node_id, &gpu_id); + int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); + if (ret_gpu_id == 0 || ret_unique_id == 0) { + systemNode myNode; + myNode.s_node_id = node_id; + myNode.s_gpu_id = gpu_id; + myNode.s_unique_id = unique_id; + if(gpu_id != 0) { // only add gpu nodes, 0 = CPU + allSystemNodes.emplace(unique_id, myNode); + } + } else { + break; + } + node_id++; + } + + ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {"; + for(auto i: allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], " + ; + } + ss << "}"; + LOG_DEBUG(ss); + + // Discover all root cards & gpu partitions associated with each for (uint32_t node_id = 0; node_id < count; node_id++) { std::string path = kPathDRMRoot; path += "/card"; path += std::to_string(node_id); + uint64_t primary_unique_id = 0; + + // each identified gpu card node is a primary node for + // potential matching unique ids if (isAMDGPU(path) || (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { std::string d_name = "card"; d_name += std::to_string(node_id); AddToDeviceList(d_name); - } + + ss << __PRETTY_FUNCTION__ + << " | Ordered system nodes seen in lookup = {"; + for (auto i : allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], "; + } + ss << "}"; + LOG_DEBUG(ss); + + uint64_t temp_primary_unique_id = 0; + if (allSystemNodes.empty()) { + continue; + } + + // get lowest key 1st to keep order of nodes matching card + uint32_t lowest_NodeId = 0; + uint32_t curr_NodeId = 0; + + for (auto it = allSystemNodes.begin(), end = allSystemNodes.end(); + it != end; it = allSystemNodes.upper_bound(it->first)) { + curr_NodeId = it->second.s_node_id; + if (it == allSystemNodes.begin()) { + lowest_NodeId = it->second.s_node_id; + } + if (curr_NodeId <= lowest_NodeId) { + lowest_NodeId = curr_NodeId; + temp_primary_unique_id = it->second.s_unique_id; + } + } + ss << __PRETTY_FUNCTION__ + << " | lowest_NodeId = " << std::to_string(lowest_NodeId) + << " | curr_NodeId = " << std::to_string(curr_NodeId) + << " | temp_primary_unique_id = " + << std::to_string(temp_primary_unique_id); + LOG_DEBUG(ss); + + if (temp_primary_unique_id != 0) { + primary_unique_id = temp_primary_unique_id; + } else { + allSystemNodes.erase(primary_unique_id); + continue; + } + + auto numb_nodes = allSystemNodes.count(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = " + << std::to_string(primary_unique_id) << " has " + << std::to_string(numb_nodes) << " known gpu nodes"; + LOG_DEBUG(ss); + while (numb_nodes > 1) { + std::string secNode = "card"; + secNode += std::to_string(node_id); // add the primary node id + AddToDeviceList(secNode); + numb_nodes--; + } + // remove already added nodes associated with current card + auto erasedNodes = allSystemNodes.erase(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = " + << std::to_string(primary_unique_id) << " erased " + << std::to_string(erasedNodes) << " nodes"; + LOG_DEBUG(ss); + } } if (closedir(drm_dir)) { diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 973d555d26..1e9a444320 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -40,12 +40,17 @@ * DEALINGS WITH THE SOFTWARE. * */ - +#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see + // _GNU_SOURCE functions which check +#include +#include +#include +#include #include #include #include #include -#include +#include #include #include @@ -612,7 +617,8 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) // Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void) { struct utsname buf; bool errorDetected = false; @@ -625,6 +631,9 @@ std::tupled_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) { + continue; + } + + if (fstatat(dirfd(srcdir), dent->d_name, &st, 0) < 0) { + perror(dent->d_name); + continue; + } + + if (S_ISDIR(st.st_mode)) { + dir_count++; + } + } + closedir(srcdir); + return dir_count; +} } // namespace smi } // namespace amd diff --git a/tests/rocm_smi_test/CMakeLists.txt b/tests/rocm_smi_test/CMakeLists.txt index 9c8ed197dc..b2347e0bff 100755 --- a/tests/rocm_smi_test/CMakeLists.txt +++ b/tests/rocm_smi_test/CMakeLists.txt @@ -67,7 +67,8 @@ target_link_libraries( PUBLIC GTest::gtest_main PUBLIC c PUBLIC stdc++ - PUBLIC pthread) + PUBLIC pthread + PUBLIC dl) install(TARGETS ${RSMITST} gtest gtest_main DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests From 41ade41d8467eadc37ab60c4a423a1ece9a65449 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 11 Sep 2023 19:23:46 -0500 Subject: [PATCH 4/6] SWDEV-409184 - Fix erroneous 'not supported' when HWMON is absent Change-Id: Ic5ff406977d962fadc709a03853dac61b5460a26 Signed-off-by: Galantsev, Dmitrii --- src/rocm_smi_device.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 95f27d8e12..b01734f0b1 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -1101,10 +1101,6 @@ void Device::fillSupportedFuncs(void) { if (!supported_funcs_.empty()) { return; } - if (monitor() == nullptr) { - return; - } - std::map::const_iterator it = kDevFuncDependsMap.begin(); std::string dev_rt = path_ + "/device"; @@ -1160,7 +1156,9 @@ void Device::fillSupportedFuncs(void) { it++; } - monitor()->fillSupportedFuncs(&supported_funcs_); + if (monitor() != nullptr) { + monitor()->fillSupportedFuncs(&supported_funcs_); + } // DumpSupportedFunctions(); } From ff992e9b56669fc2897b24567d8ae9c2f14dd5be Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 11 Sep 2023 19:43:25 -0500 Subject: [PATCH 5/6] TESTS - re-enable frequency tests on aqua_vanjaram Change-Id: I8fcd9418da5b973897ccfffc7d8a2f3ea833ea77 Signed-off-by: Galantsev, Dmitrii --- tests/rocm_smi_test/rsmitst.exclude | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/rocm_smi_test/rsmitst.exclude b/tests/rocm_smi_test/rsmitst.exclude index 43a738ab22..0632242f93 100644 --- a/tests/rocm_smi_test/rsmitst.exclude +++ b/tests/rocm_smi_test/rsmitst.exclude @@ -65,8 +65,6 @@ $BLACKLIST_ALL_ASICS\ FILTER[90400]=\ $BLACKLIST_ALL_ASICS\ "rsmitstReadOnly.TestVoltCurvRead:"\ -"rsmitstReadOnly.TestFrequenciesRead:"\ -"rsmitstReadWrite.TestFrequenciesReadWrite:"\ "rsmitstReadWrite.TestPowerReadWrite" FILTER[90401]=${FILTER[90400]} FILTER[90402]=${FILTER[90400]} From 4acfb00ad59782cb011e19a05c15e37730af1f7c Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 7 Sep 2023 16:20:30 -0500 Subject: [PATCH 6/6] PY: Silence error output when printing concise info Change-Id: I9ce4ad523b3fe2ec8afc5bea791810ec67558f11 Signed-off-by: Galantsev, Dmitrii --- python_smi_tools/rocm_smi.py | 181 +++++++++++++++++++++-------------- 1 file changed, 107 insertions(+), 74 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 1d4c7e69a1..4a943a66a4 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -173,10 +173,12 @@ def formatMatrixToJSON(deviceList, matrix, metricName): printSysLog(metricName.format(deviceList[row_indx], deviceList[col_ind]), valueStr) -def getBus(device): +def getBus(device, silent=False): """ Return the bus identifier of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ bdfid = c_uint64(0) ret = rocmsmi.rsmi_dev_pci_id_get(device, byref(bdfid)) @@ -188,16 +190,18 @@ def getBus(device): function = bdfid.value & 0x7 pic_id = '{:04X}:{:02X}:{:02X}.{:0X}'.format(domain, bus, device, function) - if rsmi_ret_ok(ret, device, 'get_pci_id'): + if rsmi_ret_ok(ret, device, 'get_pci_id', silent): return pic_id -def getFanSpeed(device): +def getFanSpeed(device, silent=True): """ Return a tuple with the fan speed (value,%) for a specified device, or (None,None) if either current fan speed or max fan speed cannot be obtained @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ fanLevel = c_int64() fanMax = c_int64() @@ -209,7 +213,7 @@ def getFanSpeed(device): /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX """ ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel)) - if rsmi_ret_ok(ret, device, 'get_fan_speed', True): + if rsmi_ret_ok(ret, device, 'get_fan_speed', silent): fl = fanLevel.value last_ret = ret @@ -217,7 +221,7 @@ def getFanSpeed(device): /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX """ ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax)) - if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True): + if rsmi_ret_ok(ret, device, 'get_fan_max_speed', silent): fm = fanMax.value """ In case we had an error before, we don't overwrite it with a @@ -232,59 +236,67 @@ def getFanSpeed(device): return (last_ret, fl, round((float(fl) / float(fm)) * 100, 2)) -def getGpuUse(device): +def getGpuUse(device, silent=False): """ Return the current GPU usage as a percentage @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ percent = c_uint32() ret = rocmsmi.rsmi_dev_busy_percent_get(device, byref(percent)) - if rsmi_ret_ok(ret, device, 'GPU Utilization '): + if rsmi_ret_ok(ret, device, 'GPU Utilization ', silent): return percent.value return -1 -def getId(device): +def getId(device, silent=False): """ Return the hexadecimal value of a device's ID @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ dv_id = c_short() ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id)) - if rsmi_ret_ok(ret, device, 'get_device_id'): + if rsmi_ret_ok(ret, device, 'get_device_id', silent): return hex(dv_id.value) -def getRev(device): +def getRev(device, silent=False): """ Return the hexadecimal value of a device's Revision @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ dv_rev = c_short() ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev)) - if rsmi_ret_ok(ret, device, 'get_device_rev'): + if rsmi_ret_ok(ret, device, 'get_device_rev', silent): return hex(dv_rev.value) -def getMaxPower(device): +def getMaxPower(device, silent=False): """ Return the maximum power cap of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ power_cap = c_uint64() ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap)) - if rsmi_ret_ok(ret, device, 'get_power_cap'): + if rsmi_ret_ok(ret, device, 'get_power_cap', silent): return power_cap.value / 1000000 return -1 -def getMemInfo(device, memType, quiet=False): +def getMemInfo(device, memType, silent=False): """ Returns a tuple of (memory_used, memory_total) of the requested memory type usage for the device specified @param device: DRM device identifier @param type: [vram|vis_vram|gtt] Memory type to return - @param quiet=Turn on to silience error output + @param silent=Turn on to silence error output (you plan to handle manually). Default is off, which exposes any issue accessing the different memory types. @@ -300,11 +312,11 @@ def getMemInfo(device, memType, quiet=False): memTotal = None ret = rocmsmi.rsmi_dev_memory_usage_get(device, memory_type_l.index(memType), byref(memoryUse)) - if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), quiet): + if rsmi_ret_ok(ret, device, 'get_memory_usage_' + str(memType), silent): memUsed = memoryUse.value ret = rocmsmi.rsmi_dev_memory_total_get(device, memory_type_l.index(memType), byref(memoryTot)) - if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), quiet): + if rsmi_ret_ok(ret, device, 'get_memory_total_' + str(memType), silent): memTotal = memoryTot.value return (memUsed, memTotal) @@ -334,14 +346,16 @@ def getProcessName(pid): return pName -def getPerfLevel(device): +def getPerfLevel(device, silent=False): """ Return the current performance level of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ perf = rsmi_dev_perf_level_t() ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf)) - if rsmi_ret_ok(ret, device, 'get_perf_level'): + if rsmi_ret_ok(ret, device, 'get_perf_level', silent): return perf_level_string(perf.value) return 'N/A' @@ -369,42 +383,48 @@ def getPidList(): return -def getPower(device): +def getPower(device, silent=False): """ Return the current power level of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ power = c_uint32() ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power)) - if rsmi_ret_ok(ret, device, 'get_power_avg'): + if rsmi_ret_ok(ret, device, 'get_power_avg', silent): return power.value / 1000000 return 'N/A' -def getRasEnablement(device, block): +def getRasEnablement(device, block, silent=True): """ Return RAS enablement state for a given device @param device: DRM device identifier @param block: RAS block identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ state = rsmi_ras_err_state_t() ret = rocmsmi.rsmi_dev_ecc_status_get(device, rsmi_gpu_block_d[block], byref(state)) - if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), True): + if rsmi_ret_ok(ret, device, 'get_ecc_status_' + str(block), silent): return rsmi_ras_err_stale_machine[state.value].upper() return 'N/A' -def getTemp(device, sensor): +def getTemp(device, sensor, silent=True): """ Display the current temperature from a given device's sensor @param device: DRM device identifier @param sensor: Temperature sensor identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ temp = c_int64(0) metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), temp_type_lst.index(sensor), metric, byref(temp)) - if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), True): + if rsmi_ret_ok(ret, device, 'get_temp_metric' + str(sensor), silent): return temp.value / 1000 return 'N/A' @@ -428,52 +448,60 @@ def findFirstAvailableTemp(device): continue return (ret_temp_type, ret_temp) -def getVbiosVersion(device): +def getVbiosVersion(device, silent=False): """ Returns the VBIOS version for a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ vbios = create_string_buffer(256) ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256) if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: return "Unsupported" - elif rsmi_ret_ok(ret, device): + elif rsmi_ret_ok(ret, device, silent=silent): return vbios.value.decode() -def getVersion(deviceList, component): +def getVersion(deviceList, component, silent=False): """ Return the software version for the specified component @param deviceList: List of DRM devices (can be a single-item list) @param component: Component (currently only driver) + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ ver_str = create_string_buffer(256) ret = rocmsmi.rsmi_version_str_get(component, ver_str, 256) - if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component)): + if rsmi_ret_ok(ret, None, 'get_version_str_' + str(component), silent): return ver_str.value.decode() return None -def getComputePartition(device): +def getComputePartition(device, silent=True): """ Return the current compute partition of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ currentComputePartition = create_string_buffer(256) ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256) - if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode(): + if rsmi_ret_ok(ret, device, 'get_compute_partition', silent) and currentComputePartition.value.decode(): return str(currentComputePartition.value.decode()) return "N/A" -def getMemoryPartition(device): +def getMemoryPartition(device, silent=True): """ Return the current memory partition of a given device @param device: DRM device identifier + @param silent=Turn on to silence error output + (you plan to handle manually). Default is on. """ currentNPSMode = create_string_buffer(256) ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256) - if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode(): + if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent) and currentNPSMode.value.decode(): return str(currentNPSMode.value.decode()) return "N/A" @@ -1591,6 +1619,7 @@ def showAllConcise(deviceList): MAX_ALL_CONCISE_WIDTH = 100 appWidth_temp = appWidth appWidth = MAX_ALL_CONCISE_WIDTH + silent = True printLogSpacer(' Concise Info ') deviceList.sort() @@ -1618,9 +1647,9 @@ def showAllConcise(deviceList): values = {} degree_sign = u'\N{DEGREE SIGN}' for device in deviceList: - gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info = getDevProductInfo(device, silent) gpu_dev_product_info_names = list(gpu_dev_product_info[device]) - temp_val = str(getTemp(device, available_temp_type)) + temp_val = str(getTemp(device, available_temp_type, silent)) if temp_val != 'N/A': temp_val += degree_sign + 'C' avgPwr = str(getPower(device)) @@ -1628,26 +1657,25 @@ def showAllConcise(deviceList): avgPwr += 'W' else: avgPwr = 'N/A' - combined_partition = (getMemoryPartition(device) + ", " - + getComputePartition(device)) - concise = True - sclk = showCurrentClocks([device], 'sclk', concise) - mclk = showCurrentClocks([device], 'mclk', concise) - (retCode, fanLevel, fanSpeed) = getFanSpeed(device) + combined_partition = (getMemoryPartition(device, silent) + ", " + + getComputePartition(device, silent)) + sclk = showCurrentClocks([device], 'sclk', concise=silent) + mclk = showCurrentClocks([device], 'mclk', concise=silent) + (retCode, fanLevel, fanSpeed) = getFanSpeed(device, silent) fan = str(fanSpeed) + '%' - if getPerfLevel(device) != -1: - perf = getPerfLevel(device) + if getPerfLevel(device, silent) != -1: + perf = getPerfLevel(device, silent) else: perf = 'Unsupported' - if getMaxPower(device) != -1: - pwrCap = str(getMaxPower(device)) + 'W' + if getMaxPower(device, silent) != -1: + pwrCap = str(getMaxPower(device, silent)) + 'W' else: pwrCap = 'Unsupported' - if getGpuUse(device) != -1: - gpu_busy = str(getGpuUse(device)) + '%' + if getGpuUse(device, silent) != -1: + gpu_busy = str(getGpuUse(device, silent)) + '%' else: gpu_busy = 'Unsupported' - vram_used, vram_total = getMemInfo(device, 'vram', True) + vram_used, vram_total = getMemInfo(device, 'vram', silent) mem_use_pct = 0 if vram_used is None: mem_use_pct='Unsupported' @@ -1681,7 +1709,7 @@ def showAllConcise(deviceList): for device in deviceList: printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), values['card%s' % (str(device))])), None) - gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info = getDevProductInfo(device, silent) gpu_dev_product_info_names = list(gpu_dev_product_info[device]) if (len(gpu_dev_product_info_names) > 1): printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in @@ -1705,19 +1733,20 @@ def showAllConciseHw(deviceList): header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS'] head_widths = [len(head) + 2 for head in header] values = {} + silent = True for device in deviceList: - gpuid = getId(device) + gpuid = getId(device, silent) if str(gpuid).startswith('0x'): gpuid = str(gpuid)[2:] - gpurev = getRev(device) + gpurev = getRev(device, silent) if str(gpurev).startswith('0x'): gpurev = str(gpurev)[2:] - gfxRas = getRasEnablement(device, 'GFX') - sdmaRas = getRasEnablement(device, 'SDMA') - umcRas = getRasEnablement(device, 'UMC') - vbios = getVbiosVersion(device) - bus = getBus(device) + gfxRas = getRasEnablement(device, 'GFX', silent) + sdmaRas = getRasEnablement(device, 'SDMA', silent) + umcRas = getRasEnablement(device, 'UMC', silent) + vbios = getVbiosVersion(device, silent) + bus = getBus(device, silent) values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus] val_widths = {} for device in deviceList: @@ -1812,8 +1841,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): if concise: # in case function is used for concise output, no need to print. return '{:.0f}Mhz'.format(fr) printLog(device, '{} clock level'.format(clk_defined), '{} ({:.0f}Mhz)'.format(levl, fr)) - else: - printErrLog(device, '%s clock is unsupported' % (clk_defined)) + elif not concise: + logging.debug('{} clock is unsupported on device[{}]'.format(clk_defined, device)) else: # if clk is not defined, will display all current clk for clk_type in sorted(rsmi_clk_names_dict): @@ -1830,7 +1859,7 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): printLog(device, '%s clock level:' % (clk_type), levl) else: printLog(device, '%s clock level: %s' % (clk_type, levl), '(%sMhz)' % (str(fr)[:-2])) - else: + elif not concise: logging.debug('{} clock is unsupported on device[{}]'.format(clk_type, device)) # pcie clocks if rocmsmi.rsmi_dev_pci_bandwidth_get(device, None) == 1: @@ -1843,8 +1872,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): fr = '{:.1f}GT/s x{}'.format(bw.transfer_rate.frequency[current_f] / 1000000000, bw.lanes[current_f]) printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) - else: - logging.debug('PCIe clock is unsupported on device[{}]'.format(device)) + elif not concise: + logging.debug('{} clock is unsupported on device[{}]'.format('PCIe', device)) if not concise: printLogSpacer() @@ -2403,47 +2432,51 @@ def showProductName(deviceList): printLogSpacer() -def getDevProductInfo(device): +def getDevProductInfo(device, silent=False): """ Show the requested product name for the device requested @param device: Device we want to get the info for + @param silent=Turn on to silence error output + (you plan to handle manually). Default is off. """ # Retrieve card vendor MAX_BUFF_SIZE = 256 MAX_DESC_SIZE = 20 - device_info = "N/A" + device_series = "N/A" + device_model = "N/A" + gpu_revision = "N/A" device_list = {} vendor = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, MAX_BUFF_SIZE) # Only continue if GPU vendor is AMD - if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device): + if rsmi_ret_ok(ret, device, 'get_vendor_name', silent) and isAmdDevice(device): # Retrieve the device series series = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_name_get(device, series, MAX_BUFF_SIZE) - if rsmi_ret_ok(ret, device, 'get_name'): + if rsmi_ret_ok(ret, device, 'get_name', silent): try: device_series = series.value.decode() except UnicodeDecodeError: - device_series = "N/A" - printErrLog(device, "Unable to read card series") + if not silent: + printErrLog(device, "Unable to read card series") # Retrieve the device model model = create_string_buffer(MAX_BUFF_SIZE) ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, MAX_BUFF_SIZE) - if rsmi_ret_ok(ret, device, 'get_subsystem_name'): + if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent): try: device_model = model.value.decode() device_model = padHexValue(device_model, 4) except UnicodeDecodeError: - device_model = "N/A" - printErrLog(device, "Unable to read device model") + if not silent: + printErrLog(device, "Unable to read device model") try: gpu_revision = padHexValue(getRev(device), 2) except Exception as exc: - gpu_revision = "N/A" - printErrLog(device, "Unable to read card revision %s" % (exc)) + if not silent: + printErrLog(device, "Unable to read card revision %s" % (exc)) device_series_str = str(device_series[:MAX_DESC_SIZE]) device_series_str = device_series_str.ljust(MAX_DESC_SIZE, ' ') @@ -3382,7 +3415,7 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): @param my_ret: Return of RSMI call (rocm_smi_lib API) @param metric: Parameter of GPU currently being analyzed @param silent: Echo verbose error reponse. - True siliences err output, False does not silience err output (default). + True silences err output, False does not silence err output (default). """ global RETCODE global PRINT_JSON @@ -3399,8 +3432,8 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): if err_str.value is not None: returnString += '%s\t' % (err_str.value.decode()) if not PRINT_JSON: - logging.debug('%s', returnString) if not silent: + logging.debug('%s', returnString) if my_ret in rsmi_status_verbose_err_out: printLog(device, metric + ", " + rsmi_status_verbose_err_out[my_ret], None) RETCODE = my_ret