From 48131e94819cb05c90a9a5a01f0ede327349a1f8 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Thu, 2 May 2024 15:27:16 -0500 Subject: [PATCH] Revert "SWDEV-458102 - Deprecated Voltage Curve API" This reverts commit cc8eb1775976252e554feb6cc1690a7fe4328b8c. Change-Id: I8a3eaf0a9f28200e09fb35d5260fbc070fe8a4a9 [ROCm/amdsmi commit: 11c72946eb114683538ad03861594524df8ee0c8] --- projects/amdsmi/CHANGELOG.md | 52 +-- projects/amdsmi/amdsmi_cli/README.md | 3 +- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 38 +- projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 2 + projects/amdsmi/include/amd_smi/amdsmi.h | 13 +- projects/amdsmi/py-interface/README.md | 4 +- .../rocm_smi/include/rocm_smi/rocm_smi.h | 6 +- .../include/rocm_smi/rocm_smi_utils.h | 330 +----------------- projects/amdsmi/rocm_smi/src/rocm_smi.cc | 159 +++++---- .../amdsmi/rocm_smi/src/rocm_smi_utils.cc | 9 +- .../functional/mutual_exclusion.cc | 8 +- .../functional/volt_freq_curv_read.cc | 22 +- 12 files changed, 192 insertions(+), 454 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index ad3eba258e..6d7275eb7a 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -4,7 +4,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/]( ***All information listed below is for reference and subject to change.*** -## amd_smi_lib for ROCm 6.2 (Unreleased) +## amd_smi_lib for ROCm 6.1.2 ### Added @@ -42,11 +42,22 @@ GPU: 1 - **`amdsmi_get_power_cap_info` now returns values in uW instead of W** `amdsmi_get_power_cap_info` will return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW). Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same). +- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** +Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. + ### Optimizations -- N/A +- **Updated `amd-smi monitor --pcie` output** +The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: + +```shell +$ amd-smi monitor --pcie +GPU PCIE_BW + 0 26 Mb/s +``` ### Fixed + - **Fixed `amd-smi metric --power` now provides power output for Navi2x/Navi3x/MI1x** These systems use an older version of gpu_metrics in amdgpu. This fix only updates what CLI outputs. No change in any of our APIs. @@ -70,45 +81,13 @@ GPU: 1 POWER_MANAGEMENT: ENABLED THROTTLE_STATUS: UNTHROTTLED ``` + - **Fixed `amdsmitstReadWrite.TestPowerCapReadWrite` test for Navi3X, Navi2X, MI100** Updates required `amdsmi_get_power_cap_info` to return in uW as originally reflected by driver. Previously `amdsmi_get_power_cap_info` returned W values, this conflicts with our sets and modifies values retrieved from driver. We decided to keep the values returned from driver untouched (in original units, uW). Then in CLI we will convert to watts (as previously done - no changes here). Additionally, driver made updates to min power cap displayed for devices when overdrive is disabled which prompted for this change (in this case min_power_cap and max_power_cap are the same). - -## amd_smi_lib for ROCm 6.1.2 - -### Added - -- **Updated Python Library return types for amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** -Previously calls were returning "No bad pages found." if no pages were found, now it only returns the list type and can be empty. - -### Changed - -- **Deprecated Volt Curve APIs** -The latest amdgpu driver has dropped support for getting and setting volt curve information. amdsmi_set_gpu_od_volt_info() & amdsmi_get_gpu_od_volt_curve_regions() have been deprecated with amdsmi_get_gpu_od_volt_info() now no longer populating voltage curve frequencies. - -- **Removed `amd-smi metric --voltage-curve` from CLI Tool** -Due to amdgpu driver dropping support for voltage curve, the CLI option has been removed as well. - -### Optimizations - -- **Updated `amd-smi monitor --pcie` output** -The source for pcie bandwidth monitor output was a legacy file we no longer support and was causing delays within the monitor command. The output is no longer using TX/RX but instantaneous bandwidth from gpu_metrics instead; updated output: - -```shell -$ amd-smi monitor --pcie -GPU PCIE_BW - 0 26 Mb/s -``` - -### Fixed - - **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. -### Known issues - -- None - ## amd_smi_lib for ROCm 6.1.1 ### Added @@ -428,9 +407,6 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json ### Fixed -- **Fixed python interface call amdsmi_get_gpu_memory_reserved_pages & amdsmi_get_gpu_bad_page_info** -Previously python interface calls to populated bad pages resulted in a `ValueError: NULL pointer access`. This fixes the bad-pages subcommand CLI subcommand as well. - - **Fix for GPU reset error on non-amdgpu cards** Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. diff --git a/projects/amdsmi/amdsmi_cli/README.md b/projects/amdsmi/amdsmi_cli/README.md index 7b2252911e..affc7a476b 100644 --- a/projects/amdsmi/amdsmi_cli/README.md +++ b/projects/amdsmi/amdsmi_cli/README.md @@ -269,7 +269,7 @@ Command Modifiers: ~$ amd-smi metric --help usage: amd-smi metric [-h] [-g GPU [GPU ...] | -U CPU [CPU ...] | -O CORE [CORE ...]] [-w INTERVAL] [-W TIME] [-i ITERATIONS] [-m] [-u] [-p] [-c] [-t] - [-P] [-e] [-k] [-f] [-o] [-l] [-x] [-E] [--cpu-power-metrics] + [-P] [-e] [-k] [-f] [-C] [-o] [-l] [-x] [-E] [--cpu-power-metrics] [--cpu-prochot] [--cpu-freq-metrics] [--cpu-c0-res] [--cpu-lclk-dpm-level NBIOID] [--cpu-pwr-svi-telemtry-rails] [--cpu-io-bandwidth IO_BW LINKID_NAME] @@ -313,6 +313,7 @@ Metric arguments: -e, --ecc Total number of ECC errors -k, --ecc-blocks Number of ECC errors per block -f, --fan Current fan speed + -C, --voltage-curve Display voltage curve -o, --overdrive Current GPU clock overdrive level -l, --perf-level Current DPM performance level -x, --xgmi-err XGMI error information since last read diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 6cf63f9b92..460a0f84d3 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -1115,7 +1115,7 @@ class AMDSMICommands(): def metric_gpu(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, - fan=None, overdrive=None, perf_level=None, + fan=None, voltage_curve=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None,): """Get Metric information for target gpu @@ -1136,6 +1136,7 @@ class AMDSMICommands(): ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. pcie (bool, optional): Value override for args.pcie. Defaults to None. fan (bool, optional): Value override for args.fan. Defaults to None. + voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. overdrive (bool, optional): Value override for args.overdrive. Defaults to None. perf_level (bool, optional): Value override for args.perf_level. Defaults to None. xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. @@ -1194,6 +1195,8 @@ class AMDSMICommands(): if self.helpers.is_baremetal() and self.helpers.is_linux(): if fan: args.fan = fan + if voltage_curve: + args.voltage_curve = voltage_curve if overdrive: args.overdrive = overdrive if perf_level: @@ -1202,8 +1205,8 @@ class AMDSMICommands(): args.xgmi_err = xgmi_err if energy: args.energy = energy - current_platform_args += ["fan", "overdrive", "perf_level", "xgmi_err", "energy"] - current_platform_values += [args.fan, args.overdrive, args.perf_level, args.xgmi_err, args.energy] + current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy"] + current_platform_values += [args.fan, args.voltage_curve, args.overdrive, args.perf_level, args.xgmi_err, args.energy] if self.helpers.is_hypervisor(): if schedule: @@ -1785,6 +1788,26 @@ class AMDSMICommands(): logging.debug("Failed to get fan rpms for gpu %s | %s", args.gpu, e.get_error_info()) values_dict["fan"] = fan_dict + if "voltage_curve" in current_platform_args: + if args.voltage_curve: + try: + od_volt = amdsmi_interface.amdsmi_get_gpu_od_volt_info(args.gpu) + + voltage_point_dict = {} + + for point in range(3): + if isinstance(od_volt, dict): + frequency = int(od_volt["curve.vc_points"][point].frequency / 1000000) + voltage = int(od_volt["curve.vc_points"][point].voltage) + else: + frequency = 0 + voltage = 0 + voltage_point_dict[f'voltage_point_{point}'] = f"{frequency} Mhz {voltage} mV" + + values_dict['voltage_curve'] = voltage_point_dict + except amdsmi_exception.AmdSmiLibraryException as e: + values_dict['voltage_curve'] = "N/A" + logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info()) if "overdrive" in current_platform_args: if args.overdrive: try: @@ -2300,7 +2323,7 @@ class AMDSMICommands(): def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, - fan=None, overdrive=None, perf_level=None, + fan=None, voltage_curve=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, guard=None, guest_data=None, fb_usage=None, xgmi=None, cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None, @@ -2329,6 +2352,7 @@ class AMDSMICommands(): ecc_blocks (bool, optional): Value override for args.ecc. Defaults to None. pcie (bool, optional): Value override for args.pcie. Defaults to None. fan (bool, optional): Value override for args.fan. Defaults to None. + voltage_curve (bool, optional): Value override for args.voltage_curve. Defaults to None. overdrive (bool, optional): Value override for args.overdrive. Defaults to None. perf_level (bool, optional): Value override for args.perf_level. Defaults to None. xgmi_err (bool, optional): Value override for args.xgmi_err. Defaults to None. @@ -2382,7 +2406,7 @@ class AMDSMICommands(): # Check if a GPU argument has been set gpu_args_enabled = False gpu_attributes = ["usage", "watch", "watch_time", "iterations", "power", "clock", - "temperature", "ecc", "ecc_blocks", "pcie", "fan", + "temperature", "ecc", "ecc_blocks", "pcie", "fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy", "mem_usage", "schedule", "guard", "guest_data", "fb_usage", "xgmi"] for attr in gpu_attributes: @@ -2455,7 +2479,7 @@ class AMDSMICommands(): self.metric_gpu(args, multiple_devices, watching_output, gpu, usage, watch, watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, - fan, overdrive, perf_level, + fan, voltage_curve, overdrive, perf_level, xgmi_err, energy, mem_usage, schedule, guard, guest_data, fb_usage, xgmi) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized @@ -2490,7 +2514,7 @@ class AMDSMICommands(): self.metric_gpu(args, multiple_devices, watching_output, gpu, usage, watch, watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, - fan, overdrive, perf_level, + fan, voltage_curve, overdrive, perf_level, xgmi_err, energy, mem_usage, schedule) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index af22db7137..4b11188b03 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -699,6 +699,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Linux Baremetal platforms fan_help = "Current fan speed" + vc_help = "Display voltage curve" overdrive_help = "Current GPU clock overdrive level" perf_level_help = "Current DPM performance level" xgmi_err_help = "XGMI error information since last read" @@ -769,6 +770,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args for Linux Baremetal Systems if self.helpers.is_baremetal() and self.helpers.is_linux(): metric_parser.add_argument('-f', '--fan', action='store_true', required=False, help=fan_help) + metric_parser.add_argument('-C', '--voltage-curve', action='store_true', required=False, help=vc_help) metric_parser.add_argument('-o', '--overdrive', action='store_true', required=False, help=overdrive_help) metric_parser.add_argument('-l', '--perf-level', action='store_true', required=False, help=perf_level_help) metric_parser.add_argument('-x', '--xgmi-err', action='store_true', required=False, help=xgmi_err_help) diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 80b8417dd3..dcfdfcf7e6 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -1253,13 +1253,12 @@ typedef struct { typedef struct { amdsmi_range_t curr_sclk_range; //!< The current SCLK frequency range amdsmi_range_t curr_mclk_range; //!< The current MCLK frequency range; - //!< (upper bound only) + //!< (upper bound only) amdsmi_range_t sclk_freq_limits; //!< The range possible of SCLK values amdsmi_range_t mclk_freq_limits; //!< The range possible of MCLK values /** * @brief The current voltage curve - * @deprecated ::Voltage curve support has been deprecated by the driver */ amdsmi_od_volt_curve_t curve; uint32_t num_regions; //!< The number of voltage curve regions @@ -2966,7 +2965,7 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle); /** - * @brief This function retrieves the overdrive GFX & MCLK information. It is + * @brief This function retrieves the voltage/frequency curve information. It is * not supported on virtual machine guest * * @platform{gpu_bm_linux} @@ -3167,9 +3166,6 @@ amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_han * * @platform{gpu_bm_linux} * - * @deprecated ::Voltage curve information is no longer supported by the - * amdgpu driver; this includes the ability to set voltage curve regions - * * @details Given a processor handle @p processor_handle, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -3196,9 +3192,6 @@ amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_ha * * @platform{gpu_bm_linux} * - * @deprecated ::Voltage curve information is no longer supported by the - * amdgpu driver; this includes the number of valid voltage regions - * * @details Given a processor handle @p processor_handle, a pointer to an unsigned integer * @p num_regions and a buffer of ::amdsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current @@ -3509,7 +3502,7 @@ amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle process * @platform{gpu_bm_linux} @platform{guest_1vf} * * @details Given a processor handle @p processor_handle, and a sclean flag @p sclean, - * this function will clear the SRAM data of this processor. This can be called between + * this function will clear the SRAM data of this processor. This can be called between * user logins to prevent information leak. * * @note This function requires root access diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index e49a5699f4..6bbe75be7f 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -1591,7 +1591,7 @@ except AmdSmiException as e: ### amdsmi_set_gpu_od_clk_info -Description: **deprecated** This function sets the clock frequency information +Description: This function sets the clock frequency information It is not supported on virtual machine guest Input parameters: @@ -2306,7 +2306,7 @@ except AmdSmiException as e: ### amdsmi_get_gpu_od_volt_curve_regions -Description: **deprecated** This function will retrieve the current valid regions in the +Description: This function will retrieve the current valid regions in the frequency/voltage space. It is not supported on virtual machine guest Input parameters: diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 8797cf1b5f..0fafa31c8f 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3058,8 +3058,6 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, /** * @brief This function sets 1 of the 3 voltage curve points. * - * @deprecated This function is deprecated due to driver changes. - * * @details Given a device index @p dv_ind, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -3085,8 +3083,6 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, * @brief This function will retrieve the current valid regions in the * frequency/voltage space. * - * @deprecated This function is deprecated due to driver changes. - * * @details Given a device index @p dv_ind, a pointer to an unsigned integer * @p num_regions and a buffer of ::rsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current @@ -3452,7 +3448,7 @@ rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, uint32_t pisolate); /** - * @brief Clear the GPU SRAM data + * @brief Clear the GPU SRAM data * * * @details Given a device index @p dv_ind, this function will clear the diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h index 32e6bdeefc..67d9d8b8d8 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -45,17 +45,14 @@ #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include +#include #include +#include +#include +#include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -128,33 +125,13 @@ std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions, bool is_sudo_user(); rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, std::string *gfx_version); - -std::string leftTrim(const std::string &s); -std::string rightTrim(const std::string &s); -std::string trim(const std::string &s); -std::string removeNewLines(const std::string &s); - -std::string removeString(const std::string origStr, - const std::string &removeMe); template - std::string print_int_as_hex(T i, bool showHexNotation = true, - int overloadBitSize = 0) { + std::string print_int_as_hex(T i, bool showHexNotation = true) { std::stringstream ss; if (showHexNotation) { - if (overloadBitSize == 0) { - ss << "0x" << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0'); - } else { - // 8 bits per 1 byte - int byteSize = (overloadBitSize / 8) * 2; - ss << "0x" << std::hex << std::setw(byteSize) << std::setfill('0'); - } + ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; } else { - if (overloadBitSize == 0) { - ss << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0'); - } else { - int byteSize = (overloadBitSize / 8) * 2; - ss << std::hex << std::setw(byteSize) << std::setfill('0'); - } + ss << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; } if (std::is_same::value) { @@ -185,8 +162,7 @@ std::string print_unsigned_hex_and_int(T i, std::string heading="") { } ss << "Hex (MSB): " << print_int_as_hex(i) << ", " << "Unsigned int: " << print_unsigned_int(i) << ", " - << "Byte Size: " << sizeof(T) << ", " - << "Bits: " << sizeof(T) * 8; // 8 bits per 1 byte + << "Byte Size: " << sizeof(T); return ss.str(); } @@ -307,290 +283,8 @@ class ScopedAcquire { // In VM environment, the /proc/cpuinfo set hypervisor flag by default bool is_vm_guest(); - -// -enum class TagSplitterPositional_t -{ - kFIRST, - kBETWEEN, - kLAST, - kNONE, -}; - -template -class TagTextContents_t -{ - public: - using TextLines_t = std::vector; - using PrimaryList_t = std::vector; - using SecondaryList_t = std::vector; - using PrimaryKeyTbl_t = std::map; - using SecondaryKeyTbl_t = std::map; - using StructuredKeysTbl_t = std::map>; - - // - TagTextContents_t() = default; - TagTextContents_t(const TagTextContents_t&) = delete; - TagTextContents_t(TagTextContents_t&&) = delete; - TagTextContents_t& operator=(const TagTextContents_t&) = delete; - TagTextContents_t& operator=(TagTextContents_t&&) = delete; - - explicit TagTextContents_t(const TextLines_t& text_content) - : m_text_content(text_content) {} - - TagTextContents_t& set_text_content(const TextLines_t& text_content) - { - m_text_content = text_content; - } - - TagTextContents_t& set_title_terminator(const std::string& title_mark, - TagSplitterPositional_t title_mark_position) { - m_title_mark = title_mark; - m_title_mark_position = title_mark_position; - - return *this; - } - - TagTextContents_t& set_key_data_splitter(const std::string& line_splitter_mark, - TagSplitterPositional_t line_mark_position) { - m_line_splitter_mark = line_splitter_mark; - m_line_mark_position = line_mark_position; - - return *this; - } - - TagTextContents_t& structure_content() { - // Sanitizes the content. - if (!m_text_content.empty()) { - std::for_each(m_text_content.begin(), m_text_content.end(), trim); - section_title_lookup(); - section_data_lookup(); - } - - return *this; - } - - decltype(auto) get_title_size() { - return m_primary.size(); - } - - decltype(auto) get_structured_subkeys_size(const PrimaryKeyType& prim_key) { - return m_structured[prim_key].size(); - } - - decltype(auto) contains_title_key(const PrimaryKeyType& key) { - return (m_primary.find(key) != m_primary.end()); - } - - decltype(auto) contains_structured_key(const PrimaryKeyType& prim_key, - const SecondaryKeyType& sec_key) { - if (auto first_key_itr = m_structured.find(prim_key); - first_key_itr != m_structured.end()) { - if (auto sec_key_itr = first_key_itr->second.find(sec_key); - sec_key_itr != first_key_itr->second.end()) { - return true; - } - } - - return false; - } - - decltype(auto) get_structured_value_by_keys(const PrimaryKeyType& prim_key, - const SecondaryKeyType& sec_key, - bool is_value_id = true) { - if (auto first_key_itr = m_structured.find(prim_key); - first_key_itr != m_structured.end()) { - if (auto sec_key_itr = first_key_itr->second.find(sec_key); - sec_key_itr != first_key_itr->second.end()) { - SecondaryDataType key_value{}; - if (is_value_id) { - key_value = SecondaryDataType(sec_key_itr->first) + " "; - } - key_value += sec_key_itr->second; - return key_value; - } - } - - return SecondaryDataType{}; - } - - decltype(auto) get_structured_data_subkey_by_position(const PrimaryKeyType& prim_key, - uint32_t key_position) { - auto key_counter = uint32_t(0); - SecondaryKeyType data_key{}; - if (key_position < (get_structured_subkeys_size(prim_key))) { - for (const auto& [sec_key, sec_value] : m_structured[prim_key]) { - if (key_counter == key_position) { - data_key = static_cast(sec_key); - return data_key; - } - ++key_counter; - } - } - - return data_key; - } - - decltype(auto) get_structured_data_subkey_first(const PrimaryKeyType& prim_key) { - return (get_structured_value_by_keys(prim_key, - get_structured_data_subkey_by_position(prim_key, 0))); - } - - decltype(auto) get_structured_data_subkey_last(const PrimaryKeyType& prim_key) { - return (get_structured_value_by_keys(prim_key, get_structured_data_subkey_by_position(prim_key, - (get_structured_subkeys_size(prim_key) - 1)))); - } - - void reset() { - m_text_content.clear(); - m_primary.clear(); - m_structured.clear(); - m_title_mark.clear(); - m_line_splitter_mark.clear(); - m_title_mark_position = TagSplitterPositional_t::kNONE; - m_line_mark_position = TagSplitterPositional_t::kNONE; - } - - decltype(auto) dump_structured_content() { - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; - ostrstream << "** Primary Table **" << "\n"; - for (const auto& [key, values] : m_primary) { - ostrstream << "key: " << key << " values: " << values.size() << "\n"; - for (const auto& value : values) { - ostrstream << "\t value: " << value << "\n"; - } - } - - ostrstream << "\n ** Structured Table **" << "\n"; - for (const auto& [prim_key, prim_values] : m_structured) { - ostrstream << "key: " << prim_key << "\n"; - for (const auto& [sec_key, sec_value] : prim_values) { - ostrstream << "\t key: " << sec_key << " -> " << sec_value << "\n"; - } - } - ostrstream << "\n\n"; - - return ostrstream.str(); - } - - - private: - TextLines_t m_text_content; - PrimaryKeyTbl_t m_primary; - StructuredKeysTbl_t m_structured; - std::string m_title_mark; - std::string m_line_splitter_mark; - TagSplitterPositional_t m_title_mark_position; - TagSplitterPositional_t m_line_mark_position; - - // - // Note: Organizes table with Title as a Key, and a list of values. - // - decltype(auto) section_title_lookup() { - if (m_title_mark.empty() || - m_title_mark_position == TagSplitterPositional_t::kNONE) { - return; - } - - // - // Note: - // - top_title_line: Left pointer for the sliding window - // - bottom_title_line: Right pointer for the sliding window - // - auto top_title_line = uint32_t(std::numeric_limits::max()); - auto bottom_title_line = uint32_t(std::numeric_limits::max()); - auto line_counter = uint32_t(0); - - // - // Note: This whole interval/window where the section/title starts, and where it ends. - // - auto update_primary_tbl = [&](const uint32_t& from_line, const uint32_t& to_line) { - auto key = static_cast(m_text_content[from_line]); - for (auto line_num(from_line + 1); line_num < to_line; ++line_num) { - if ((line_num < m_text_content.size()) && !m_text_content[line_num].empty()) { - m_primary[key].push_back(m_text_content[line_num]); - } - } - }; - - auto adjust_sliding_window = [&](const uint32_t& title_line) { - // First time top_title_line gets adjusted. - if (top_title_line == uint32_t(std::numeric_limits::max())) { - top_title_line = title_line; - bottom_title_line = top_title_line; - return; - } - if (title_line > bottom_title_line) { - bottom_title_line = title_line; - update_primary_tbl(top_title_line, bottom_title_line); - top_title_line = bottom_title_line; - } - }; - - for (const auto& line : m_text_content) { - auto was_title_found{false}; - switch (m_title_mark_position) { - case TagSplitterPositional_t::kFIRST: - // Section/Title Mark was found at the first position - if (line.find_first_of(m_title_mark.c_str()) == 0) { - was_title_found = true; - } - break; - - case TagSplitterPositional_t::kLAST: - // Section/Title Mark was found at the last position - if ((line.find_last_of(m_title_mark.c_str()) + 1) == line.size()) { - was_title_found = true; - } - break; - - default: - break; - } - - if (was_title_found) { - adjust_sliding_window(line_counter); - } - ++line_counter; - } - - // Any remaining elements? - if (line_counter > bottom_title_line) { - update_primary_tbl(bottom_title_line, (line_counter - 1)); - } - } - - decltype(auto) section_data_lookup() { - if (m_line_splitter_mark.empty() || - m_line_mark_position == TagSplitterPositional_t::kNONE) { - return; - } - - // - // Note: Organizes table with Title as a Key, a Key/ID for values and values. - // It takes into consideration the initial constraints were all good and - // that the primary table has been populated. - for (const auto& [prim_key, prim_values] : m_primary) { - for (const auto& value : prim_values) { - if (auto mark_pos = value.find_first_of(m_line_splitter_mark.c_str()); - mark_pos != std::string::npos) { - auto sec_key = trim(value.substr(0, mark_pos + 1)); - auto sec_data = trim(value.substr((mark_pos + 1), value.size())); - if (!sec_key.empty()) { - m_structured[prim_key].insert(std::make_pair(sec_key, sec_data)); - } - } - } - } - } - -}; - -using TextFileTagContents_t = TagTextContents_t; - +// trim a string +std::string trim(const std::string &s); } // namespace smi } // namespace amd diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index aa5f30d9d1..dd8e903328 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -1415,6 +1415,17 @@ For the new format, GFXCLK field will show min and max values(0/1). If the curre frequency in neither min/max but lies within the range, this is indicated by an additional value followed by * at index 1 and max value at index 2. */ +constexpr uint32_t kOD_SCLK_label_array_index = 0; +constexpr uint32_t kOD_MCLK_label_array_index = + kOD_SCLK_label_array_index + 3; +constexpr uint32_t kOD_VDDC_CURVE_label_array_index = + kOD_MCLK_label_array_index + 2; +constexpr uint32_t kOD_OD_RANGE_label_array_index = + kOD_VDDC_CURVE_label_array_index + 4; +constexpr uint32_t kOD_VDDC_CURVE_start_index = + kOD_OD_RANGE_label_array_index + 3; +// constexpr uint32_t kOD_VDDC_CURVE_num_lines = +// kOD_VDDC_CURVE_start_index + 4; constexpr uint32_t kMIN_VALID_LINES = 2; static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, @@ -1439,75 +1450,62 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - // - const std::string kTAG_OD_SCLK{"OD_SCLK:"}; - const std::string kTAG_GFXCLK{"GFXCLK:"}; - const std::string KTAG_OD_MCLK{"OD_MCLK:"}; - const std::string KTAG_MCLK{"MCLK:"}; - const std::string KTAG_FIRST_FREQ_IDX{"0:"}; - amd::smi::TextFileTagContents_t txt_power_dev_od_voltage(val_vec); - txt_power_dev_od_voltage - .set_title_terminator(":", amd::smi::TagSplitterPositional_t::kLAST) - .set_key_data_splitter(":", amd::smi::TagSplitterPositional_t::kBETWEEN) - .structure_content(); - - // - // Note: We must have minimum of 'GFXCLK:' && 'MCLK:' OR: - // 'OD_SCLK:' && 'OD_MCLK:' tags. - if (txt_power_dev_od_voltage.get_title_size() < kMIN_VALID_LINES) { - return rsmi_status_t::RSMI_STATUS_NO_DATA; + assert(val_vec[kOD_SCLK_label_array_index] == "OD_SCLK:" || + val_vec[kOD_SCLK_label_array_index] == "GFXCLK:"); + if ((val_vec[kOD_SCLK_label_array_index] != "OD_SCLK:") && + (val_vec[kOD_SCLK_label_array_index] != "GFXCLK:")) { + return RSMI_STATUS_UNEXPECTED_DATA; } - // Note: For debug builds/purposes only. - assert(txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) || - txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)); - // Note: For release builds/purposes. - if (!txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) && - !txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)) { - return rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + + // find last_item but skip empty lines + int last_item = val_vec.size()-1; + while (val_vec[last_item].empty() || val_vec[last_item][0] == 0) + last_item--; + + p->curr_sclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_SCLK_label_array_index + 1); + p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_SCLK_label_array_index + 2); + + if (val_vec.size() < (kOD_MCLK_label_array_index + 1)) { + return RSMI_STATUS_UNEXPECTED_SIZE; + } + // The condition below checks if it is the old style or new style format. + if (val_vec[kOD_MCLK_label_array_index] == "OD_MCLK:") { + p->curr_mclk_range.lower_bound = 0; + p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_MCLK_label_array_index + 1); + } else if (val_vec[kOD_MCLK_label_array_index] == "MCLK:") { + p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_MCLK_label_array_index + 1); + // the upper memory frequency is the last + p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, last_item); + return RSMI_STATUS_SUCCESS; + } else { + if (val_vec.size() < (kOD_MCLK_label_array_index + 3)) { + return RSMI_STATUS_UNEXPECTED_SIZE; + } + if (val_vec[kOD_MCLK_label_array_index + 1] == "MCLK:") { + p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_SCLK_label_array_index + 3); + p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, + nullptr, kOD_MCLK_label_array_index + 2); + // the upper memory frequency is the last + p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, + nullptr, last_item); + return RSMI_STATUS_SUCCESS; + } + return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - // Note: Quick helpers for getting 1st and last elements found - auto build_lower_bound = [&](const std::string& prim_key) { - auto lower_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_first(prim_key); - return std::vector{lower_bound_data}; - }; - - auto build_upper_bound = [&](const std::string& prim_key) { - auto upper_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_last(prim_key); - return std::vector{upper_bound_data}; - }; - - // Validates 'OD_SCLK' is in the structure - if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); - p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); - - // Validates 'OD_MCLK' is in the structure - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); - p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); - } + if (val_vec.size() < kOD_VDDC_CURVE_label_array_index) { + return RSMI_STATUS_UNEXPECTED_SIZE; } - // Validates 'GFXCLK' is in the structure - else if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0); - p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0); - // Validates 'MCLK' is in the structure - if (txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK, - KTAG_FIRST_FREQ_IDX)) { - p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0); - p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0); - } - } - else { - return RSMI_STATUS_NOT_YET_IMPLEMENTED; - } - p->num_regions = 0; + p->num_regions = + static_cast((val_vec.size()) / 2); return RSMI_STATUS_SUCCESS; CATCH @@ -1676,6 +1674,30 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, } +static void get_vc_region(uint32_t start_ind, + std::vector *val_vec, rsmi_freq_volt_region_t *p) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + assert(p != nullptr); + assert(val_vec != nullptr); + THROW_IF_NULLPTR_DEREF(p) + THROW_IF_NULLPTR_DEREF(val_vec) + + // There must be at least 1 region to read in + assert(val_vec->size() >= kOD_OD_RANGE_label_array_index + 2); + assert((*val_vec)[kOD_OD_RANGE_label_array_index] == "OD_RANGE:"); + if ((val_vec->size() < kOD_OD_RANGE_label_array_index + 2) || + ((*val_vec)[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") ) { + ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA); + LOG_TRACE(ss); + throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__); + } + od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); + od_value_pair_str_to_range((*val_vec)[start_ind + 1], &p->volt_range); +} + /* * num_regions [inout] on calling, the number of regions requested to be read * in. At completion, the number of regions actually read in @@ -1707,20 +1729,23 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, // This is a work-around to handle systems where kDevPowerODVoltage is not // fully supported yet. - if (val_vec.size() < kMIN_VALID_LINES) { + if (val_vec.size() < 2) { ss << __PRETTY_FUNCTION__ - << " | Issue: val_vec.size() < " << kMIN_VALID_LINES << "; returning " + << " | Issue: val_vec.size() < 2" << "; returning " << getRSMIStatusString(RSMI_STATUS_NOT_YET_IMPLEMENTED); LOG_ERROR(ss); return RSMI_STATUS_NOT_YET_IMPLEMENTED; } uint32_t val_vec_size = static_cast(val_vec.size()); + assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0); + ss << __PRETTY_FUNCTION__ << " | val_vec_size = " << std::dec - << val_vec_size; + << val_vec_size + << " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index; LOG_DEBUG(ss); - *num_regions = 0; + *num_regions = std::min((val_vec_size) / 2, *num_regions); return RSMI_STATUS_SUCCESS; CATCH diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc index 45dd3fe40f..61ec4243dc 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc @@ -1134,6 +1134,14 @@ std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) { ss << pt_rng_Mhz("\t**Current SCLK frequency range: ", &odv->curr_sclk_range); ss << pt_rng_Mhz("\t**Current MCLK frequency range: ", &odv->curr_mclk_range); + ss << pt_rng_Mhz("\t**Min/Max Possible SCLK frequency range: ", + &odv->sclk_freq_limits); + ss << pt_rng_Mhz("\t**Min/Max Possible MCLK frequency range: ", + &odv->mclk_freq_limits); + + ss << "\t**Current Freq/Volt. curve: " << "\n"; + ss << pt_vddc_curve(&odv->curve); + ss << "\t**Number of Freq./Volt. regions: " << odv->num_regions << "\n\n"; return ss.str(); } @@ -1216,6 +1224,5 @@ std::queue getAllDeviceGfxVers() { return deviceGfxVersions; } - } // namespace smi } // namespace amd diff --git a/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc b/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc index e5578619f1..48bbe82934 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc @@ -183,10 +183,10 @@ void TestMutualExclusion::Run(void) { int64_t dmy_i64 = 0; char dmy_str[10]; amdsmi_dev_perf_level_t dmy_perf_lvl; - amdsmi_frequencies_t dmy_freqs{}; - amdsmi_od_volt_freq_data_t dmy_od_volt{}; - amdsmi_freq_volt_region_t dmy_vlt_reg{}; - amdsmi_error_count_t dmy_err_cnt{}; + amdsmi_frequencies_t dmy_freqs; + amdsmi_od_volt_freq_data_t dmy_od_volt; + amdsmi_freq_volt_region_t dmy_vlt_reg; + amdsmi_error_count_t dmy_err_cnt; amdsmi_ras_err_state_t dmy_ras_err_st; // This can be replaced with ASSERT_EQ() once env. stabilizes diff --git a/projects/amdsmi/tests/amd_smi_test/functional/volt_freq_curv_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/volt_freq_curv_read.cc index 080d8e9a1a..4c1a758fc9 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/volt_freq_curv_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/volt_freq_curv_read.cc @@ -146,7 +146,7 @@ static void print_amdsmi_od_volt_freq_regions(uint32_t num_regions, void TestVoltCurvRead::Run(void) { amdsmi_status_t err; - amdsmi_od_volt_freq_data_t odv{}; + amdsmi_od_volt_freq_data_t odv; TestBase::Run(); if (setup_failed_) { @@ -177,5 +177,25 @@ void TestVoltCurvRead::Run(void) { err = amdsmi_get_gpu_od_volt_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); } + + if (err == AMDSMI_STATUS_SUCCESS) { + std::cout << "\t**Frequency-voltage curve data:" << std::endl; + print_amdsmi_od_volt_freq_data_t(&odv); + + amdsmi_freq_volt_region_t *regions; + uint32_t num_regions; + regions = new amdsmi_freq_volt_region_t[odv.num_regions]; + ASSERT_TRUE(regions != nullptr); + + num_regions = odv.num_regions; + err = amdsmi_get_gpu_od_volt_curve_regions(processor_handles_[i], &num_regions, regions); + CHK_ERR_ASRT(err) + ASSERT_TRUE(num_regions == odv.num_regions); + + std::cout << "\t**Frequency-voltage curve regions:" << std::endl; + print_amdsmi_od_volt_freq_regions(num_regions, regions); + + delete []regions; + } } }