From 5b36b438b7e993f1ed543141ce2fbb18810bb33d Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 22 Nov 2023 03:32:55 -0600 Subject: [PATCH] Refactor gpu_metrics usage in CLI Signed-off-by: Maisam Arif Change-Id: I599878971ab94a768d008f046f2d303ad76fdb3b --- amdsmi_cli/amdsmi_commands.py | 89 ++++++++++++++++++------ amdsmi_cli/amdsmi_parser.py | 2 +- include/amd_smi/amdsmi.h | 15 ++-- py-interface/amdsmi_interface.py | 4 ++ py-interface/amdsmi_wrapper.py | 114 ++++++++++++++++++++----------- src/amd_smi/amd_smi.cc | 21 +++--- 6 files changed, 165 insertions(+), 80 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 8fc3d6e7d8..35ba4f4ea5 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -505,12 +505,16 @@ class AMDSMICommands(): ras_info = amdsmi_interface.amdsmi_get_gpu_ras_feature_info(args.gpu) for key, value in ras_info.items(): if isinstance(value, int): - if value == 65535 or value == 0: + if value == 65535: logging.debug(f"Failed to get ras {key} for gpu {gpu_id}") ras_info[key] = "N/A" continue - if self.logger.is_human_readable_format(): - ras_info[key] = f"{value}" + if key != "eeprom_version": + if value: + ras_info[key] = "ENABLED" + else: + ras_info[key] = "DISABLED" + ras_dict.update(ras_info) except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get ras info for gpu %s | %s", gpu_id, e.get_error_info()) @@ -981,12 +985,13 @@ class AMDSMICommands(): 'current_soc_voltage': "N/A", 'current_mem_voltage': "N/A", 'power_limit': "N/A", - 'power_management': "N/A"} + 'power_management': "N/A", + 'throttle_status': "N/A"} try: power_info = amdsmi_interface.amdsmi_get_power_info(args.gpu) for key, value in power_info.items(): - if value == 0xFFFFFFFF: + if value == 0xFFFF: power_info[key] = "N/A" elif self.logger.is_human_readable_format(): if "voltage" in key: @@ -994,7 +999,11 @@ class AMDSMICommands(): elif "power" in key: power_info[key] = f"{value} W" - 
power_dict['current_power'] = power_info['average_socket_power'] + power_dict['current_power'] = power_info['current_socket_power'] + + if power_dict['current_power'] == "N/A": + power_dict['current_power'] = power_info['average_socket_power'] + power_dict['current_gfx_voltage'] = power_info['gfx_voltage'] power_dict['current_soc_voltage'] = power_info['soc_voltage'] power_dict['current_mem_voltage'] = power_info['mem_voltage'] @@ -1012,6 +1021,16 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info()) + try: + throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_throttle_status(args.gpu) + if throttle_status: + power_dict['throttle_status'] = "THROTTLED" + else: + power_dict['throttle_status'] = "UNTHROTTLED" + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get throttle status for gpu %s | %s", gpu_id, e.get_error_info()) + + values_dict['power'] = power_dict if "clock" in current_platform_args: if args.clock: @@ -1060,8 +1079,12 @@ class AMDSMICommands(): logging.debug("Failed to get %s clock info for gpu %s | %s", clock_name, gpu_id, e.get_error_info()) try: - # is_clk_locked = amdsmi_interface.amdsmi_is_clk_locked(args.gpu, amdsmi_interface.AmdSmiClkType.GFX) - is_clk_locked = "N/A" + is_clk_locked = amdsmi_interface.amdsmi_get_gpu_metrics_gfxclk_lock_status(args.gpu) + if self.logger.is_human_readable_format(): + if is_clk_locked: + is_clk_locked = "LOCKED" + else: + is_clk_locked = "UNLOCKED" except amdsmi_exception.AmdSmiLibraryException as e: is_clk_locked = "N/A" logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info()) @@ -1114,7 +1137,7 @@ class AMDSMICommands(): if self.logger.is_human_readable_format(): unit = '\N{DEGREE SIGN}C' for temperature_key, temperature_value in temperatures.items(): - if 'AMD_SMI_STATUS' not in str(temperature_value): + if 
'N/A' not in str(temperature_value): temperatures[temperature_key] = f"{temperature_value} {unit}" values_dict['temperature'] = temperatures @@ -1123,12 +1146,26 @@ class AMDSMICommands(): ecc_count = {} try: ecc_count = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) - ecc_count['correctable'] = ecc_count.pop('correctable_count') - ecc_count['uncorrectable'] = ecc_count.pop('uncorrectable_count') + ecc_count['total_correctable'] = ecc_count.pop('correctable_count') + ecc_count['total_uncorrectable'] = ecc_count.pop('uncorrectable_count') except amdsmi_exception.AmdSmiLibraryException as e: - ecc_count['correctable'] = "N/A" - ecc_count['uncorrectable'] = "N/A" - logging.debug("Failed to get ecc count for gpu %s | %s", gpu_id, e.get_error_info()) + ecc_count['total_correctable'] = "N/A" + ecc_count['total_uncorrectable'] = "N/A" + ecc_count['cache_correctable'] = "N/A" + ecc_count['cache_uncorrectable'] = "N/A" + logging.debug("Failed to get total ecc count for gpu %s | %s", gpu_id, e.get_error_info()) + + if ecc_count['total_correctable'] != "N/A": + # Get the UMC error count for getting total cache correctable errors + umc_block = amdsmi_interface.AmdSmiGpuBlock['UMC'] + try: + umc_count = amdsmi_interface.amdsmi_get_gpu_ecc_count(args.gpu, umc_block) + ecc_count['cache_correctable'] = ecc_count['total_correctable'] - umc_count['correctable_count'] + ecc_count['cache_uncorrectable'] = ecc_count['total_uncorrectable'] - umc_count['uncorrectable_count'] + except amdsmi_exception.AmdSmiLibraryException as e: + ecc_count['cache_correctable'] = "N/A" + ecc_count['cache_uncorrectable'] = "N/A" + logging.debug("Failed to get cache ecc count for gpu %s at block %s | %s", gpu_id, umc_block, e.get_error_info()) values_dict['ecc'] = ecc_count if "pcie" in current_platform_args: @@ -1162,23 +1199,28 @@ class AMDSMICommands(): logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) try: - pci_replay_counter = 
amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) + pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_count_acc(args.gpu) pcie_dict['replay_count'] = pci_replay_counter except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Falling back to sysfs pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) + try: + pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) + pcie_dict['replay_count'] = pci_replay_counter + except amdsmi_exception.AmdSmiLibraryException as err: + pcie_dict['replay_count'] = "N/A" + logging.debug("Failed to get sysfs fallback pci replay counter for gpu %s | %s", gpu_id, err.get_error_info()) try: - # l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_pci_l0_to_recovery_counter(args.gpu) - # pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter - pcie_dict['l0_to_recovery_count'] = "N/A" + l0_to_recovery_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc(args.gpu) + pcie_dict['l0_to_recovery_count'] = l0_to_recovery_counter except amdsmi_exception.AmdSmiLibraryException as e: pcie_dict['l0_to_recovery_count'] = "N/A" logging.debug("Failed to get pcie l0 to recovery counter for gpu %s | %s", gpu_id, e.get_error_info()) try: - # pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_rollover_counter(args.gpu) - # pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter - pcie_dict['replay_roll_over_count'] = "N/A" + pci_replay_rollover_counter = amdsmi_interface.amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc(args.gpu) + pcie_dict['replay_roll_over_count'] = pci_replay_rollover_counter except amdsmi_exception.AmdSmiLibraryException as e: pcie_dict['replay_roll_over_count'] = "N/A" logging.debug("Failed to get pcie replay rollover counter for gpu %s | %s", gpu_id, e.get_error_info()) @@ -1702,7 +1744,10 @@ 
class AMDSMICommands(): try: dest_gpu_link_status = amdsmi_interface.amdsmi_is_P2P_accessible(src_gpu, dest_gpu) - src_gpu_links[dest_gpu_key] = bool(dest_gpu_link_status) + if dest_gpu_link_status: + src_gpu_links[dest_gpu_key] = "ENABLED" + else: + src_gpu_links[dest_gpu_key] = "DISABLED" except amdsmi_exception.AmdSmiLibraryException as e: src_gpu_links[dest_gpu_key] = "N/A" logging.debug("Failed to get link status for %s to %s | %s", diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py index d37eb80872..262b2358bc 100644 --- a/amdsmi_cli/amdsmi_parser.py +++ b/amdsmi_cli/amdsmi_parser.py @@ -248,7 +248,7 @@ class AMDSMIParser(argparse.ArgumentParser): command_modifier_group.add_argument('--file', action=self._check_output_file_path(), type=str, required=False, help=file_help) # Placing loglevel outside the subcommands so it can be used with any subcommand - command_modifier_group.add_argument('--loglevel', action='store', required=False, help=loglevel_help, default='ERROR', metavar='LEVEL', + command_modifier_group.add_argument('--loglevel', action='store', type=str.upper, required=False, help=loglevel_help, default='ERROR', metavar='LEVEL', choices=loglevel_choices) diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index fb27f80da2..66c2a3d6be 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -525,6 +525,7 @@ typedef struct { } amdsmi_board_info_t; typedef struct { + uint32_t current_socket_power; uint32_t average_socket_power; uint32_t gfx_voltage; // GFX voltage measurement in mV uint32_t soc_voltage; // SOC voltage measurement in mV @@ -2705,16 +2706,16 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info(amdsmi_processor_handle processor_ha * @brief Get the pm metrics table with provided device index. 
* * @details Given a device handle @p processor_handle, @p pm_metrics pointer, - * and @p num_of_metrics pointer, + * and @p num_of_metrics pointer, * this function will write the pm metrics name value pair * to the array at @p pm_metrics and the number of metrics retreived to @p num_of_metrics * Note: the library allocated memory for pm_metrics, and user must call * free(pm_metrics) to free it after use. - * + * * @param[in] processor_handle a processor handle * * @param[inout] pm_metrics A pointerto an array to hold multiple PM metrics. On successs, - * the library will allocate memory of pm_metrics and write metrics to this array. + * the library will allocate memory of pm_metrics and write metrics to this array. * The caller must free this memory after usage to avoid memory leak. * * @param[inout] num_of_metrics a pointer to uint32_t to which the number of @@ -2739,18 +2740,18 @@ amdsmi_status_t amdsmi_get_gpu_pm_metrics_info( * @brief Get the register metrics table with provided device index and register type. * * @details Given a device handle @p processor_handle, @p reg_type, @p reg_metrics pointer, - * and @p num_of_metrics pointer, + * and @p num_of_metrics pointer, * this function will write the register metrics name value pair * to the array at @p reg_metrics and the number of metrics retreived to @p num_of_metrics * Note: the library allocated memory for reg_metrics, and user must call * free(reg_metrics) to free it after use. - * + * * @param[in] processor_handle a processor handle - * + * * @param[in] reg_type The register type * * @param[inout] reg_metrics A pointerto an array to hold multiple register metrics. On successs, - * the library will allocate memory of reg_metrics and write metrics to this array. + * the library will allocate memory of reg_metrics and write metrics to this array. * The caller must free this memory after usage to avoid memory leak. 
* * @param[inout] num_of_metrics a pointer to uint32_t to which the number of diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index b61f68ac44..924c782e5d 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -1320,6 +1320,9 @@ def amdsmi_get_gpu_cache_info( "cpu_cache": cpu_cache, "simd_cache": simd_cache} + if cache_info_dict == {}: + raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NO_DATA) + return cache_info_dict @@ -1642,6 +1645,7 @@ def amdsmi_get_power_info( ) return { + "current_socket_power": power_measure.current_socket_power, "average_socket_power": power_measure.average_socket_power, "gfx_voltage": power_measure.gfx_voltage, "soc_voltage": power_measure.soc_voltage, diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index a36dd0f43c..9ab1307a90 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -866,6 +866,7 @@ class struct_amdsmi_power_info_t(Structure): struct_amdsmi_power_info_t._pack_ = 1 # source:False struct_amdsmi_power_info_t._fields_ = [ + ('current_socket_power', ctypes.c_uint32), ('average_socket_power', ctypes.c_uint32), ('gfx_voltage', ctypes.c_uint32), ('soc_voltage', ctypes.c_uint32), @@ -1514,6 +1515,31 @@ struct_amdsmi_gpu_metrics_t._fields_ = [ ] amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t +class struct_amdsmi_name_value_t(Structure): + pass + +struct_amdsmi_name_value_t._pack_ = 1 # source:False +struct_amdsmi_name_value_t._fields_ = [ + ('name', ctypes.c_char * 64), + ('value', ctypes.c_uint64), +] + +amdsmi_name_value_t = struct_amdsmi_name_value_t + +# values for enumeration 'amdsmi_reg_type_t' +amdsmi_reg_type_t__enumvalues = { + 0: 'AMDSMI_REG_XGMI', + 1: 'AMDSMI_REG_WAFL', + 2: 'AMDSMI_REG_PCIE', + 3: 'AMDSMI_REG_USR', + 4: 'AMDSMI_REG_USR1', +} +AMDSMI_REG_XGMI = 0 +AMDSMI_REG_WAFL = 1 +AMDSMI_REG_PCIE = 2 +AMDSMI_REG_USR = 3 +AMDSMI_REG_USR1 = 4 +amdsmi_reg_type_t = ctypes.c_uint32 # 
enum class struct_amdsmi_ras_feature_t(Structure): pass @@ -1824,6 +1850,12 @@ amdsmi_get_gpu_od_volt_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER( amdsmi_get_gpu_metrics_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_metrics_info amdsmi_get_gpu_metrics_info.restype = amdsmi_status_t amdsmi_get_gpu_metrics_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_gpu_metrics_t)] +amdsmi_get_gpu_pm_metrics_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_pm_metrics_info +amdsmi_get_gpu_pm_metrics_info.restype = amdsmi_status_t +amdsmi_get_gpu_pm_metrics_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.POINTER(struct_amdsmi_name_value_t)), ctypes.POINTER(ctypes.c_uint32)] +amdsmi_get_gpu_reg_table_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_reg_table_info +amdsmi_get_gpu_reg_table_info.restype = amdsmi_status_t +amdsmi_get_gpu_reg_table_info.argtypes = [amdsmi_processor_handle, amdsmi_reg_type_t, ctypes.POINTER(ctypes.POINTER(struct_amdsmi_name_value_t)), ctypes.POINTER(ctypes.c_uint32)] amdsmi_set_gpu_clk_range = _libraries['libamd_smi.so'].amdsmi_set_gpu_clk_range amdsmi_set_gpu_clk_range.restype = amdsmi_status_t amdsmi_set_gpu_clk_range.argtypes = [amdsmi_processor_handle, uint64_t, uint64_t, amdsmi_clk_type_t] @@ -2349,7 +2381,9 @@ __all__ = \ 'AMDSMI_RAS_ERR_STATE_INVALID', 'AMDSMI_RAS_ERR_STATE_LAST', 'AMDSMI_RAS_ERR_STATE_MULT_UC', 'AMDSMI_RAS_ERR_STATE_NONE', 'AMDSMI_RAS_ERR_STATE_PARITY', 'AMDSMI_RAS_ERR_STATE_POISON', - 'AMDSMI_RAS_ERR_STATE_SING_C', 'AMDSMI_SLOT_TYPE__CEM', + 'AMDSMI_RAS_ERR_STATE_SING_C', 'AMDSMI_REG_PCIE', + 'AMDSMI_REG_USR', 'AMDSMI_REG_USR1', 'AMDSMI_REG_WAFL', + 'AMDSMI_REG_XGMI', 'AMDSMI_SLOT_TYPE__CEM', 'AMDSMI_SLOT_TYPE__OAM', 'AMDSMI_SLOT_TYPE__PCIE', 'AMDSMI_SLOT_TYPE__RESERVED', 'AMDSMI_STATUS_ADDRESS_FAULT', 'AMDSMI_STATUS_AMDGPU_RESTART_ERR', 'AMDSMI_STATUS_API_FAILED', @@ -2414,12 +2448,12 @@ __all__ = \ 'FW_ID_DRV_CAP', 'FW_ID_FIRST', 'FW_ID_IMU_DRAM', 'FW_ID_IMU_IRAM', 
'FW_ID_ISP', 'FW_ID_MC', 'FW_ID_MES_KIQ', 'FW_ID_MES_STACK', 'FW_ID_MES_THREAD1', 'FW_ID_MES_THREAD1_STACK', - 'FW_ID_MMSCH', 'FW_ID_PPTABLE', 'FW_ID_PSP_BL', 'FW_ID_PSP_DBG', - 'FW_ID_PSP_INTF', 'FW_ID_PSP_KEYDB', 'FW_ID_PSP_SOC', - 'FW_ID_PSP_SOSDRV', 'FW_ID_PSP_SPL', 'FW_ID_PSP_SYSDRV', - 'FW_ID_PSP_TOC', 'FW_ID_REG_ACCESS_WHITELIST', 'FW_ID_RLC', - 'FW_ID_RLCV_LX7', 'FW_ID_RLC_P', 'FW_ID_RLC_RESTORE_LIST_CNTL', - 'FW_ID_RLC_RESTORE_LIST_GPM_MEM', + 'FW_ID_MMSCH', 'FW_ID_PM', 'FW_ID_PPTABLE', 'FW_ID_PSP_BL', + 'FW_ID_PSP_DBG', 'FW_ID_PSP_INTF', 'FW_ID_PSP_KEYDB', + 'FW_ID_PSP_SOC', 'FW_ID_PSP_SOSDRV', 'FW_ID_PSP_SPL', + 'FW_ID_PSP_SYSDRV', 'FW_ID_PSP_TOC', 'FW_ID_REG_ACCESS_WHITELIST', + 'FW_ID_RLC', 'FW_ID_RLCV_LX7', 'FW_ID_RLC_P', + 'FW_ID_RLC_RESTORE_LIST_CNTL', 'FW_ID_RLC_RESTORE_LIST_GPM_MEM', 'FW_ID_RLC_RESTORE_LIST_SRM_MEM', 'FW_ID_RLC_SAVE_RESTORE_LIST', 'FW_ID_RLC_SRLG', 'FW_ID_RLC_SRLS', 'FW_ID_RLC_V', 'FW_ID_RLX6', 'FW_ID_RLX6_CORE1', 'FW_ID_RLX6_DRAM_BOOT', @@ -2431,34 +2465,34 @@ __all__ = \ 'FW_ID_RS64_PFP_P1_DATA', 'FW_ID_SDMA0', 'FW_ID_SDMA1', 'FW_ID_SDMA2', 'FW_ID_SDMA3', 'FW_ID_SDMA4', 'FW_ID_SDMA5', 'FW_ID_SDMA6', 'FW_ID_SDMA7', 'FW_ID_SDMA_TH0', 'FW_ID_SDMA_TH1', - 'FW_ID_SEC_POLICY_STAGE2', 'FW_ID_PM', 'FW_ID_SMU', - 'FW_ID_TA_RAS', 'FW_ID_TA_XGMI', 'FW_ID_UVD', 'FW_ID_VCE', - 'FW_ID_VCN', 'FW_ID__MAX', 'MEMORY_PARTITION_NPS1', - 'MEMORY_PARTITION_NPS2', 'MEMORY_PARTITION_NPS4', - 'MEMORY_PARTITION_NPS8', 'MEMORY_PARTITION_UNKNOWN', - 'NON_AMD_CPU', 'NON_AMD_GPU', 'RD_BW0', 'TEMPERATURE_TYPE_EDGE', - 'TEMPERATURE_TYPE_FIRST', 'TEMPERATURE_TYPE_HBM_0', - 'TEMPERATURE_TYPE_HBM_1', 'TEMPERATURE_TYPE_HBM_2', - 'TEMPERATURE_TYPE_HBM_3', 'TEMPERATURE_TYPE_HOTSPOT', - 'TEMPERATURE_TYPE_JUNCTION', 'TEMPERATURE_TYPE_PLX', - 'TEMPERATURE_TYPE_VRAM', 'TEMPERATURE_TYPE__MAX', 'UNKNOWN', - 'VRAM_TYPE_DDR2', 'VRAM_TYPE_DDR3', 'VRAM_TYPE_DDR4', - 'VRAM_TYPE_GDDR1', 'VRAM_TYPE_GDDR3', 'VRAM_TYPE_GDDR4', - 'VRAM_TYPE_GDDR5', 'VRAM_TYPE_GDDR6', 
'VRAM_TYPE_HBM', - 'VRAM_TYPE_UNKNOWN', 'VRAM_TYPE__MAX', 'WR_BW0', - 'amd_metrics_table_header_t', 'amdsmi_asic_info_t', - 'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t', - 'amdsmi_cache_flags_type_t', 'amdsmi_clk_info_t', - 'amdsmi_clk_type_t', 'amdsmi_compute_partition_type_t', - 'amdsmi_container_types_t', 'amdsmi_counter_command_t', - 'amdsmi_counter_value_t', 'amdsmi_cpu_apb_disable', - 'amdsmi_cpu_apb_enable', 'amdsmi_cpusocket_handle', - 'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t', - 'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t', - 'amdsmi_dpm_level_t', 'amdsmi_driver_info_t', - 'amdsmi_engine_usage_t', 'amdsmi_error_count_t', - 'amdsmi_event_group_t', 'amdsmi_event_handle_t', - 'amdsmi_event_type_t', 'amdsmi_evt_notification_data_t', + 'FW_ID_SEC_POLICY_STAGE2', 'FW_ID_SMU', 'FW_ID_TA_RAS', + 'FW_ID_TA_XGMI', 'FW_ID_UVD', 'FW_ID_VCE', 'FW_ID_VCN', + 'FW_ID__MAX', 'MEMORY_PARTITION_NPS1', 'MEMORY_PARTITION_NPS2', + 'MEMORY_PARTITION_NPS4', 'MEMORY_PARTITION_NPS8', + 'MEMORY_PARTITION_UNKNOWN', 'NON_AMD_CPU', 'NON_AMD_GPU', + 'RD_BW0', 'TEMPERATURE_TYPE_EDGE', 'TEMPERATURE_TYPE_FIRST', + 'TEMPERATURE_TYPE_HBM_0', 'TEMPERATURE_TYPE_HBM_1', + 'TEMPERATURE_TYPE_HBM_2', 'TEMPERATURE_TYPE_HBM_3', + 'TEMPERATURE_TYPE_HOTSPOT', 'TEMPERATURE_TYPE_JUNCTION', + 'TEMPERATURE_TYPE_PLX', 'TEMPERATURE_TYPE_VRAM', + 'TEMPERATURE_TYPE__MAX', 'UNKNOWN', 'VRAM_TYPE_DDR2', + 'VRAM_TYPE_DDR3', 'VRAM_TYPE_DDR4', 'VRAM_TYPE_GDDR1', + 'VRAM_TYPE_GDDR3', 'VRAM_TYPE_GDDR4', 'VRAM_TYPE_GDDR5', + 'VRAM_TYPE_GDDR6', 'VRAM_TYPE_HBM', 'VRAM_TYPE_UNKNOWN', + 'VRAM_TYPE__MAX', 'WR_BW0', 'amd_metrics_table_header_t', + 'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t', + 'amdsmi_board_info_t', 'amdsmi_cache_flags_type_t', + 'amdsmi_clk_info_t', 'amdsmi_clk_type_t', + 'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t', + 'amdsmi_counter_command_t', 'amdsmi_counter_value_t', + 'amdsmi_cpu_apb_disable', 'amdsmi_cpu_apb_enable', + 
'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', + 'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t', + 'amdsmi_dimm_thermal_t', 'amdsmi_dpm_level_t', + 'amdsmi_driver_info_t', 'amdsmi_engine_usage_t', + 'amdsmi_error_count_t', 'amdsmi_event_group_t', + 'amdsmi_event_handle_t', 'amdsmi_event_type_t', + 'amdsmi_evt_notification_data_t', 'amdsmi_evt_notification_type_t', 'amdsmi_first_online_core_on_cpu_socket', 'amdsmi_freq_ind_t', 'amdsmi_freq_volt_region_t', 'amdsmi_frequencies_t', @@ -2561,10 +2595,12 @@ __all__ = \ 'amdsmi_get_gpu_pci_bandwidth', 'amdsmi_get_gpu_pci_replay_counter', 'amdsmi_get_gpu_pci_throughput', 'amdsmi_get_gpu_perf_level', + 'amdsmi_get_gpu_pm_metrics_info', 'amdsmi_get_gpu_power_profile_presets', 'amdsmi_get_gpu_process_info', 'amdsmi_get_gpu_process_list', 'amdsmi_get_gpu_ras_block_features_enabled', - 'amdsmi_get_gpu_ras_feature_info', 'amdsmi_get_gpu_revision', + 'amdsmi_get_gpu_ras_feature_info', + 'amdsmi_get_gpu_reg_table_info', 'amdsmi_get_gpu_revision', 'amdsmi_get_gpu_subsystem_id', 'amdsmi_get_gpu_subsystem_name', 'amdsmi_get_gpu_topo_numa_affinity', 'amdsmi_get_gpu_total_ecc_count', 'amdsmi_get_gpu_vbios_info', @@ -2590,7 +2626,7 @@ __all__ = \ 'amdsmi_is_gpu_power_management_enabled', 'amdsmi_link_id_bw_type_t', 'amdsmi_memory_page_status_t', 'amdsmi_memory_partition_type_t', 'amdsmi_memory_type_t', - 'amdsmi_mm_ip_t', 'amdsmi_od_vddc_point_t', + 'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_od_vddc_point_t', 'amdsmi_od_volt_curve_t', 'amdsmi_od_volt_freq_data_t', 'amdsmi_pcie_bandwidth_t', 'amdsmi_pcie_info_t', 'amdsmi_pcie_slot_type_t', 'amdsmi_power_cap_info_t', @@ -2599,7 +2635,7 @@ __all__ = \ 'amdsmi_proc_info_t', 'amdsmi_process_handle_t', 'amdsmi_process_info_t', 'amdsmi_processor_handle', 'amdsmi_range_t', 'amdsmi_ras_err_state_t', - 'amdsmi_ras_feature_t', 'amdsmi_reset_gpu', + 'amdsmi_ras_feature_t', 'amdsmi_reg_type_t', 'amdsmi_reset_gpu', 'amdsmi_reset_gpu_compute_partition', 'amdsmi_reset_gpu_fan', 
'amdsmi_reset_gpu_memory_partition', 'amdsmi_reset_gpu_xgmi_error', 'amdsmi_retired_page_record_t', @@ -2646,7 +2682,7 @@ __all__ = \ 'struct_amdsmi_freq_volt_region_t', 'struct_amdsmi_frequencies_t', 'struct_amdsmi_frequency_range_t', 'struct_amdsmi_fw_info_t', 'struct_amdsmi_gpu_cache_info_t', 'struct_amdsmi_gpu_metrics_t', - 'struct_amdsmi_link_id_bw_type_t', + 'struct_amdsmi_link_id_bw_type_t', 'struct_amdsmi_name_value_t', 'struct_amdsmi_od_vddc_point_t', 'struct_amdsmi_od_volt_curve_t', 'struct_amdsmi_od_volt_freq_data_t', 'struct_amdsmi_pcie_bandwidth_t', 'struct_amdsmi_pcie_info_t', diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index c69bc63f85..f92e28c224 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1859,22 +1859,21 @@ amdsmi_get_power_info(amdsmi_processor_handle processor_handle, amdsmi_power_inf if (status != AMDSMI_STATUS_SUCCESS) return status; - info->average_socket_power = 0xFFFFFFFF; - info->gfx_voltage = 0xFFFFFFFF; - info->soc_voltage = 0xFFFFFFFF; // Not implmented yet - info->mem_voltage = 0xFFFFFFFF; // Not implmented yet - info->power_limit = 0xFFFFFFFF; + info->current_socket_power = 0xFFFF; + info->average_socket_power = 0xFFFF; + info->gfx_voltage = 0xFFFF; + info->soc_voltage = 0xFFFF; + info->mem_voltage = 0xFFFF; + info->power_limit = 0xFFFF; amdsmi_gpu_metrics_t metrics = {}; status = amdsmi_get_gpu_metrics_info(processor_handle, &metrics); if (status == AMDSMI_STATUS_SUCCESS) { + info->current_socket_power = metrics.current_socket_power; info->average_socket_power = metrics.average_socket_power; - } - - int64_t voltage_read = 0; - status = amdsmi_get_gpu_volt_metric(processor_handle, AMDSMI_VOLT_TYPE_VDDGFX, AMDSMI_VOLT_CURRENT, &voltage_read); - if (status == AMDSMI_STATUS_SUCCESS) { - info->gfx_voltage = voltage_read; + info->gfx_voltage = metrics.voltage_gfx; + info->soc_voltage = metrics.voltage_soc; + info->mem_voltage = metrics.voltage_mem; } int power_limit = 0;