From fb0440d49344e73cb0bb43129b5e7ec44874108a Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 26 Sep 2023 14:45:44 -0500 Subject: [PATCH] Added sleep state to amd-smi metric --clock Change-Id: Idb5fbc84a787ef1affdf0449b6dd77ab6e50e91d Signed-off-by: Maisam Arif [ROCm/amdsmi commit: 95337c88fc89c974cbcdf817969c83fd91ddce16] --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 154 ++++++++++-------- projects/amdsmi/include/amd_smi/amdsmi.h | 3 +- .../include/amd_smi/impl/amd_smi_utils.h | 2 +- .../amdsmi/py-interface/amdsmi_interface.py | 1 + .../amdsmi/py-interface/amdsmi_wrapper.py | 3 +- projects/amdsmi/src/amd_smi/amd_smi.cc | 6 +- projects/amdsmi/src/amd_smi/amd_smi_utils.cc | 34 ++-- 7 files changed, 118 insertions(+), 85 deletions(-) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index f9cba62615..8e7d901789 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -207,7 +207,7 @@ class AMDSMICommands(): static_dict['asic'] = asic_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['asic'] = "N/A" - logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) if args.bus: bus_output_info = {} @@ -243,13 +243,13 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: bus_info = "N/A" - logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get bus info for gpu %s | %s", gpu_id, e.get_error_info()) try: bus_output_info['bdf'] = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: bus_output_info['bdf'] = "N/A" - logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get bdf for gpu %s | %s", gpu_id, e.get_error_info()) bus_output_info.update(bus_info) static_dict['bus'] = bus_output_info @@ -259,7 +259,7 @@ class AMDSMICommands(): static_dict['vbios'] = vbios_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['vbios'] = "N/A" - logging.debug("Failed to get vbios info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get vbios info for gpu %s | %s", gpu_id, e.get_error_info()) if self.helpers.is_linux() and self.helpers.is_baremetal(): if args.board: @@ -275,7 +275,7 @@ class AMDSMICommands(): static_dict['board'] = board_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['board'] = "N/A" - logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get board info for gpu %s | %s", gpu_id, e.get_error_info()) if args.limit: # Power limits try: @@ -287,7 +287,7 @@ class AMDSMICommands(): power_limit_error = True max_power_limit = "N/A" current_power_limit = "N/A" - logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get power cap info for gpu %s | %s", gpu_id, e.get_error_info()) # Edge temperature limits try: @@ -297,7 +297,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: slowdown_temp_edge_limit_error = True slowdown_temp_edge_limit = "N/A" - logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get edge temperature slowdown metric for gpu %s | %s", gpu_id, e.get_error_info()) if slowdown_temp_edge_limit == 0: slowdown_temp_edge_limit_error = True @@ -310,7 +310,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: shutdown_temp_edge_limit_error = True shutdown_temp_edge_limit = "N/A" - logging.debug("Failed to get edge temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get edge temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) if shutdown_temp_edge_limit == 0: shutdown_temp_edge_limit_error = True @@ -324,7 +324,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: slowdown_temp_hotspot_limit_error = True slowdown_temp_hotspot_limit = "N/A" - logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get hotspot temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) try: shutdown_temp_hotspot_limit_error = False @@ -333,7 +333,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: shutdown_temp_hotspot_limit_error = True shutdown_temp_hotspot_limit = "N/A" - logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get hotspot temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) # VRAM temperature limits @@ -344,7 +344,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: slowdown_temp_vram_limit_error = True slowdown_temp_vram_limit = "N/A" - logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get vram temperature slowdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) try: shutdown_temp_vram_limit_error = False @@ -353,7 +353,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: shutdown_temp_vram_limit_error = True shutdown_temp_vram_limit = "N/A" - logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get vram temperature shutdown metrics for gpu %s | %s", gpu_id, e.get_error_info()) if self.logger.is_human_readable_format(): unit = 'W' @@ -396,7 +396,7 @@ class AMDSMICommands(): static_dict['driver'] = driver_info except amdsmi_exception.AmdSmiLibraryException as e: static_dict['driver'] = "N/A" - logging.debug("Failed to get driver info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get driver info for gpu %s | %s", gpu_id, e.get_error_info()) if self.helpers.is_hypervisor() or self.helpers.is_baremetal(): if args.ras: @@ -404,7 +404,7 @@ class AMDSMICommands(): static_dict['ras'] = amdsmi_interface.amdsmi_get_gpu_ras_block_features_enabled(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: static_dict['ras'] = "N/A" - logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get ras block features for gpu %s | %s", gpu_id, e.get_error_info()) if args.vram: try: vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu) @@ -421,7 +421,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: vram_info = "N/A" - logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['vram'] = vram_info @@ -431,13 +431,13 @@ class AMDSMICommands(): numa_node_number = amdsmi_interface.amdsmi_topo_get_numa_node_number(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: numa_node_number = "N/A" - logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get numa node number for gpu %s | %s", gpu_id, e.get_error_info()) try: numa_affinity = amdsmi_interface.amdsmi_get_gpu_topo_numa_affinity(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: numa_affinity = "N/A" - logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get numa affinity for gpu %s | %s", gpu_id, e.get_error_info()) static_dict['numa'] = {'node' : numa_node_number, 'affinity' : numa_affinity} @@ -527,7 +527,7 @@ class AMDSMICommands(): fw_list.update(fw_info) except amdsmi_exception.AmdSmiLibraryException as e: fw_list['fw_list'] = "N/A" - logging.debug("Failed to get firmware info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get firmware info for gpu %s | %s", gpu_id, e.get_error_info()) multiple_devices_csv_override = False # Convert and store output by pid for csv format @@ -603,7 +603,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: bad_page_error = True bad_page_err_output = "N/A" - logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get bad page info for gpu %s | %s", gpu_id, e.get_error_info()) if bad_page_info == "No bad pages found.": bad_page_error = True @@ -812,6 +812,7 @@ class AMDSMICommands(): # Get gpu_id for logging gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + logging.debug("GPU Metrics table for %s | %s", gpu_id, amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)) if self.helpers.is_linux() and self.helpers.is_baremetal(): if args.usage: @@ -833,7 +834,7 @@ class AMDSMICommands(): values_dict['usage'] = engine_usage except amdsmi_exception.AmdSmiLibraryException as e: values_dict['usage'] = "N/A" - logging.debug("Failed to get gpu activity for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get gpu activity for gpu %s | %s", gpu_id, e.get_error_info()) if args.power: power_dict = {'current_power': "N/A", 'current_gfx_voltage': "N/A", @@ -860,7 +861,7 @@ class AMDSMICommands(): power_dict['power_limit'] = power_info['power_limit'] except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get power info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get power info for gpu %s | %s", gpu_id, e.get_error_info()) try: is_power_management_enabled = amdsmi_interface.amdsmi_is_gpu_power_management_enabled(args.gpu) @@ -869,47 +870,63 @@ class AMDSMICommands(): else: power_dict['power_management'] = "DISABLED" except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get power management status for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['power'] = power_dict if args.clock: - clocks = {"gfx": "N/A", "mem": "N/A"} + clocks = {"gfx" : {"cur_clk": "N/A", + "max_clk": "N/A", + "min_clk": "N/A", + "sleep_clk": "N/A", + "is_clk_locked": "N/A"}, + "mem" : {"cur_clk": "N/A", + "max_clk": "N/A", + "min_clk": "N/A", + "sleep_clk": "N/A"} + } try: gfx_clock = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.GFX) + if gfx_clock['sleep_clk'] == 0xFFFFFFFF: + gfx_clock['sleep_clk'] = "N/A" if self.logger.is_human_readable_format(): unit = 'MHz' for key, value in gfx_clock.items(): - gfx_clock[key] = f"{value} {unit}" + if isinstance(value, int): + gfx_clock[key] = f"{value} {unit}" - clocks['gfx'] = gfx_clock + clocks['gfx']['cur_clk'] = gfx_clock['cur_clk'] + clocks['gfx']['max_clk'] = gfx_clock['max_clk'] + clocks['gfx']['min_clk'] = gfx_clock['min_clk'] + clocks['gfx']['sleep_clk'] = gfx_clock['sleep_clk'] except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get gfx clock info for gpu %s | %s", gpu_id, e.get_error_info()) try: # is_clk_locked = amdsmi_interface.amdsmi_is_clk_locked(args.gpu, amdsmi_interface.AmdSmiClkType.GFX) is_clk_locked = "N/A" - except amdsmi_exception.AmdSmiLibraryException as e: - is_clk_locked = "N/A" - logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info()) - - if isinstance(clocks['gfx'], dict): clocks['gfx']['is_clk_locked'] = is_clk_locked - else: - clocks['gfx'] = {'is_clk_locked': is_clk_locked} + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get gfx clock lock status info for gpu %s | %s", gpu_id, e.get_error_info()) try: mem_clock = amdsmi_interface.amdsmi_get_clock_info(args.gpu, amdsmi_interface.AmdSmiClkType.MEM) + if mem_clock['sleep_clk'] == 0xFFFFFFFF: + mem_clock['sleep_clk'] = "N/A" if self.logger.is_human_readable_format(): unit = 'MHz' for key, value in mem_clock.items(): - mem_clock[key] = f"{value} {unit}" + if isinstance(value, int): + gfx_clock[key] = f"{value} {unit}" - clocks['mem'] = mem_clock + clocks['mem']['cur_clk'] = mem_clock['cur_clk'] + clocks['mem']['max_clk'] = mem_clock['max_clk'] + clocks['mem']['min_clk'] = mem_clock['min_clk'] + clocks['mem']['sleep_clk'] = mem_clock['sleep_clk'] except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get mem clock info for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['clock'] = clocks if args.temperature: @@ -918,14 +935,14 @@ class AMDSMICommands(): args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) except amdsmi_exception.AmdSmiLibraryException as e: temperature_edge_current = "N/A" - logging.debug("Failed to get current edge temperature for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get current edge temperature for gpu %s | %s", gpu_id, e.get_error_info()) try: temperature_edge_limit = amdsmi_interface.amdsmi_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.EDGE, amdsmi_interface.AmdSmiTemperatureMetric.CRITICAL) except amdsmi_exception.AmdSmiLibraryException as e: temperature_edge_limit = "N/A" - logging.debug("Failed to get edge temperature limit for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get edge temperature limit for gpu %s | %s", gpu_id, e.get_error_info()) # If edge limit is reporting 0 then set the current edge temp to N/A if temperature_edge_limit == 0: @@ -936,14 +953,14 @@ class AMDSMICommands(): args.gpu, amdsmi_interface.AmdSmiTemperatureType.HOTSPOT, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) except amdsmi_exception.AmdSmiLibraryException as e: temperature_hotspot_current = "N/A" - logging.debug("Failed to get current hotspot temperature for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get current hotspot temperature for gpu %s | %s", gpu_id, e.get_error_info()) try: temperature_vram_current = amdsmi_interface.amdsmi_get_temp_metric( args.gpu, amdsmi_interface.AmdSmiTemperatureType.VRAM, amdsmi_interface.AmdSmiTemperatureMetric.CURRENT) except amdsmi_exception.AmdSmiLibraryException as e: temperature_vram_current = "N/A" - logging.debug("Failed to get current vram temperature for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get current vram temperature for gpu %s | %s", gpu_id, e.get_error_info()) temperatures = {'edge': temperature_edge_current, 'hotspot': temperature_hotspot_current, @@ -965,7 +982,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: ecc_count['correctable'] = "N/A" ecc_count['uncorrectable'] = "N/A" - logging.debug("Failed to get ecc count for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get ecc count for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['ecc'] = ecc_count if args.ecc_block: @@ -980,15 +997,14 @@ class AMDSMICommands(): ecc_dict[state['block']] = {'correctable' : ecc_count['correctable_count'], 'uncorrectable': ecc_count['uncorrectable_count']} except amdsmi_exception.AmdSmiLibraryException as e: - ecc_count = "N/A" - logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info()) + ecc_dict[state['block']] = {'correctable' : "N/A", + 'uncorrectable': "N/A"} + logging.debug("Failed to get ecc count for gpu %s at block %s | %s", gpu_id, gpu_block, e.get_error_info()) - ecc_dict[state['block']] = {'correctable' : ecc_count, - 'uncorrectable': ecc_count} values_dict['ecc_block'] = ecc_dict except amdsmi_exception.AmdSmiLibraryException as e: values_dict['ecc_block'] = "N/A" - logging.debug("Failed to get ecc block features for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get ecc block features for gpu %s | %s", gpu_id, e.get_error_info()) if args.pcie: pcie_dict = {'current_width': "N/A", 'current_speed': "N/A", @@ -1012,13 +1028,13 @@ class AMDSMICommands(): unit = 'GT/s' pcie_link_status['current_speed'] = f"{pcie_link_status['pcie_speed']} {unit}" except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get pcie link status for gpu %s | %s", gpu_id, e.get_error_info()) try: pci_replay_counter = amdsmi_interface.amdsmi_get_gpu_pci_replay_counter(args.gpu) pcie_dict['replay_count'] = pci_replay_counter except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get pci replay counter for gpu %s | %s", gpu_id, e.get_error_info()) try: pcie_bw = amdsmi_interface.amdsmi_get_gpu_pci_throughput(args.gpu) @@ -1039,7 +1055,7 @@ class AMDSMICommands(): pcie_dict['current_bandwith_received'] = received pcie_dict['max_packet_size'] = pcie_bw['max_pkt_sz'] except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get pcie bandwidth for gpu %s | %s", gpu_id, e.get_error_info()) values_dict['pcie'] = pcie_dict if args.fan: @@ -1092,7 +1108,7 @@ class AMDSMICommands(): values_dict['voltage_curve'] = voltage_point_dict except amdsmi_exception.AmdSmiLibraryException as e: values_dict['voltage_curve'] = "N/A" - logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get voltage curve for gpu %s | %s", gpu_id, e.get_error_info()) if args.overdrive: try: overdrive_level = amdsmi_interface.amdsmi_get_gpu_overdrive_level(args.gpu) @@ -1104,14 +1120,14 @@ class AMDSMICommands(): values_dict['overdrive'] = overdrive_level except amdsmi_exception.AmdSmiLibraryException as e: values_dict['overdrive'] = "N/A" - logging.debug("Failed to get overdrive level for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get overdrive level for gpu %s | %s", gpu_id, e.get_error_info()) if args.perf_level: try: perf_level = amdsmi_interface.amdsmi_get_gpu_perf_level(args.gpu) values_dict['perf_level'] = perf_level except amdsmi_exception.AmdSmiLibraryException as e: values_dict['perf_level'] = "N/A" - logging.debug("Failed to get perf level for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get perf level for gpu %s | %s", gpu_id, e.get_error_info()) if self.helpers.is_linux() and self.helpers.is_baremetal(): if args.xgmi_err: @@ -1120,7 +1136,7 @@ class AMDSMICommands(): values_dict['xgmi_err'] = amdsmi_interface.amdsmi_wrapper.amdsmi_xgmi_status_t__enumvalues[xgmi_err_status] except amdsmi_exception.AmdSmiLibraryException as e: values_dict['xgmi_err'] = "N/A" - logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get xgmi error status for gpu %s | %s", gpu_id, e.get_error_info()) if args.energy: try: energy_dict = amdsmi_interface.amdsmi_get_energy_count(args.gpu) @@ -1156,19 +1172,19 @@ class AMDSMICommands(): total_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VRAM) memory_usage['total_vram'] = total_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get total VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get total VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: total_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) memory_usage['total_visible_vram'] = total_visible_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get total VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: total_gtt = amdsmi_interface.amdsmi_get_gpu_memory_total(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) memory_usage['total_gtt'] = total_gtt // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get total GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get total GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) # Used VRAM try: @@ -1176,19 +1192,19 @@ class AMDSMICommands(): memory_usage['used_vram'] = used_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get used VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get used VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: used_visible_vram = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.VIS_VRAM) memory_usage['used_visible_vram'] = used_visible_vram // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get used VIS VRAM memory for gpu %s | %s", gpu_id, e.get_error_info()) try: used_gtt = amdsmi_interface.amdsmi_get_gpu_memory_usage(args.gpu, amdsmi_interface.AmdSmiMemoryType.GTT) memory_usage['used_gtt'] = used_gtt // (1024*1024) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get used GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get used GTT memory for gpu %s | %s", gpu_id, e.get_error_info()) # Free VRAM if memory_usage['total_vram'] != "N/A" and memory_usage['used_vram'] != "N/A": @@ -1311,7 +1327,7 @@ class AMDSMICommands(): try: process_list = amdsmi_interface.amdsmi_get_gpu_process_list(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info()) raise e filtered_process_values = [] @@ -1320,7 +1336,7 @@ class AMDSMICommands(): process_info = amdsmi_interface.amdsmi_get_gpu_process_info(args.gpu, process_handle) except amdsmi_exception.AmdSmiLibraryException as e: process_info = "N/A" - logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", gpu_id, process_handle, e.get_error_info()) + logging.debug("Failed to get process info for gpu %s on process_handle %s | %s", gpu_id, process_handle, e.get_error_info()) filtered_process_values.append({'process_info': process_info}) continue @@ -1786,7 +1802,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_clocks_results['overdrive'] = "N/A" - logging.debug("Failed to reset overdrive on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to reset overdrive on gpu %s | %s", gpu_id, e.get_error_info()) try: level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO @@ -1796,7 +1812,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_clocks_results['clocks'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) try: level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO @@ -1806,7 +1822,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_clocks_results['performance'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_clocks', reset_clocks_results) if args.fans: @@ -1817,7 +1833,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e result = "N/A" - logging.debug("Failed to reset fans on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to reset fans on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_fans', result) if args.profile: @@ -1831,7 +1847,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_profile_results['power_profile'] = "N/A" - logging.debug("Failed to reset power profile on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to reset power profile on gpu %s | %s", gpu_id, e.get_error_info()) try: level_auto = amdsmi_interface.AmdSmiDevPerfLevel.AUTO @@ -1841,7 +1857,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e reset_profile_results['performance_level'] = "N/A" - logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to reset perf level on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_profile', reset_profile_results) if args.xgmierr: @@ -1852,7 +1868,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e result = "N/A" - logging.debug("Failed to reset xgmi error count on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to reset xgmi error count on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_xgmi_err', result) if args.perfdeterminism: try: @@ -1863,7 +1879,7 @@ class AMDSMICommands(): if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e result = "N/A" - logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info()) + logging.debug("Failed to set perf level on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.store_output(args.gpu, 'reset_perf_determinism', result) diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index f7b8d7ee5e..4f8f2536f1 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -471,7 +471,8 @@ typedef struct { uint32_t cur_clk; uint32_t min_clk; uint32_t max_clk; - uint32_t reserved[5]; + uint32_t sleep_clk; + uint32_t reserved[4]; } amdsmi_clk_info_t; typedef struct { diff --git a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h index 4a9f0d0740..ae8f2b26ad 100644 --- a/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h +++ b/projects/amdsmi/include/amd_smi/impl/amd_smi_utils.h @@ -36,7 +36,7 @@ amdsmi_status_t smi_amdgpu_find_hwmon_dir(amd::smi::AMDSmiGPUDevice* device, std::string* full_path); amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amdsmi_board_info_t *info); amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int *cap); -amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm); +amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq); amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks); amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info); amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt); diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 526aa8a5db..9731bf6c65 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -758,6 +758,7 @@ def amdsmi_get_clock_info( "cur_clk": clock_measure.cur_clk, "max_clk": clock_measure.max_clk, "min_clk": clock_measure.min_clk, + "sleep_clk" : clock_measure.sleep_clk, } diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 9c49d42b1a..2c0fcfe1ab 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -817,7 +817,8 @@ struct_amdsmi_clk_info_t._fields_ = [ ('cur_clk', ctypes.c_uint32), ('min_clk', ctypes.c_uint32), ('max_clk', ctypes.c_uint32), - ('reserved', ctypes.c_uint32 * 5), + ('sleep_clk', ctypes.c_uint32), + ('reserved', ctypes.c_uint32 * 4), ] amdsmi_clk_info_t = struct_amdsmi_clk_info_t diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 9e376dd912..f6a3fbef4c 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1082,7 +1082,7 @@ amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, info->power_cap = power_cap; status = smi_amdgpu_get_ranges(gpudevice, CLK_TYPE_GFX, - NULL, NULL, &dpm); + NULL, NULL, &dpm, NULL); if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; info->dpm_cap = dpm; @@ -1482,13 +1482,15 @@ amdsmi_get_clock_info(amdsmi_processor_handle processor_handle, amdsmi_clk_type_ } int max_freq; int min_freq; + int sleep_state_freq; status = smi_amdgpu_get_ranges(gpu_device, clk_type, - &max_freq, &min_freq, NULL); + &max_freq, &min_freq, NULL, &sleep_state_freq); if (status != AMDSMI_STATUS_SUCCESS) { return status; } info->max_clk = max_freq; info->min_clk = min_freq; + info->sleep_clk = sleep_state_freq; switch (clk_type) { case CLK_TYPE_GFX: diff --git a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc index 7c8123ecdc..fbf8d8efb6 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_utils.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_utils.cc @@ -181,15 +181,13 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int } amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, - int *max_freq, int *min_freq, int *num_dpm) + int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq) { if (!device->check_if_drm_is_supported()) { return AMDSMI_STATUS_NOT_SUPPORTED; } SMIGPUDEVICE_MUTEX(device->get_mutex()) std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + "/device"; - char str[10]; - unsigned int max, min, dpm; switch (domain) { case CLK_TYPE_GFX: @@ -214,20 +212,32 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_ return AMDSMI_STATUS_API_FAILED; } + unsigned int max, min, dpm, sleep_freq; + char str[10]; + char single_char; max = 0; min = UINT_MAX; dpm = 0; + sleep_freq = UINT_MAX; + for (std::string line; getline(ranges, line);) { - unsigned int d, freq; + unsigned int dpm_level, freq; - if (sscanf(line.c_str(), "%u: %d%s", &d, &freq, str) <= 2){ - ranges.close(); - return AMDSMI_STATUS_IO; + char firstChar = line[0]; + if (firstChar == 'S'){ + if (sscanf(line.c_str(), "%c: %d%s", &single_char, &sleep_freq, str) <= 2){ + ranges.close(); + return AMDSMI_STATUS_NO_DATA; + } + } else { + if (sscanf(line.c_str(), "%u: %d%c", &dpm_level, &freq, str) <= 2){ + ranges.close(); + return AMDSMI_STATUS_IO; + } + max = freq > max ? freq : max; + min = freq < min ? freq: min; + dpm = dpm_level > dpm ? dpm_level : dpm; } - - max = freq > max ? freq : max; - min = freq < min ? freq: min; - dpm = d > dpm ? d : dpm; } if (num_dpm) @@ -236,6 +246,8 @@ amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_ *max_freq = max; if (min_freq) *min_freq = min; + if (sleep_state_freq) + *sleep_state_freq = sleep_freq; ranges.close(); return AMDSMI_STATUS_SUCCESS;