diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 12f604a863..7febc5dc23 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -561,26 +561,14 @@ class AMDSMICommands(): if args.cache: try: cache_info = amdsmi_interface.amdsmi_get_gpu_cache_info(args.gpu) - logging.debug("Before dictionary modify | cache_info = " + str(cache_info)) - for key, cache_values in cache_info.items(): - cache_properties = "N/A" - if 'cache_flags' in list(cache_info[key].keys()): - if isinstance(cache_values['cache_flags'], list): - cache_properties = list(cache_values['cache_flags']) - cache_values.pop('cache_flags') # remove cache_flags from output - cache_info[key] = { # add properties to top of key's dictionary - 'cache_properties': list(cache_properties), - **cache_info[key] # append remaining key's dictionary - } - logging.debug("After dictionary modify | cache_info = " + str(cache_info)) + logging.debug(f"cache_info dictionary = {cache_info}") + if self.logger.is_human_readable_format(): for key, cache_values in cache_info.items(): cache_values['cache_size'] = f"{cache_values['cache_size']} KB" # take cache_properties out of list -> display as string, removing brackets - update_cache_properties = str(cache_values['cache_properties']) - update_cache_properties = update_cache_properties.replace("[","").replace("]", "") - cache_values['cache_properties'] = update_cache_properties - logging.debug("After human_readable | cache_info = " + str(cache_info)) + cache_values['cache_properties'] = ", ".join(cache_values['cache_properties']) + logging.debug(f"After human_readable | cache_info = {cache_info}") except amdsmi_exception.AmdSmiLibraryException as e: cache_info = "N/A" @@ -1142,9 +1130,6 @@ class AMDSMICommands(): if args.usage: try: engine_usage = amdsmi_interface.amdsmi_get_gpu_activity(args.gpu) - engine_usage['gfx_activity'] = engine_usage.pop('gfx_activity') - engine_usage['umc_activity'] = engine_usage.pop('umc_activity') - engine_usage['mm_activity'] = engine_usage.pop('mm_activity') # TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity gpu_metric_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) @@ -1152,20 +1137,17 @@ class AMDSMICommands(): engine_usage['jpeg_activity'] = gpu_metric_info.pop('jpeg_activity') for key, value in engine_usage.items(): - if self.logger.is_human_readable_format(): unit = '%' if isinstance(value, list): - engine_usage[key] = [f"{v} {unit}" if str(v) != "N/A" else str(v) for v in engine_usage[key]] - save_value = engine_usage[key] - pretty_array = "[" - for i in range(len(save_value)): - if (i+1 != len(save_value)): - pretty_array += save_value[i] + ", " - else: - pretty_array += save_value[i] + "]" - engine_usage[key] = pretty_array - elif not isinstance(value, list) and engine_usage[key] != "N/A": + for index, activity in enumerate(value): + if activity != "N/A": + engine_usage[key][index] = f"{activity} {unit}" + + # Convert list to a string for human readable format + engine_usage[key] = '[' + ", ".join(engine_usage[key]) + ']' + + elif value != "N/A": engine_usage[key] = f"{value} {unit}" values_dict['usage'] = engine_usage @@ -1196,7 +1178,8 @@ class AMDSMICommands(): power_dict['current_power'] = power_info['current_socket_power'] if power_dict['current_power'] == "N/A": - power_dict['average_power'] = power_info['average_socket_power'] + # For older gpu's when current power doesn't populate we use the average socket power instead + power_dict['current_power'] = power_info['average_socket_power'] power_dict['current_gfx_voltage'] = power_info['gfx_voltage'] power_dict['current_soc_voltage'] = power_info['soc_voltage'] @@ -2654,7 +2637,7 @@ class AMDSMICommands(): if not any([args.access, args.weight, args.hops, args.link_type, args.numa_bw]): args.access = args.weight = args.hops = args.link_type= args.numa_bw = True - # Clear the table header; TODO make this a function + # Clear the table header self.logger.table_header = ''.rjust(12) # Populate the possible gpus @@ -3351,7 +3334,7 @@ class AMDSMICommands(): # Get gpu_id for logging gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) - # Clear the table header; TODO make this a function + # Clear the table header self.logger.table_header = '' # Store timestamp for watch output @@ -3365,12 +3348,14 @@ class AMDSMICommands(): try: gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu) - monitor_values['power_usage'] = gpu_metrics_info['current_socket_power'] - if monitor_values['power_usage'] == "N/A": # Fallback to average_socket_power for older gpu_metrics versions + if gpu_metrics_info['current_socket_power'] != "N/A": + monitor_values['power_usage'] = gpu_metrics_info['current_socket_power'] + else: # Fallback to average_socket_power for older gpu_metrics versions monitor_values['power_usage'] = gpu_metrics_info['average_socket_power'] if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A": - monitor_values['power_usage'] = f"{monitor_values['power_usage']} W" + unit = 'W' + monitor_values['power_usage'] = f"{monitor_values['power_usage']} {unit}" except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['power_usage'] = "N/A" logging.debug("Failed to get power usage on gpu %s | %s", gpu_id, e.get_error_info()) @@ -3403,7 +3388,7 @@ class AMDSMICommands(): try: gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_gfx_activity'] monitor_values['gfx'] = gfx_util - if self.logger.is_human_readable_format(): + if self.logger.is_human_readable_format() and gfx_util != "N/A": monitor_values['gfx'] = f"{monitor_values['gfx']} %" except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['gfx'] = "N/A" @@ -3414,7 +3399,7 @@ class AMDSMICommands(): try: gfx_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_gfxclk'] monitor_values['gfx_clock'] = gfx_clock - if self.logger.is_human_readable_format(): + if self.logger.is_human_readable_format() and gfx_clock != "N/A": monitor_values['gfx_clock'] = f"{monitor_values['gfx_clock']} MHz" except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['gfx_clock'] = "N/A" @@ -3425,7 +3410,7 @@ class AMDSMICommands(): try: mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_umc_activity'] monitor_values['mem'] = mem_util - if self.logger.is_human_readable_format(): + if self.logger.is_human_readable_format() and mem_util != "N/A": monitor_values['mem'] = f"{monitor_values['mem']} %" except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['mem'] = "N/A" @@ -3436,7 +3421,7 @@ class AMDSMICommands(): try: mem_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_uclk'] monitor_values['mem_clock'] = mem_clock - if self.logger.is_human_readable_format(): + if self.logger.is_human_readable_format() and mem_clock != "N/A": monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} MHz" except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['mem_clock'] = "N/A" @@ -3449,13 +3434,15 @@ class AMDSMICommands(): encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity'] encoding_activity_avg = [] for value in encoder_util: - if value < 150: # each encoder chiplet's value range should be a percent + if isinstance(value, int): encoding_activity_avg.append(value) + # Averaging the possible encoding activity values if encoding_activity_avg: encoding_activity_avg = sum(encoding_activity_avg) / len(encoding_activity_avg) else: encoding_activity_avg = "N/A" + monitor_values['encoder'] = encoding_activity_avg if self.logger.is_human_readable_format() and monitor_values['encoder'] != "N/A": monitor_values['encoder'] = f"{monitor_values['encoder']} %" @@ -3468,7 +3455,7 @@ class AMDSMICommands(): try: encoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0'] monitor_values['encoder_clock'] = encoder_clock - if self.logger.is_human_readable_format(): + if self.logger.is_human_readable_format() and encoder_clock != "N/A": monitor_values['encoder_clock'] = f"{monitor_values['encoder_clock']} MHz" except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['encoder_clock'] = "N/A" @@ -3500,10 +3487,11 @@ class AMDSMICommands(): if args.throttle_status: try: throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status'] - if throttle_status: - throttle_status = "THROTTLED" - else: - throttle_status = "UNTHROTTLED" + if throttle_status != "N/A": + if throttle_status: + throttle_status = "THROTTLED" + else: + throttle_status = "UNTHROTTLED" monitor_values['throttle_status'] = throttle_status except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['throttle_status'] = "N/A" diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 06a927730d..e30de16825 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -660,9 +660,9 @@ typedef struct { typedef uint32_t amdsmi_process_handle_t; typedef struct { - char name[AMDSMI_NORMAL_STRING_LENGTH]; + char name[AMDSMI_NORMAL_STRING_LENGTH]; amdsmi_process_handle_t pid; - uint64_t mem; /** in bytes */ + uint64_t mem; /** in bytes */ struct engine_usage_ { uint64_t gfx; uint64_t enc; diff --git a/projects/amdsmi/py-interface/__init__.py b/projects/amdsmi/py-interface/__init__.py index 119383c039..76b2921907 100644 --- a/projects/amdsmi/py-interface/__init__.py +++ b/projects/amdsmi/py-interface/__init__.py @@ -214,56 +214,6 @@ from .amdsmi_interface import amdsmi_set_gpu_memory_partition from .amdsmi_interface import amdsmi_reset_gpu_memory_partition # # Individual GPU Metrics Functions -from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hotspot -from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_mem -from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_vrsoc -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_socket_power -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_gfx_activity -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_umc_activity -from .amdsmi_interface import amdsmi_get_gpu_metrics_energy_acc -from .amdsmi_interface import amdsmi_get_gpu_metrics_system_clock_counter -from .amdsmi_interface import amdsmi_get_gpu_metrics_firmware_timestamp -from .amdsmi_interface import amdsmi_get_gpu_metrics_throttle_status -from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_link_width -from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_link_speed -from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_link_width -from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_link_speed -from .amdsmi_interface import amdsmi_get_gpu_metrics_gfxclk_lock_status -from .amdsmi_interface import amdsmi_get_gpu_metrics_gfx_activity_acc -from .amdsmi_interface import amdsmi_get_gpu_metrics_mem_activity_acc -from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_bandwidth_acc -from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_bandwidth_inst -from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc -from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_replay_count_acc -from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_uclk -from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hbm -from .amdsmi_interface import amdsmi_get_gpu_metrics_vcn_activity -from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_read_data -from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_write_data -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_gfxclk -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_socclk -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_vclk0 -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_dclk0 -from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_edge -from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_vrgfx -from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_vrmem -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_mm_activity -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_vclk1 -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_dclk1 -from .amdsmi_interface import amdsmi_get_gpu_metrics_indep_throttle_status -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_socket_power -from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_fan_speed -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_gfx_clock_frequency -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_soc_clock_frequency -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_uclock_frequency -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_vclock0_frequency -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_dclock0_frequency -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_vclock1_frequency -from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_dclock1_frequency -from .amdsmi_interface import amdsmi_get_gpu_metrics_volt_soc -from .amdsmi_interface import amdsmi_get_gpu_metrics_volt_gfx -from .amdsmi_interface import amdsmi_get_gpu_metrics_volt_mem from .amdsmi_interface import amdsmi_get_gpu_metrics_header_info # # Enums diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 9eb3a9722c..2002da306a 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -351,6 +351,7 @@ class AmdSmiUtilizationCounterType(IntEnum): UTILIZATION_COUNTER_FIRST = amdsmi_wrapper.AMDSMI_UTILIZATION_COUNTER_FIRST UTILIZATION_COUNTER_LAST = amdsmi_wrapper.AMDSMI_UTILIZATION_COUNTER_LAST + class AmdSmiProcessorType(IntEnum): UNKNOWN = amdsmi_wrapper.UNKNOWN AMD_GPU = amdsmi_wrapper.AMD_GPU @@ -358,12 +359,6 @@ class AmdSmiProcessorType(IntEnum): NON_AMD_GPU = amdsmi_wrapper.NON_AMD_GPU NON_AMD_CPU = amdsmi_wrapper.NON_AMD_CPU -class AmdSmiCacheTypeNames(IntEnum): - ENABLED = amdsmi_wrapper.CACHE_FLAGS_ENABLED - DATA_CACHE = amdsmi_wrapper.CACHE_FLAGS_DATA_CACHE - INST_CACHE = amdsmi_wrapper.CACHE_FLAGS_INST_CACHE - CPU_CACHE = amdsmi_wrapper.CACHE_FLAGS_CPU_CACHE - SIMD_CACHE = amdsmi_wrapper.CACHE_FLAGS_SIMD_CACHE class AmdSmiEventReader: def __init__( @@ -1624,37 +1619,32 @@ def amdsmi_get_gpu_cache_info( cache_info_dict = {} for cache_index in range(cache_info.num_cache_types): - cache_size = cache_info.cache[cache_index].cache_size_kb - cache_level = cache_info.cache[cache_index].cache_level - max_num_cu_shared = cache_info.cache[cache_index].max_num_cu_shared - num_cache_instance = cache_info.cache[cache_index].num_cache_instance - cache_flags = cache_info.cache[cache_index].flags - data_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_DATA_CACHE) - inst_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_INST_CACHE) - cpu_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_CPU_CACHE) - simd_cache = bool(cache_flags & amdsmi_wrapper.CACHE_FLAGS_SIMD_CACHE) - cache_flag_list = [] - if (data_cache): - cache_flag_list.append( - AmdSmiCacheTypeNames(amdsmi_wrapper.CACHE_FLAGS_DATA_CACHE).name) - if (inst_cache): - cache_flag_list.append( - AmdSmiCacheTypeNames(amdsmi_wrapper.CACHE_FLAGS_INST_CACHE).name) - if (cpu_cache): - cache_flag_list.append( - AmdSmiCacheTypeNames(amdsmi_wrapper.CACHE_FLAGS_CPU_CACHE).name) - if (simd_cache): - cache_flag_list.append( - AmdSmiCacheTypeNames(amdsmi_wrapper.CACHE_FLAGS_SIMD_CACHE).name) - cache_info_dict[f"cache {cache_index}"] = { - "cache_flags": cache_flag_list, - "cache_size": cache_size, - "cache_level": cache_level, - "max_num_cu_shared": max_num_cu_shared, - "num_cache_instance": num_cache_instance - } + cache_dict = { + "cache_properties": [], + "cache_size": cache_info.cache[cache_index].cache_size_kb, + "cache_level": cache_info.cache[cache_index].cache_level, + "max_num_cu_shared": cache_info.cache[cache_index].max_num_cu_shared, + "num_cache_instance": cache_info.cache[cache_index].num_cache_instance + } - if cache_info_dict == {}: + cache_flags = cache_info.cache[cache_index].flags + data_cache = cache_flags & amdsmi_wrapper.CACHE_FLAGS_DATA_CACHE + inst_cache = cache_flags & amdsmi_wrapper.CACHE_FLAGS_INST_CACHE + cpu_cache = cache_flags & amdsmi_wrapper.CACHE_FLAGS_CPU_CACHE + simd_cache = cache_flags & amdsmi_wrapper.CACHE_FLAGS_SIMD_CACHE + + cache_flags_status = [data_cache, inst_cache, cpu_cache, simd_cache] + cache_flag_list = [] + for cache_flag in cache_flags_status: + if cache_flag: + flag_name = amdsmi_wrapper.amdsmi_cache_flags_type_t__enumvalues[cache_flag] + flag_name = flag_name.replace("CACHE_FLAGS_", "") + cache_flag_list.append(flag_name) + + cache_dict["cache_properties"] = cache_flag_list + cache_info_dict[f"cache {cache_index}"] = cache_dict + + if not cache_info_dict: raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NO_DATA) return cache_info_dict @@ -3095,16 +3085,16 @@ def amdsmi_get_utilization_count( raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_API_FAILED) result = [{"timestamp": timestamp.value}] - for idx in range(count.value): + for index in range(count.value): counter_type = amdsmi_wrapper.amdsmi_utilization_counter_type_t__enumvalues[ - util_counter_list[idx].type + util_counter_list[index].type ] if counter_type == "AMDSMI_UTILIZATION_COUNTER_FIRST": counter_type = "AMDSMI_COARSE_GRAIN_GPU_ACTIVITY" if counter_type == "AMDSMI_UTILIZATION_COUNTER_LAST": counter_type = "AMDSMI_COARSE_GRAIN_MEM_ACTIVITY" result.append( - {"type": counter_type, "value": util_counter_list[idx].value}) + {"type": counter_type, "value": util_counter_list[index].value}) return result @@ -3302,7 +3292,7 @@ def amdsmi_get_gpu_metrics_info( } # Validate support for each gpu_metric - uint_16_values = ['temperature_edge', 'temperature_hotspot', 'temperature_mem', + uint_16_metrics = ['temperature_edge', 'temperature_hotspot', 'temperature_mem', 'temperature_vrgfx', 'temperature_vrsoc', 'temperature_vrmem', 'average_gfx_activity', 'average_umc_activity', 'average_mm_activity', 'average_socket_power', 'average_gfxclk_frequency', 'average_socclk_frequency', @@ -3312,78 +3302,60 @@ def amdsmi_get_gpu_metrics_info( 'current_vclk1', 'current_dclk1', 'current_fan_speed', 'pcie_link_width', 'pcie_link_speed', 'voltage_soc', 'voltage_gfx', 'voltage_mem', 'current_socket_power', 'xgmi_link_width', 'xgmi_link_speed'] + for metric in uint_16_metrics: + if gpu_metrics_output[metric] == 0xFFFF: + gpu_metrics_output[metric] = "N/A" - for value in uint_16_values: - if gpu_metrics_output[value] == 0xFFFF: - gpu_metrics_output[value] = "N/A" + uint_32_metrics = ['gfx_activity_acc','mem_activity_acc', 'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc'] + for metric in uint_32_metrics: + if gpu_metrics_output[metric] == 0xFFFFFFFF: + gpu_metrics_output[metric] = "N/A" - uint_32_values = ['gfx_activity_acc','mem_activity_acc', 'pcie_nak_sent_count_acc', 'pcie_nak_rcvd_count_acc'] - - for value in uint_32_values: - if gpu_metrics_output[value] == 0xFFFFFFFF: - gpu_metrics_output[value] = "N/A" - - uint_64_values = ['energy_accumulator', 'system_clock_counter', 'firmware_timestamp', + uint_64_metrics = ['energy_accumulator', 'system_clock_counter', 'firmware_timestamp', 'pcie_bandwidth_acc', 'pcie_bandwidth_inst', 'pcie_l0_to_recov_count_acc', 'pcie_replay_count_acc', 'pcie_replay_rover_count_acc'] + for metric in uint_64_metrics: + if gpu_metrics_output[metric] == 0xFFFFFFFFFFFFFFFF: + gpu_metrics_output[metric] = "N/A" - for value in uint_64_values: - if gpu_metrics_output[value] == 0xFFFFFFFFFFFFFFFF: - gpu_metrics_output[value] = "N/A" + # Custom validation for metrics in a bool format + uint_32_bool_metrics = ['throttle_status', 'gfxclk_lock_status'] + for metric in uint_32_bool_metrics: + if gpu_metrics_output[metric] == 0xFFFFFFFF: + gpu_metrics_output[metric] = "N/A" + else: + gpu_metrics_output[metric] = bool(gpu_metrics_output[metric]) + + # Custom validation for metrics in a list format + uint_16_clock_list_metrics = ['current_gfxclks', 'current_socclks', 'current_vclk0s', 'current_dclk0s'] + for clock in uint_16_clock_list_metrics: + for index, clk in enumerate(gpu_metrics_output[clock]): + if clk == 0xFFFF: + gpu_metrics_output[clock][index] = "N/A" + + uint_16_activity_list_metrics = ['vcn_activity', 'jpeg_activity'] + for activity_metric in uint_16_activity_list_metrics: + for index, activity in enumerate(gpu_metrics_output[activity_metric]): + if activity == 0xFFFF or activity > 110: + gpu_metrics_output[activity_metric][index] = "N/A" + + uint_64_xgmi_metrics = ['xgmi_read_data_acc', 'xgmi_write_data_acc'] + for metric in uint_64_xgmi_metrics: + for index, data in enumerate(gpu_metrics_output[metric]): + if data == 0xFFFFFFFFFFFFFFFF: + gpu_metrics_output[metric][index] = "N/A" # Custom validation for specific gpu_metrics - if gpu_metrics_output['throttle_status'] == 0xFFFFFFFF: - gpu_metrics_output['throttle_status'] = "N/A" - else: - gpu_metrics_output['throttle_status'] = bool(gpu_metrics_output['throttle_status']) - - for idx, temp in enumerate(gpu_metrics_output['temperature_hbm']): + for index, temp in enumerate(gpu_metrics_output['temperature_hbm']): if temp == 0xFFFF: - gpu_metrics_output['temperature_hbm'][idx] = "N/A" + gpu_metrics_output['temperature_hbm'][index] = "N/A" if gpu_metrics_output['indep_throttle_status'] == 0xFFFFFFFFFFFFFFFF: gpu_metrics_output['indep_throttle_status'] = "N/A" else: gpu_metrics_output['indep_throttle_status'] = bool(gpu_metrics_output['indep_throttle_status']) - for idx, activity in enumerate(gpu_metrics_output['vcn_activity']): - if activity == 0xFFFF or activity > 100: - gpu_metrics_output['vcn_activity'][idx] = "N/A" - - if gpu_metrics_output['gfxclk_lock_status'] == 0xFFFFFFFF: - gpu_metrics_output['gfxclk_lock_status'] = "N/A" - else: - gpu_metrics_output['gfxclk_lock_status'] = bool(gpu_metrics_output['gfxclk_lock_status']) - - for idx, data in enumerate(gpu_metrics_output['xgmi_read_data_acc']): - if data == 0xFFFFFFFFFFFFFFFF: - gpu_metrics_output['xgmi_read_data_acc'][idx] = "N/A" - - for idx, data in enumerate(gpu_metrics_output['xgmi_write_data_acc']): - if data == 0xFFFFFFFFFFFFFFFF: - gpu_metrics_output['xgmi_write_data_acc'][idx] = "N/A" - - for idx, clk in enumerate(gpu_metrics_output['current_gfxclks']): - if clk == 0xFFFF: - gpu_metrics_output['current_gfxclks'][idx] = "N/A" - - for idx, clk in enumerate(gpu_metrics_output['current_socclks']): - if clk == 0xFFFF: - gpu_metrics_output['current_socclks'][idx] = "N/A" - - for idx, clk in enumerate(gpu_metrics_output['current_vclk0s']): - if clk == 0xFFFF: - gpu_metrics_output['current_vclk0s'][idx] = "N/A" - - for idx, clk in enumerate(gpu_metrics_output['current_dclk0s']): - if clk == 0xFFFF: - gpu_metrics_output['current_dclk0s'][idx] = "N/A" - - for idx, activity in enumerate(gpu_metrics_output['jpeg_activity']): - if activity == 0xFFFF or activity > 100: - gpu_metrics_output['jpeg_activity'][idx] = "N/A" - return gpu_metrics_output @@ -3407,17 +3379,17 @@ def amdsmi_get_gpu_od_volt_curve_regions( result = [] - for idx in range(region_count.value): + for index in range(region_count.value): result.extend( [ { "freq_range": { - "lower_bound": buffer[idx].freq_range.lower_bound, - "upper_bound": buffer[idx].freq_range.upper_bound, + "lower_bound": buffer[index].freq_range.lower_bound, + "upper_bound": buffer[index].freq_range.upper_bound, }, "volt_range": { - "lower_bound": buffer[idx].volt_range.lower_bound, - "upper_bound": buffer[idx].volt_range.upper_bound, + "lower_bound": buffer[index].volt_range.lower_bound, + "upper_bound": buffer[index].volt_range.upper_bound, }, } ] @@ -3645,1059 +3617,6 @@ def amdsmi_get_gpu_memory_reserved_pages( return table_records -### Individual GPU Metrics Functions -def amdsmi_get_gpu_metrics_temp_hotspot( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - hotspot_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_hotspot( - processor_handle, ctypes.byref(hotspot_value) - ) - ) - - if hotspot_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return hotspot_value.value - - -def amdsmi_get_gpu_metrics_temp_mem( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - mem_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_mem( - processor_handle, ctypes.byref(mem_value) - ) - ) - - if mem_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return mem_value.value - - -def amdsmi_get_gpu_metrics_temp_vrsoc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - vrsoc_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrsoc( - processor_handle, ctypes.byref(vrsoc_value) - ) - ) - - if vrsoc_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return vrsoc_value.value - - -def amdsmi_get_gpu_metrics_curr_socket_power( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - socket_power_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_socket_power( - processor_handle, ctypes.byref(socket_power_value) - ) - ) - - if socket_power_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return socket_power_value.value - - -def amdsmi_get_gpu_metrics_avg_gfx_activity( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - gfx_activity_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_gfx_activity( - processor_handle, ctypes.byref(gfx_activity_value) - ) - ) - - if gfx_activity_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return gfx_activity_value.value - - -def amdsmi_get_gpu_metrics_avg_umc_activity( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - umc_activity_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_umc_activity( - processor_handle, ctypes.byref(umc_activity_value) - ) - ) - - if umc_activity_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return umc_activity_value.value - - -def amdsmi_get_gpu_metrics_energy_acc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - energy_acc_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_energy_acc( - processor_handle, ctypes.byref(energy_acc_value) - ) - ) - - if energy_acc_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return energy_acc_value.value - - -def amdsmi_get_gpu_metrics_system_clock_counter( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - system_clock_counter_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_system_clock_counter( - processor_handle, ctypes.byref(system_clock_counter_value) - ) - ) - - if system_clock_counter_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return system_clock_counter_value.value - - -def amdsmi_get_gpu_metrics_firmware_timestamp( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - firmware_timestamp_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_firmware_timestamp( - processor_handle, ctypes.byref(firmware_timestamp_value) - ) - ) - - if firmware_timestamp_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return firmware_timestamp_value.value - - -def amdsmi_get_gpu_metrics_throttle_status( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> bool: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - throttle_status_value = ctypes.c_uint32() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_throttle_status( - processor_handle, ctypes.byref(throttle_status_value) - ) - ) - - if throttle_status_value.value == 0xFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return bool(throttle_status_value.value) - - -def amdsmi_get_gpu_metrics_pcie_link_width( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - pcie_link_width_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_pcie_link_width( - processor_handle, ctypes.byref(pcie_link_width_value) - ) - ) - - if pcie_link_width_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return pcie_link_width_value.value - - -def amdsmi_get_gpu_metrics_pcie_link_speed( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - pcie_link_speed_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_pcie_link_speed( - processor_handle, ctypes.byref(pcie_link_speed_value) - ) - ) - - if pcie_link_speed_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return pcie_link_speed_value.value - - -def amdsmi_get_gpu_metrics_xgmi_link_width( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - xgmi_link_width_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_xgmi_link_width( - processor_handle, ctypes.byref(xgmi_link_width_value) - ) - ) - - if xgmi_link_width_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return xgmi_link_width_value.value - - -def amdsmi_get_gpu_metrics_xgmi_link_speed( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - xgmi_link_speed_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_xgmi_link_speed( - processor_handle, ctypes.byref(xgmi_link_speed_value) - ) - ) - - if xgmi_link_speed_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return xgmi_link_speed_value.value - - -def amdsmi_get_gpu_metrics_gfxclk_lock_status( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> bool: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - gfxclk_lock_status_value = ctypes.c_uint32() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_gfxclk_lock_status( - processor_handle, ctypes.byref(gfxclk_lock_status_value) - ) - ) - - if gfxclk_lock_status_value.value == 0xFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return bool(gfxclk_lock_status_value.value) - - -def amdsmi_get_gpu_metrics_gfx_activity_acc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - gfx_activity_acc_value = ctypes.c_uint32() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_gfx_activity_acc( - processor_handle, ctypes.byref(gfx_activity_acc_value) - ) - ) - - if gfx_activity_acc_value.value == 0xFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return gfx_activity_acc_value.value - - -def amdsmi_get_gpu_metrics_mem_activity_acc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - mem_activity_acc_value = ctypes.c_uint32() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_mem_activity_acc( - processor_handle, ctypes.byref(mem_activity_acc_value) - ) - ) - - if mem_activity_acc_value.value == 0xFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return mem_activity_acc_value.value - - -def amdsmi_get_gpu_metrics_pcie_bandwidth_acc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - pcie_bandwidth_acc_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_pcie_bandwidth_acc( - processor_handle, ctypes.byref(pcie_bandwidth_acc_value) - ) - ) - - if pcie_bandwidth_acc_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return pcie_bandwidth_acc_value.value - - -def amdsmi_get_gpu_metrics_pcie_bandwidth_inst( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - pcie_bandwidth_inst_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_pcie_bandwidth_inst( - processor_handle, ctypes.byref(pcie_bandwidth_inst_value) - ) - ) - - if pcie_bandwidth_inst_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return pcie_bandwidth_inst_value.value - - -def amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - pcie_count_acc_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc( - processor_handle, ctypes.byref(pcie_count_acc_value) - ) - ) - - if pcie_count_acc_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return pcie_count_acc_value.value - - -def amdsmi_get_gpu_metrics_pcie_replay_count_acc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - pcie_count_acc_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_pcie_replay_count_acc( - processor_handle, ctypes.byref(pcie_count_acc_value) - ) - ) - - if pcie_count_acc_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return pcie_count_acc_value.value - - -def amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - pcie_count_acc_value = ctypes.c_uint64() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc( - processor_handle, ctypes.byref(pcie_count_acc_value) - ) - ) - - if pcie_count_acc_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return pcie_count_acc_value.value - - -def amdsmi_get_gpu_metrics_curr_uclk( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - uclk_value = ctypes.c_uint16() - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_uclk( - processor_handle, ctypes.byref(uclk_value) - ) - ) - - if uclk_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return uclk_value.value - - -def amdsmi_get_gpu_metrics_temp_hbm( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - temp_hbm_value = (ctypes.c_uint16 * AMDSMI_NUM_HBM_INSTANCES)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_hbm( - processor_handle, temp_hbm_value - ) - ) - - return [temp_hbm.value for temp_hbm in temp_hbm_value] - - -def amdsmi_get_gpu_metrics_vcn_activity( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - vcn_activity_value = (ctypes.c_uint16 * AMDSMI_MAX_NUM_VCN)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_vcn_activity( - processor_handle, vcn_activity_value - ) - ) - - return vcn_activity_value - - -def amdsmi_get_gpu_metrics_xgmi_read_data( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - xgmi_write_data_acc_value = (ctypes.c_uint64 * AMDSMI_MAX_NUM_XGMI_LINKS)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_xgmi_read_data( - processor_handle, xgmi_write_data_acc_value - ) - ) - - return [xgmi_read_data_acc.value for xgmi_read_data_acc in xgmi_write_data_acc_value] - - -def amdsmi_get_gpu_metrics_xgmi_write_data( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - xgmi_write_data_acc_value = (ctypes.c_uint64 * AMDSMI_MAX_NUM_XGMI_LINKS)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_xgmi_write_data( - processor_handle, xgmi_write_data_acc_value - ) - ) - - return [xgmi_write_data_acc.value for xgmi_write_data_acc in xgmi_write_data_acc_value] - - -def amdsmi_get_gpu_metrics_curr_gfxclk( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - current_gfxclk_value = (ctypes.c_uint16 * AMDSMI_MAX_NUM_GFX_CLKS)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_gfxclk( - processor_handle, current_gfxclk_value - ) - ) - - print([curr_gfxclk for curr_gfxclk in current_gfxclk_value]) - - return [curr_gfxclk for curr_gfxclk in current_gfxclk_value] - - -def amdsmi_get_gpu_metrics_curr_socclk( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - current_socclk_value = (ctypes.c_uint16 * AMDSMI_MAX_NUM_CLKS)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_socclk( - processor_handle, current_socclk_value - ) - ) - - return [curr_socclk.value for curr_socclk in current_socclk_value] - - -def amdsmi_get_gpu_metrics_curr_vclk0( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - current_vclk_value = (ctypes.c_uint16 * AMDSMI_MAX_NUM_CLKS)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_vclk0( - processor_handle, current_vclk_value - ) - ) - - return [curr_vclk0.value for curr_vclk0 in current_vclk_value] - - -def amdsmi_get_gpu_metrics_curr_dclk0( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> List[int]: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - current_dclk_value = (ctypes.c_uint16 * AMDSMI_MAX_NUM_CLKS)() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_dclk0( - processor_handle, current_dclk_value - ) - ) - - return [curr_dclk0.value for curr_dclk0 in current_dclk_value] - - -def amdsmi_get_gpu_metrics_temp_edge( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - edge_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_edge( - processor_handle, ctypes.byref(edge_value) - ) - ) - - if edge_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return edge_value.value - - -def amdsmi_get_gpu_metrics_temp_vrgfx( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - vrgfx_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrgfx( - processor_handle, ctypes.byref(vrgfx_value) - ) - ) - - if vrgfx_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return vrgfx_value.value - - -def amdsmi_get_gpu_metrics_temp_vrmem( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - vrmem_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_temp_vrmem( - processor_handle, ctypes.byref(vrmem_value) - ) - ) - - if vrmem_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return vrmem_value.value - - -def amdsmi_get_gpu_metrics_avg_mm_activity( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - mm_activity_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_mm_activity( - processor_handle, ctypes.byref(mm_activity_value) - ) - ) - - if mm_activity_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return mm_activity_value.value - - -def amdsmi_get_gpu_metrics_curr_vclk1( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - current_vclk_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_vclk1( - processor_handle, ctypes.byref(current_vclk_value) - ) - ) - - if current_vclk_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return current_vclk_value.value - - -def amdsmi_get_gpu_metrics_curr_dclk1( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - current_dclk_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_dclk1( - processor_handle, ctypes.byref(current_dclk_value) - ) - ) - - if current_dclk_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return current_dclk_value.value - - -def amdsmi_get_gpu_metrics_indep_throttle_status( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> bool: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - throttle_status_value = ctypes.c_uint64() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_indep_throttle_status( - processor_handle, ctypes.byref(throttle_status_value) - ) - ) - - if throttle_status_value.value == 0xFFFFFFFFFFFFFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return bool(throttle_status_value.value) - - -def amdsmi_get_gpu_metrics_avg_socket_power( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - socket_power_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_socket_power( - processor_handle, ctypes.byref(socket_power_value) - ) - ) - - if socket_power_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return socket_power_value.value - - -def amdsmi_get_gpu_metrics_curr_fan_speed( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - fan_speed_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_curr_fan_speed( - processor_handle, ctypes.byref(fan_speed_value) - ) - ) - - if fan_speed_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return fan_speed_value.value - - -def amdsmi_get_gpu_metrics_avg_gfx_clock_frequency( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - clock_frequency_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_gfx_clock_frequency( - processor_handle, ctypes.byref(clock_frequency_value) - ) - ) - - if clock_frequency_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return clock_frequency_value.value - - -def amdsmi_get_gpu_metrics_avg_soc_clock_frequency( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - clock_frequency_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_soc_clock_frequency( - processor_handle, ctypes.byref(clock_frequency_value) - ) - ) - - if clock_frequency_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return clock_frequency_value.value - - -def amdsmi_get_gpu_metrics_avg_uclock_frequency( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - clock_frequency_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_uclock_frequency( - processor_handle, ctypes.byref(clock_frequency_value) - ) - ) - - if clock_frequency_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return clock_frequency_value.value - - -def amdsmi_get_gpu_metrics_avg_vclock0_frequency( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - clock_frequency_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_vclock0_frequency( - processor_handle, ctypes.byref(clock_frequency_value) - ) - ) - - if clock_frequency_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return clock_frequency_value.value - - -def amdsmi_get_gpu_metrics_avg_dclock0_frequency( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - clock_frequency_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_dclock0_frequency( - processor_handle, ctypes.byref(clock_frequency_value) - ) - ) - - if clock_frequency_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return clock_frequency_value.value - - -def amdsmi_get_gpu_metrics_avg_vclock1_frequency( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - clock_frequency_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_vclock1_frequency( - processor_handle, ctypes.byref(clock_frequency_value) - ) - ) - - if clock_frequency_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return clock_frequency_value.value - - -def amdsmi_get_gpu_metrics_avg_dclock1_frequency( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - clock_frequency_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_avg_dclock1_frequency( - processor_handle, ctypes.byref(clock_frequency_value) - ) - ) - - if clock_frequency_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return clock_frequency_value.value - - -def amdsmi_get_gpu_metrics_volt_soc( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - voltage_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_volt_soc( - processor_handle, ctypes.byref(voltage_value) - ) - ) - - if voltage_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return voltage_value.value - - -def amdsmi_get_gpu_metrics_volt_gfx( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - voltage_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_volt_gfx( - processor_handle, ctypes.byref(voltage_value) - ) - ) - - return [volt_gfx.value for volt_gfx in voltage_value] - - -def amdsmi_get_gpu_metrics_volt_mem( - processor_handle: amdsmi_wrapper.amdsmi_processor_handle, -) -> int: - if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): - raise AmdSmiParameterException( - processor_handle, amdsmi_wrapper.amdsmi_processor_handle - ) - - voltage_value = ctypes.c_uint16() - - _check_res( - amdsmi_wrapper.amdsmi_get_gpu_metrics_volt_mem( - processor_handle, ctypes.byref(voltage_value) - ) - ) - - if voltage_value.value == 0xFFFF: - raise AmdSmiLibraryException(amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED) - - return voltage_value.value - - def amdsmi_get_gpu_metrics_header_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, int]: diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 532e485da8..b940ec9113 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1114,45 +1114,8 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info( amdsmi_gpu_metrics_t *pgpu_metrics) { AMDSMI_CHECK_INIT(); // nullptr api supported - amdsmi_status_t ret = - rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, + return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, reinterpret_cast(pgpu_metrics)); - if (ret != AMDSMI_STATUS_SUCCESS) { - return ret; - } - // WARNING: TEMPORARY - awaiting 1.5 update from amdgpu driver/firmware - // intended to be removed later - // START: REMOVE WHATS BELOW ME - uint8_t content_ver = pgpu_metrics->common_header.content_revision; - int8_t format_ver = pgpu_metrics->common_header.format_revision; - const uint8_t expected_format_ver = 1; - const uint8_t expected_content_ver = 4; - if (ret == AMDSMI_STATUS_SUCCESS && - (format_ver == expected_format_ver && - content_ver <= expected_content_ver)) { - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << " | SET JPEG_ACTIVITY to MAX_UINT16, " - << "detected content version: " << std::dec << +content_ver - << "; format version: " << std::dec << +format_ver - << "; awaiting 1.5 metrics remove once released"; - LOG_ALWAYS(ss); - std::fill_n(&pgpu_metrics->jpeg_activity[0], - (sizeof(pgpu_metrics->jpeg_activity) / - sizeof(pgpu_metrics->jpeg_activity[0])), - std::numeric_limits::max()); - pgpu_metrics->pcie_nak_sent_count_acc = - static_cast(std::numeric_limits::max()); - pgpu_metrics->pcie_nak_rcvd_count_acc = - static_cast(std::numeric_limits::max()); - } - std::ostringstream ss; - const char *status_string; - amdsmi_status_code_to_string(ret, &status_string); - ss << __PRETTY_FUNCTION__ - << " | END, returning status = " << status_string; - LOG_TRACE(ss); - // END: REMOVE WHATS ABOVE ME - return ret; }