diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 8556361c62..5f378c5fdf 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -4156,14 +4156,35 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'perfdeterminism', f"Successfully enabled performance determinism and set GFX clock frequency to {args.perf_determinism}") if args.compute_partition: - compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] try: - amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition) + (accelerator_set_choices, accelerator_profiles) = self.helpers.get_accelerator_choices_types_indices() + logging.debug("args.compute_partition: %s; Accelerator_set_choices: %s", str(args.compute_partition), str(json.dumps(accelerator_set_choices, indent=4))) + if args.compute_partition in accelerator_profiles['profile_types']: + compute_partition = amdsmi_interface.AmdSmiComputePartitionType[args.compute_partition] + index = accelerator_profiles['profile_types'].index(args.compute_partition) + attempted_to_set = f"Attempted to set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]} on {gpu_string}" + amdsmi_interface.amdsmi_set_gpu_compute_partition(args.gpu, compute_partition) + self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {args.compute_partition} (profile #{accelerator_profiles['profile_indices'][int(index)]})") + elif args.compute_partition in accelerator_profiles['profile_indices']: + compute_partition = int(args.compute_partition) + index = accelerator_profiles['profile_indices'].index(args.compute_partition) + attempted_to_set = f"Attempted to set accelerator partition to {accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition}) on {gpu_string}" + 
amdsmi_interface.amdsmi_set_gpu_accelerator_partition_profile(args.gpu, compute_partition) + self.logger.store_output(args.gpu, 'accelerator_partition', f"Successfully set accelerator partition to {accelerator_profiles['profile_types'][int(index)]} (profile #{args.compute_partition})") + else: + raise ValueError(f"Invalid accelerator configuration {args.compute_partition} on {gpu_string}") + except amdsmi_exception.AmdSmiLibraryException as e: if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: raise PermissionError('Command requires elevation') from e - raise ValueError(f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'computepartition', f"Successfully set compute partition to {args.compute_partition}") + elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_SETTING_UNAVAILABLE: + print(f"\n{attempted_to_set}\n" + f"\n[AMDSMI_STATUS_SETTING_UNAVAILABLE] Please check amd-smi partition --memory --accelerator for available profiles.\n" + "Users may need to switch memory partition to another mode in order to enable the desired accelerator partition.\n") + raise ValueError(f"[AMDSMI_STATUS_SETTING_UNAVAILABLE] Unable to set accelerator partition to {args.compute_partition} on {gpu_string}") from e + else: + raise ValueError(f"Unable to set accelerator partition to {args.compute_partition} on {gpu_string}") from e + if args.memory_partition: lock = multiprocessing.Lock() lock.acquire() @@ -4172,49 +4193,18 @@ class AMDSMICommands(): # Info used if AMDSMI_STATUS_INVAL is caught & to set progress bar # #################################################################### try: - memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu) # this info likely actually comes from different apis than used here + memory_dict = {'caps': "N/A", 'current': "N/A"} + memory_partition_config = 
amdsmi_interface.amdsmi_get_gpu_memory_partition_config(args.gpu) + memory_dict['caps'] = str(memory_partition_config['partition_caps']).replace("]", "").replace("[", "").replace("\'", "").replace(" ", "") + memory_dict['current'] = memory_partition_config['mp_mode'] except amdsmi_exception.AmdSmiLibraryException as e: - memory_partition = "N/A" logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info()) - try: - mem_caps_str = "N/A" - partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu) - temp_mem_caps = partition_dict['partition_profile']['memory_caps'] - mem_caps = temp_mem_caps.nps_cap_mask - if temp_mem_caps.amdsmi_nps_flags_t == None: - mem_caps_list = [] - if mem_caps & 1 == 1: - mem_caps_list.append("NPS1") - if mem_caps & 2 == 2: - mem_caps_list.append("NPS2") - if mem_caps & 4 == 4: - mem_caps_list.append("NPS4") - if mem_caps & 8 == 8: - mem_caps_list.append("NPS8") - mem_caps_str = str(mem_caps_list).replace("]", "").replace("[", "") - else: - mem_caps = temp_mem_caps.amdsmi_nps_flags_t - mem_caps_list = [] - if mem_caps.nps1_cap == 1: - mem_caps_list.append("NPS1") - if mem_caps.nps2_cap == 1: - mem_caps_list.append("NPS2") - if mem_caps.nps4_cap == 1: - mem_caps_list.append("NPS4") - if mem_caps.nps8_cap == 1: - mem_caps_list.append("NPS8") - mem_caps_str = str(mem_caps_list).replace("]", "").replace("[", "").replace("\'", "") - if mem_caps_str == "": - mem_caps_str = "N/A" - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info()) - memory_dict = {'caps': mem_caps_str, 'current': memory_partition} ############################################################### # memory partition set starts here # ############################################################### showProgressBar = False - if ((str(memory_dict['current']) != "N/A") and (str(args.memory_partition) in mem_caps_str) 
+ if ((str(memory_dict['current']) != "N/A") and (str(args.memory_partition) in memory_dict['caps']) and ((str(memory_dict['current']) != str(args.memory_partition)))): showProgressBar = True # Only show progress bar if # 1) Device can set memory partition modes @@ -4259,7 +4249,7 @@ class AMDSMICommands(): raise PermissionError('Command requires elevation') from e if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL: out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}" - print(f"Valid Memory partition Modes: {mem_caps_str}\n") + print(f"Valid Memory partition Modes: {memory_dict['caps']}\n") self.logger.store_output(args.gpu, 'memory_partition', out) self.logger.print_output() self.logger.clear_multiple_devices_ouput() @@ -5711,15 +5701,21 @@ class AMDSMICommands(): if accelerator: args.accelerator = accelerator + ########################################### + # amd-smi partition (no args) # + ########################################### # if no args are present, then everything should be displayed if not args.current and not args.memory and not args.accelerator: args.current = True args.memory = True args.accelerator = True + ########################################### + # amd-smi partition --current # + ########################################### if args.current: self.logger.table_header = ''.rjust(7) - current_header = "GPU_ID".ljust(13) + \ + current_header = "GPU_ID".ljust(8) + \ "MEMORY".ljust(8) + \ "ACCELERATOR_TYPE".ljust(18) + \ "ACCELERATOR_PROFILE_INDEX".ljust(27) + \ @@ -5733,11 +5729,11 @@ class AMDSMICommands(): partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu) profile_type = partition_dict['partition_profile']['profile_type'] profile_index = partition_dict['partition_profile']['profile_index'] - partition_id = partition_dict['partition_id'] + partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "") 
except amdsmi_exception.AmdSmiLibraryException as e: profile_type = "N/A" profile_index = "N/A" - partition_id = "N/A" + partition_id = "0" logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info()) try: current_mem_cap = amdsmi_interface.amdsmi_get_gpu_memory_partition(gpu) @@ -5756,65 +5752,52 @@ class AMDSMICommands(): tabular_output.append(tabular_output_dict) self.logger.multiple_device_output = tabular_output - self.logger.table_title = "CURRENT_PARTITION" - self.logger.print_output(multiple_device_enabled=True, tabular=True) + self.logger.table_title = "\nCURRENT_PARTITION" + self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True) self.logger.clear_multiple_devices_ouput() + ########################################### + # amd-smi partition --memory # + ########################################### if args.memory: + tabular_output = [] + self.logger.table_header = ''.rjust(7) + current_header = "GPU_ID".ljust(8) + \ + "MEMORY_PARTITION_CAPS".ljust(23) + \ + "CURRENT_MEMORY_PARTITION".ljust(26) + self.logger.table_header = current_header + self.logger.table_header.strip() + for gpu in args.gpu: gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) + mem_caps_str = "N/A" + current_memory_partition = "N/A" try: - memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(gpu) # this info likely actually comes from different apis than used here + memory_partition_config = amdsmi_interface.amdsmi_get_gpu_memory_partition_config(gpu) + mem_caps_str = str(memory_partition_config['partition_caps']).replace("]", "").replace("[", "").replace("\'", "").replace(" ", "") + current_memory_partition = memory_partition_config['mp_mode'] except amdsmi_exception.AmdSmiLibraryException as e: - memory_partition = "N/A" logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info()) - try: - partition_dict = 
amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu) - temp_mem_caps = partition_dict['partition_profile']['memory_caps'] - if temp_mem_caps.amdsmi_nps_flags_t == None: - mem_caps = temp_mem_caps.nps_cap_mask - mem_caps_list = [] - if mem_caps & 1 == 1: - mem_caps_list.append("NPS1") - if mem_caps & 2 == 2: - mem_caps_list.append("NPS2") - if mem_caps & 4 == 4: - mem_caps_list.append("NPS4") - if mem_caps & 8 == 8: - mem_caps_list.append("NPS8") - mem_caps_str = str(mem_caps_list).replace("]", "").replace("[", "") - else: - mem_caps = temp_mem_caps.amdsmi_nps_flags_t - mem_caps_list = [] - if mem_caps.nps1_cap == 1: - mem_caps_list.append("NPS1") - if mem_caps.nps2_cap == 1: - mem_caps_list.append("NPS2") - if mem_caps.nps4_cap == 1: - mem_caps_list.append("NPS4") - if mem_caps.nps8_cap == 1: - mem_caps_list.append("NPS8") - mem_caps_str = str(mem_caps_list).replace("]", "").replace("[", "").replace("\'", "") - if mem_caps_str == "": - mem_caps_str = "N/A" - except amdsmi_exception.AmdSmiLibraryException as e: - mem_caps_str = "N/A" - logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info()) + tabular_output_dict = {"gpu_id": gpu_id, + "memory_partition_caps": mem_caps_str, + "current_memory_partition": current_memory_partition} + tabular_output.append(tabular_output_dict) - memory_dict = {'caps': mem_caps_str, 'current': memory_partition} - self.logger.store_output(gpu, 'memory_partition', memory_dict) - self.logger.store_multiple_device_output() - self.logger.print_output(multiple_device_enabled=True) + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "\nMEMORY_PARTITION" + self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True) self.logger.clear_multiple_devices_ouput() + ########################################### + # amd-smi partition --accelerator # + ########################################### if args.accelerator: self.logger.table_header = 
''.rjust(7) - current_header = "GPU_ID".ljust(13) + \ + current_header = "GPU_ID".ljust(8) + \ "PROFILE_INDEX".ljust(15) + \ "MEMORY_PARTITION_CAPS".ljust(23) + \ "ACCELERATOR_TYPE".ljust(18) + \ - "PARTITION_ID".ljust(14) + \ + "PARTITION_ID".ljust(17) + \ "NUM_PARTITIONS".ljust(16) + \ "NUM_RESOURCES".ljust(15) + \ "RESOURCE_INDEX".ljust(16) + \ @@ -5824,74 +5807,184 @@ class AMDSMICommands(): self.logger.table_header = current_header + self.logger.table_header.strip() tabular_output = [] + prev_gpu_id = "N/A" for gpu in args.gpu: gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) + tabular_output_dict = {"gpu_id": "N/A", + "profile_index": "N/A", + "memory_partition_caps": "N/A", + "accelerator_type": "N/A", + "partition_id": "0", + "num_partitions": "N/A", + "num_resources": "N/A", + "resource_index": "N/A", + "resource_type": "N/A", + "resource_instances": "N/A", + "resources_shared": "N/A"} try: partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu) - profile_type = partition_dict['partition_profile']['profile_type'] - profile_index = partition_dict['partition_profile']['profile_index'] - temp_mem_caps = partition_dict['partition_profile']['memory_caps'] - parition_id = partition_dict['partition_id'] - num_resources = partition_dict['partition_profile']['num_resources'] - resources = partition_dict['partition_profile']['resources'] + partition_id = str(partition_dict['partition_id']).replace("[", "").replace("]", "").replace(" ", "") + current_accelerator_type = partition_dict['partition_profile']['profile_type'] + + # save only the primary GPU node's partition_id (the 1st listed device; non N/A one) + # else keep current_partition_id unchanged for displaying in accelerator resource's output + if partition_id != "N/A": + current_partition_id = partition_id - if temp_mem_caps.amdsmi_nps_flags_t == None: - mem_caps = temp_mem_caps.nps_cap_mask - mem_caps_list = [] - if mem_caps & 1 == 1: - mem_caps_list.append("NPS1") - if 
mem_caps & 2 == 2: - mem_caps_list.append("NPS2") - if mem_caps & 4 == 4: - mem_caps_list.append("NPS4") - if mem_caps & 8 == 8: - mem_caps_list.append("NPS8") - mem_caps_str = str(mem_caps_list).replace("]", "").replace("[", "").replace("\'", "") - else: - mem_caps = temp_mem_caps.amdsmi_nps_flags_t - mem_caps_list = [] - if mem_caps.nps1_cap == 1: - mem_caps_list.append("NPS1") - if mem_caps.nps2_cap == 1: - mem_caps_list.append("NPS2") - if mem_caps.nps4_cap == 1: - mem_caps_list.append("NPS4") - if mem_caps.nps8_cap == 1: - mem_caps_list.append("NPS8") - mem_caps_str = str(mem_caps_list).replace("]", "").replace("[", "").replace("\'", "") - if mem_caps_str == "": - mem_caps_str = "N/A" except amdsmi_exception.AmdSmiLibraryException as e: profile_type = "N/A" profile_index = "N/A" - temp_mem_caps = "N/A" - parition_id = "N/A" - num_resources = "N/A" - resources = "N/A" + partition_id = "0" mem_caps_str = "N/A" + num_partitions = 0 + current_accelerator_type = "N/A" logging.debug("Failed to get accelerator partition profile for GPU %s | %s", gpu_id, e.get_error_info()) - if profile_type == 0: - profile_type = "N/A" + try: + partition_config_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile_config(gpu) + logging.debug("amdsmi_commands.py | partition_config_dict: " + str(json.dumps(partition_config_dict, indent=4))) + num_profiles = partition_config_dict['num_profiles'] + num_resource_profiles = partition_config_dict['num_resource_profiles'] - tabular_output_dict = {"gpu_id": gpu_id, + resource_index = 0 + prev_accelerator_type = "N/A" + for p in range(0, num_profiles): + accelerator_type = partition_config_dict['profiles'][p]['profile_type'] + profile_index = partition_config_dict['profiles'][p]['profile_index'] + num_partitions = partition_config_dict['profiles'][p]['num_partitions'] + mem_caps_str = str(partition_config_dict['profiles'][p]['memory_caps']).replace("]", "").replace("[", "").replace("\'", "").replace(" ", "") + # 2 modifications 
based on the current accelerator type: + # 1) display a * for the current accelerator type, otherwise display as normal + # 2) display partition id only for the current accelerator profile (the *'d one) + if current_accelerator_type == accelerator_type: + accelerator_type = accelerator_type + "*" + partition_id = current_partition_id + else: + partition_id = "N/A" + # only display the first instance of the gpu_id, rest are empty strings + if prev_gpu_id != gpu_id: + tabular_gpu_id = gpu_id + prev_gpu_id = gpu_id + else: + tabular_gpu_id = "" + logging.debug("amdsmi_commands.py | tabular_gpu_id: " + str(tabular_gpu_id)) + + if num_resource_profiles == 0: + if prev_accelerator_type != accelerator_type: # only print the first instance of the resources + tabular_output_dict = {"gpu_id": tabular_gpu_id, "profile_index": profile_index, "memory_partition_caps": mem_caps_str, - "accelerator_type": profile_type, - "partition_id": parition_id, - "num_partitions": 0, - "num_resources": num_resources, - "resource_index": resources, - "resource_type": resources, - "resource_instances": resources, - "resources_shared": resources} - tabular_output.append(tabular_output_dict) + "accelerator_type": accelerator_type, + "partition_id": partition_id, + "num_partitions": num_partitions, + "num_resources": num_resource_profiles, + "resource_index": "N/A", + "resource_type": "N/A", + "resource_instances": "N/A", + "resources_shared": "N/A"} + prev_accelerator_type = accelerator_type + tabular_output.append(tabular_output_dict) + continue + + for r in range(0, num_resource_profiles): + logging.debug("amdsmi_commands.py | p: " + str(p) + "; r: " + str(r) + + "; accelerator_type: " + str(accelerator_type)) + resource_type = partition_config_dict['profiles'][p]['resources'][r]['resource_type'] + resource_instances = partition_config_dict['profiles'][p]['resources'][r]['partition_resource'] + resources_shared = 
partition_config_dict['profiles'][p]['resources'][r]['num_partitions_share_resource'] + if prev_accelerator_type != accelerator_type: # only print the first instance of the resources + tabular_output_dict = {"gpu_id": tabular_gpu_id, + "profile_index": profile_index, + "memory_partition_caps": mem_caps_str, + "accelerator_type": accelerator_type, + "partition_id": partition_id, + "num_partitions": num_partitions, + "num_resources": num_resource_profiles, + "resource_index": resource_index, + "resource_type": resource_type, + "resource_instances": resource_instances, + "resources_shared": resources_shared} + prev_accelerator_type = accelerator_type + else: + tabular_output_dict = {"gpu_id": "", + "profile_index": "", + "memory_partition_caps": "", + "accelerator_type": "", + "partition_id": "", + "num_partitions": "", + "num_resources": "", + "resource_index": resource_index, + "resource_type": resource_type, + "resource_instances": resource_instances, + "resources_shared": resources_shared} + resource_index += 1 + tabular_output.append(tabular_output_dict) + except amdsmi_exception.AmdSmiLibraryException as e: + tabular_output.append(tabular_output_dict) self.logger.multiple_device_output = tabular_output - self.logger.table_title = "ACCELERATOR_PARTITION_PROFILES" - self.logger.print_output(multiple_device_enabled=True, tabular=True) + self.logger.table_title = "\nACCELERATOR_PARTITION_PROFILES" + self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True) self.logger.clear_multiple_devices_ouput() + ######################################### + # print accelerator partition resources # + ######################################### + self.logger.table_header = ''.rjust(7) + current_header = "RESOURCE_INDEX".ljust(16) + \ + "RESOURCE_TYPE".ljust(15) + \ + "RESOURCE_INSTANCES".ljust(20) + \ + "RESOURCES_SHARED".ljust(18) + self.logger.table_header = current_header + self.logger.table_header.strip() + + tabular_output = [] + for gpu in args.gpu: + 
gpu_id = self.helpers.get_gpu_id_from_device_handle(gpu) + tabular_output_dict = {"resource_index": "N/A", + "resource_type": "N/A", + "resource_instances": "N/A", + "resources_shared": "N/A"} + try: + partition_config_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile_config(gpu) + logging.debug("amdsmi_commands.py | partition_config_dict: " + str(json.dumps(partition_config_dict, indent=4))) + num_profiles = partition_config_dict['num_profiles'] + num_resource_profiles = partition_config_dict['num_resource_profiles'] + + if num_resource_profiles == 0: + tabular_output.append(tabular_output_dict) + continue + + resource_index = 0 + for p in range(0, num_profiles): + for r in range(0, num_resource_profiles): + resource_type = partition_config_dict['profiles'][p]['resources'][r]['resource_type'] + resource_instances = partition_config_dict['profiles'][p]['resources'][r]['partition_resource'] + resources_shared = partition_config_dict['profiles'][p]['resources'][r]['num_partitions_share_resource'] + tabular_output_dict = { + "resource_index": resource_index, + "resource_type": resource_type, + "resource_instances": resource_instances, + "resources_shared": resources_shared} + resource_index += 1 + tabular_output.append(tabular_output_dict) + except amdsmi_exception.AmdSmiLibraryException as e: + tabular_output.append(tabular_output_dict) + + self.logger.multiple_device_output = tabular_output + self.logger.table_title = "\nACCELERATOR_PARTITION_RESOURCES" + self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True) + self.logger.clear_multiple_devices_ouput() + + # print legend + legend_parts = [ + "\n\nLegend:", + " * = Current mode"] + legend_output = "\n".join(legend_parts) + if self.logger.destination == 'stdout': + print(legend_output) + else: + with self.logger.destination.open('a', encoding="utf-8") as output_file: + output_file.write(legend_output + '\n') def _event_thread(self, commands, i): devices = 
commands.device_handles diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 8f4801a46d..9c43c4f024 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -27,6 +27,7 @@ import sys import time import re import multiprocessing +import json from typing import List, Union from enum import Enum @@ -681,12 +682,30 @@ class AMDSMIHelpers(): perf_levels_int = list(set(clock.value for clock in amdsmi_interface.AmdSmiDevPerfLevel)) return perf_levels_str, perf_levels_int + def get_accelerator_partition_profile_config(self): + device_handles = amdsmi_interface.amdsmi_get_processor_handles() + accelerator_partition_profiles = {'profile_indices':[], 'profile_types':[], 'memory_caps': []} + for dev in device_handles: + try: + profile = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile_config(dev) + num_profiles = profile['num_profiles'] + for p in range(num_profiles): + accelerator_partition_profiles['profile_indices'].append(str(profile['profiles'][p]['profile_index'])) + accelerator_partition_profiles['profile_types'].append(profile['profiles'][p]['profile_type']) + accelerator_partition_profiles['memory_caps'].append(profile['profiles'][p]['memory_caps']) + break # Only need to get the profiles for one device + except amdsmi_interface.AmdSmiLibraryException as e: + break + return accelerator_partition_profiles - def get_compute_partition_types(self): - compute_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiComputePartitionType] - if 'INVALID' in compute_partitions_str: - compute_partitions_str.remove('INVALID') - return compute_partitions_str + def get_accelerator_choices_types_indices(self): + return_val = ("N/A", {'profile_indices':[], 'profile_types':[]}) + accelerator_partition_profiles = self.get_accelerator_partition_profile_config() + if len(accelerator_partition_profiles['profile_types']) != 0: + 
compute_partitions_str = accelerator_partition_profiles['profile_types'] + accelerator_partition_profiles['profile_indices'] + accelerator_choices = ", ".join(compute_partitions_str) + return_val = (accelerator_choices, accelerator_partition_profiles) + return return_val def get_memory_partition_types(self): memory_partitions_str = [partition.name for partition in amdsmi_interface.AmdSmiMemoryPartitionType] diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 2a6f0c7b93..0aee32be36 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -102,14 +102,24 @@ class AMDSMILogger(): return output_dict - def _convert_json_to_tabular(self, json_object: Dict[str, any]): - # TODO make dynamic + def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False): + # TODO make dynamic - convert other python CLI outputs to use (as needed) + # Update: using dynamic=true provides dynamic re-sizing based on key name length + table_values = '' stored_gpu = '' stored_timestamp = '' for key, value in json_object.items(): string_value = str(value) - if key == 'gpu': + if key == 'partition_id': + # Special case for partition_id: 8 partitions + 7 comma + 2 spaces = 17 + table_values += string_value.ljust(17) + continue + key_length = len(key) + 2 + if dynamic and len(key) > 0: + stored_gpu = string_value + table_values += string_value.ljust(key_length) + elif key == 'gpu': stored_gpu = string_value table_values += string_value.rjust(3) elif key == 'timestamp': @@ -144,30 +154,6 @@ class AMDSMILogger(): elif key == "link_status": for i in value: table_values += str(i).ljust(3) - elif key == "memory": - table_values += string_value.ljust(8) - elif key == "accelerator_type": - table_values += string_value.ljust(18) - elif key == "partition_id": - table_values += string_value.ljust(14) - elif key == "accelerator_profile_index": - table_values += string_value.ljust(27) - 
elif key == "profile_index": - table_values += string_value.ljust(15) - elif key == "memory_partition_caps": - table_values += string_value.ljust(23) - elif key == "num_partitions": - table_values += string_value.ljust(16) - elif key == "num_resources": - table_values += string_value.ljust(15) - elif key == "resource_index": - table_values += string_value.ljust(16) - elif key == "resource_type": - table_values += string_value.ljust(15) - elif key == "resource_instances": - table_values += string_value.ljust(20) - elif key == "resources_shared": - table_values += string_value.ljust(18) elif key == "RW": table_values += string_value.ljust(57) elif key in ('pviol', 'tviol'): @@ -494,12 +480,14 @@ class AMDSMILogger(): self.output = {} - def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False, dual_csv_output=False): + def print_output(self, multiple_device_enabled=False, watching_output=False, tabular=False, dual_csv_output=False, dynamic=False): """ Print current output acording to format and then destination params: multiple_device_enabled (bool) - True if printing output from multiple devices watching_output (bool) - True if printing watch output + dynamic (bool) - Defaults to False. 
True turns on dynamic resizing for + left justified table output return: Nothing """ @@ -516,7 +504,7 @@ class AMDSMILogger(): elif self.is_human_readable_format(): # If tabular output is enabled, redirect to _print_tabular_output if tabular: - self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output) + self._print_tabular_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output, dynamic=dynamic) else: self._print_human_readable_output(multiple_device_enabled=multiple_device_enabled, watching_output=watching_output) @@ -788,7 +776,7 @@ class AMDSMILogger(): output_file.write(human_readable_output + '\n') - def _print_tabular_output(self, multiple_device_enabled=False, watching_output=False): + def _print_tabular_output(self, multiple_device_enabled=False, watching_output=False, dynamic=False): primary_table = '' secondary_table = '' @@ -808,7 +796,7 @@ class AMDSMILogger(): for key, value in device_output.items(): if key != 'process_list': primary_table_output[key] = value - primary_table += self._convert_json_to_tabular(primary_table_output) + '\n' + primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n' else: # Single device output if 'process_list' in self.output: process_table_dict = {} @@ -822,7 +810,7 @@ class AMDSMILogger(): for key, value in self.output.items(): if key != 'process_list': primary_table_output[key] = value - primary_table += self._convert_json_to_tabular(primary_table_output) + '\n' + primary_table += self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n' primary_table = primary_table.rstrip() secondary_table = secondary_table.rstrip() @@ -879,7 +867,7 @@ class AMDSMILogger(): for key, value in device_output.items(): if key != 'process_list': primary_table_output[key] = value - primary_table += self._convert_json_to_tabular(primary_table_output) + '\n' + primary_table += 
self._convert_json_to_tabular(primary_table_output, dynamic=dynamic) + '\n' primary_table = primary_table.rstrip() # Remove trailing new line secondary_table = secondary_table.rstrip() diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index b96e8cbf25..62e3fc8ed9 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -173,6 +173,14 @@ class AMDSMIParser(argparse.ArgumentParser): else: raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(string_value, outputformat) + def _is_command_supported(self, user_input, acceptable_values, command_name): + if acceptable_values == "N/A": + raise amdsmi_cli_exceptions.AmdSmiCommandNotSupportedException(command_name, self.helpers.get_output_format()) + elif str(user_input).upper() not in acceptable_values: + print(f"Valid inputs are {acceptable_values}") + raise amdsmi_cli_exceptions.AmdSmiInvalidParameterValueException(str(user_input).upper(), self.helpers.get_output_format()) + else: + return str(user_input).upper() def _limit_select(self): """Custom action for setting clock limits""" @@ -401,7 +409,7 @@ class AMDSMIParser(argparse.ArgumentParser): return _CoreSelectAction - def _add_command_modifiers(self, subcommand_parser): + def _add_command_modifiers(self, subcommand_parser: argparse.ArgumentParser): json_help = "Displays output in JSON format (human readable by default)." csv_help = "Displays output in CSV format (human readable by default)." file_help = "Saves output into a file on the provided path (stdout by default)." 
@@ -460,7 +468,7 @@ class AMDSMIParser(argparse.ArgumentParser): return value - def _add_device_arguments(self, subcommand_parser, required=False): + def _add_device_arguments(self, subcommand_parser: argparse.ArgumentParser, required=False): # Device arguments help text gpu_help = f"Select a GPU ID, BDF, or UUID from the possible choices:\n{self.gpu_choices_str}" vf_help = "Gets general information about the specified VF (timeslice, fb info, …).\ @@ -583,7 +591,7 @@ class AMDSMIParser(argparse.ArgumentParser): return _ValidateOverdrivePercent - def _add_version_parser(self, subparsers, func): + def _add_version_parser(self, subparsers: argparse._SubParsersAction, func): # Subparser help text version_help = "Display version information" @@ -597,7 +605,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_command_modifiers(version_parser) - def _add_list_parser(self, subparsers, func): + def _add_list_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The list subcommand is only applicable to systems with amdgpu initialized return @@ -619,7 +627,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_device_arguments(list_parser, required=False) - def _add_static_parser(self, subparsers, func): + def _add_static_parser(self, subparsers: argparse._SubParsersAction, func): # Subparser help text static_help = "Gets static information about the specified GPU" static_subcommand_help = "If no GPU is specified, returns static information for all GPUs on the system.\ @@ -925,7 +933,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_command_modifiers(metric_parser) - def _add_process_parser(self, subparsers, func): + def _add_process_parser(self, subparsers: argparse._SubParsersAction, func): if self.helpers.is_hypervisor(): # Don't add this subparser on Hypervisors # This subparser is only available to Guest and Baremetal systems @@ -969,7 +977,7 @@ class AMDSMIParser(argparse.ArgumentParser): 
process_parser.add_argument('-n', '--name', action='store', type=lambda value: self._is_valid_string(value, '--name'), required=False, help=name_help) - def _add_profile_parser(self, subparsers, func): + def _add_profile_parser(self, subparsers: argparse._SubParsersAction, func): if not (self.helpers.is_windows() and self.helpers.is_hypervisor()): # This subparser only applies to Hypervisors return @@ -990,7 +998,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_device_arguments(profile_parser, required=False) - def _add_event_parser(self, subparsers, func): + def _add_event_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The event subcommand is only applicable to systems with amdgpu initialized return @@ -1011,7 +1019,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_device_arguments(event_parser, required=False) - def _add_topology_parser(self, subparsers, func): + def _add_topology_parser(self, subparsers: argparse._SubParsersAction, func): if not(self.helpers.is_baremetal() and self.helpers.is_linux()): # This subparser is only applicable to Baremetal Linux return @@ -1059,7 +1067,7 @@ class AMDSMIParser(argparse.ArgumentParser): topology_parser.add_argument('-z', '--bi-dir', action='store_true', required=False, help=bi_dir_help) - def _add_set_value_parser(self, subparsers, func): + def _add_set_value_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_linux(): # This subparser is only applicable to Linux return @@ -1078,9 +1086,9 @@ class AMDSMIParser(argparse.ArgumentParser): set_profile_help = f"Set power profile level (#) or choose one of available profiles:\n\t{power_profile_choices_str}" perf_det_choices_str = ", ".join(self.helpers.get_perf_det_levels()) set_perf_det_help = f"Set performance determinism and select one of the corresponding performance levels:\n\t{perf_det_choices_str}" - compute_partition_choices_str = ", 
".join(self.helpers.get_compute_partition_types()) + (accelerator_set_choices, _) = self.helpers.get_accelerator_choices_types_indices() memory_partition_choices_str = ", ".join(self.helpers.get_memory_partition_types()) - set_compute_partition_help = f"Set one of the following the compute partition modes:\n\t{compute_partition_choices_str}" + set_compute_partition_help = f"Set one of the following the accelerator type or profile index:\n\t{accelerator_set_choices}.\n\tUse `sudo amd-smi partition --accelerator` to find acceptable values." set_memory_partition_help = f"Set one of the following the memory partition modes:\n\t{memory_partition_choices_str}" power_cap_min, power_cap_max = self.helpers.get_power_caps() power_cap_max = self.helpers.convert_SI_unit(power_cap_max, AMDSMIHelpers.SI_Unit.MICRO) @@ -1128,7 +1136,7 @@ class AMDSMIParser(argparse.ArgumentParser): set_value_exclusive_group.add_argument('-l', '--perf-level', action='store', choices=self.helpers.get_perf_levels()[0], type=str.upper, required=False, help=set_perf_level_help, metavar='LEVEL') set_value_exclusive_group.add_argument('-P', '--profile', action='store', required=False, help=set_profile_help, metavar='SETPROFILE') set_value_exclusive_group.add_argument('-d', '--perf-determinism', action='store', type=lambda value: self._not_negative_int(value, '--perf-determinism'), required=False, help=set_perf_det_help, metavar='SCLKMAX') - set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=self.helpers.get_compute_partition_types(), type=str.upper, required=False, help=set_compute_partition_help, metavar='PARTITION') + set_value_exclusive_group.add_argument('-C', '--compute-partition', action='store', choices=accelerator_set_choices, type=lambda value: self._is_command_supported(value, accelerator_set_choices, '--compute-partition'), required=False, help=set_compute_partition_help, metavar=' or ') set_value_exclusive_group.add_argument('-M', 
'--memory-partition', action='store', choices=self.helpers.get_memory_partition_types(), type=str.upper, required=False, help=set_memory_partition_help, metavar='PARTITION') set_value_exclusive_group.add_argument('-o', '--power-cap', action='store', type=lambda value: self._positive_int(value, '--power-cap'), required=False, help=set_power_cap_help, metavar='WATTS') set_value_exclusive_group.add_argument('-p', '--soc-pstate', action='store', required=False, type=lambda value: self._not_negative_int(value, '--soc-pstate'), help=set_soc_pstate_help, metavar='POLICY_ID') @@ -1162,7 +1170,7 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_command_modifiers(set_value_parser) - def _add_reset_parser(self, subparsers, func): + def _add_reset_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_linux(): # This subparser is only applicable to Linux return @@ -1215,7 +1223,7 @@ class AMDSMIParser(argparse.ArgumentParser): reset_exclusive_group.add_argument('-l', '--clean-local-data', action='store_true', required=False, help=reset_gpu_clean_local_data_help) - def _add_monitor_parser(self, subparsers, func): + def _add_monitor_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_linux(): # This subparser is only applicable to Linux return @@ -1314,7 +1322,7 @@ class AMDSMIParser(argparse.ArgumentParser): rocm_smi_parser.add_argument('-f', '--showclkfrq', action='store_true', required=False, help=showclkfrq_help) - def _add_xgmi_parser(self, subparsers, func): + def _add_xgmi_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The xgmi subcommand is only applicable to systems with amdgpu initialized return @@ -1344,7 +1352,7 @@ class AMDSMIParser(argparse.ArgumentParser): xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help) - def _add_partition_parser(self, subparsers, func): + def 
_add_partition_parser(self, subparsers: argparse._SubParsersAction, func): if not self.helpers.is_amdgpu_initialized(): # The partition subcommand is only applicable to systems with amdgpu initialized return diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index addf071cf4..24ecb46299 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -70,6 +70,7 @@ typedef enum { #define AMDSMI_MAX_ACCELERATOR_PROFILE 32 #define AMDSMI_MAX_CP_PROFILE_RESOURCES 32 #define AMDSMI_MAX_ACCELERATOR_PARTITIONS 8 +#define AMDSMI_MAX_NUM_NUMA_NODES 32 #define AMDSMI_GPU_UUID_SIZE 38 @@ -259,8 +260,8 @@ typedef enum { AMDSMI_STATUS_FILE_NOT_FOUND = 52, //!< file or directory not found AMDSMI_STATUS_ARG_PTR_NULL = 53, //!< Parsed argument is invalid AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54, //!< AMDGPU restart failed - AMDSMI_STATUS_SETTING_UNAVAILABLE = 55, //!< Setting is not available - AMDSMI_STATUS_CORRUPTED_EEPROM = 56, //!< EEPROM is corrupted + AMDSMI_STATUS_SETTING_UNAVAILABLE = 55, //!< Setting is not available + AMDSMI_STATUS_CORRUPTED_EEPROM = 56, //!< EEPROM is corrupted // General errors AMDSMI_STATUS_MAP_ERROR = 0xFFFFFFFE, //!< The internal library error did not map to a status code @@ -292,19 +293,35 @@ typedef enum { * various accelerator partitioning settings. 
*/ typedef enum { - AMDSMI_ACCELERATOR_PARTITION_INVALID = 0, - AMDSMI_ACCELERATOR_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - AMDSMI_ACCELERATOR_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - AMDSMI_ACCELERATOR_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - AMDSMI_ACCELERATOR_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory - AMDSMI_ACCELERATOR_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory + AMDSMI_ACCELERATOR_PARTITION_INVALID = 0, + AMDSMI_ACCELERATOR_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + AMDSMI_ACCELERATOR_PARTITION_MAX } amdsmi_accelerator_partition_type_t; +/** + * @brief Accelerator Partition Resource Type. + * This enum is used to identify + * various accelerator resource types. + */ +typedef enum { + AMDSMI_ACCELERATOR_XCC, + AMDSMI_ACCELERATOR_ENCODER, + AMDSMI_ACCELERATOR_DECODER, + AMDSMI_ACCELERATOR_DMA, + AMDSMI_ACCELERATOR_JPEG, + AMDSMI_ACCELERATOR_MAX +} amdsmi_accelerator_partition_resource_type_t; + + /** * @brief Compute Partition. This enum is used to identify * various compute partitioning settings. @@ -329,19 +346,19 @@ typedef enum { */ typedef enum { AMDSMI_MEMORY_PARTITION_UNKNOWN = 0, - AMDSMI_MEMORY_PARTITION_NPS1, //!< NPS1 - All CCD & XCD data is interleaved - //!< accross all 8 HBM stacks (all stacks/1). 
- AMDSMI_MEMORY_PARTITION_NPS2, //!< NPS2 - 2 sets of CCDs or 4 XCD interleaved - //!< accross the 4 HBM stacks per AID pair - //!< (8 stacks/2). - AMDSMI_MEMORY_PARTITION_NPS4, //!< NPS4 - Each XCD data is interleaved accross - //!< accross 2 (or single) HBM stacks - //!< (8 stacks/8 or 8 stacks/4). - AMDSMI_MEMORY_PARTITION_NPS8, //!< NPS8 - Each XCD uses a single HBM stack - //!< (8 stacks/8). Or each XCD uses a single - //!< HBM stack & CCDs share 2 non-interleaved - //!< HBM stacks on its AID - //!< (AID[1,2,3] = 6 stacks/6). + AMDSMI_MEMORY_PARTITION_NPS1 = 1, //!< NPS1 - All CCD & XCD data is interleaved + //!< accross all 8 HBM stacks (all stacks/1). + AMDSMI_MEMORY_PARTITION_NPS2 = 2, //!< NPS2 - 2 sets of CCDs or 4 XCD interleaved + //!< accross the 4 HBM stacks per AID pair + //!< (8 stacks/2). + AMDSMI_MEMORY_PARTITION_NPS4 = 4, //!< NPS4 - Each XCD data is interleaved + //!< accross 2 (or single) HBM stacks + //!< (8 stacks/8 or 8 stacks/4). + AMDSMI_MEMORY_PARTITION_NPS8 = 8, //!< NPS8 - Each XCD uses a single HBM stack + //!< (8 stacks/8). Or each XCD uses a single + //!< HBM stack & CCDs share 2 non-interleaved + //!< HBM stacks on its AID + //!< (AID[1,2,3] = 6 stacks/6). } amdsmi_memory_partition_type_t; /** @@ -661,34 +678,77 @@ typedef struct { } amdsmi_kfd_info_t; /** - * @brief Possible Memory Partition Modes. - * This union is used to identify various memory partitioning settings. + * @brief Possible Memory Partition Capabilities. + * This union is used to identify various memory partition capabilities. 
*/ typedef union { - struct nps_flags_ { - uint32_t nps1_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported - uint32_t nps2_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported - uint32_t nps4_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported - uint32_t nps8_cap :1; // bool 1 = true; 0 = false; Max uint32 means unsupported - uint32_t reserved :28; - } amdsmi_nps_flags_t; - uint32_t nps_cap_mask; + struct nps_flags_ { + uint32_t nps1_cap :1; //!< bool 1 = true; 0 = false + uint32_t nps2_cap :1; //!< bool 1 = true; 0 = false + uint32_t nps4_cap :1; //!< bool 1 = true; 0 = false + uint32_t nps8_cap :1; //!< bool 1 = true; 0 = false + uint32_t reserved :28; + } amdsmi_nps_flags_t; + + uint32_t nps_cap_mask; } amdsmi_nps_caps_t; /** - * @brief Possible Memory Partition Modes. - * This union is used to identify various memory partitioning settings. + * @brief Memory Partition Configuration. + * This structure is used to identify various memory partition configurations. */ typedef struct { - amdsmi_accelerator_partition_type_t profile_type; // SPX, DPX, QPX, CPX and so on - uint32_t num_partitions; // On MI300X, SPX: 1, DPX: 2, QPX: 4, CPX: 8, length of resources array - amdsmi_nps_caps_t memory_caps; // Possible memory partition capabilities - uint32_t profile_index; - uint32_t num_resources; // length of index_of_resources_profile - uint32_t resources[AMDSMI_MAX_ACCELERATOR_PARTITIONS][AMDSMI_MAX_CP_PROFILE_RESOURCES]; - uint64_t reserved[13]; + amdsmi_nps_caps_t partition_caps; + amdsmi_memory_partition_type_t mp_mode; + uint32_t num_numa_ranges; + struct numa_range_ { + amdsmi_vram_type_t memory_type; + uint64_t start; + uint64_t end; + } numa_range[AMDSMI_MAX_NUM_NUMA_NODES]; + + uint64_t reserved[11]; +} amdsmi_memory_partition_config_t; + +/** + * @brief Accelerator Partition Profile. + * This structure is used to identify the current accelerator partition profile. 
+ */ +typedef struct { + amdsmi_accelerator_partition_type_t profile_type; //!< SPX, DPX, QPX, CPX and so on + uint32_t num_partitions; //!< On MI300X: SPX=>1, DPX=>2, QPX=>4, CPX=>8; length of resources + amdsmi_nps_caps_t memory_caps; //!< Possible memory partition capabilities + uint32_t profile_index; //!< Index in the profiles array in amdsmi_accelerator_partition_profile_t + uint32_t num_resources; //!< length of index_of_resources_profile + uint32_t resources[AMDSMI_MAX_ACCELERATOR_PARTITIONS][AMDSMI_MAX_CP_PROFILE_RESOURCES]; + uint64_t reserved[13]; } amdsmi_accelerator_partition_profile_t; +/** + * @brief Accelerator Partition Resources. + * This struct is used to identify various partition resource profiles. + */ +typedef struct { + uint32_t profile_index; + amdsmi_accelerator_partition_resource_type_t resource_type; + uint32_t partition_resource; //!< Resources a partition can use, which may be shared + uint32_t num_partitions_share_resource; //!< If it is greater than 1, then resource is shared. + uint64_t reserved[6]; +} amdsmi_accelerator_partition_resource_profile_t; + +/** + * @brief Accelerator Partition Profile Configurations. + * This struct is used to identify various partition profiles. 
+ */ +typedef struct { + uint32_t num_profiles; //!< The length of profiles array + uint32_t num_resource_profiles; + amdsmi_accelerator_partition_resource_profile_t resource_profiles[AMDSMI_MAX_CP_PROFILE_RESOURCES]; + uint32_t default_profile_index; //!< The index of the default profile in the profiles array + amdsmi_accelerator_partition_profile_t profiles[AMDSMI_MAX_ACCELERATOR_PROFILE]; + uint64_t reserved[30]; +} amdsmi_accelerator_partition_profile_config_t; + typedef enum { AMDSMI_LINK_TYPE_INTERNAL, AMDSMI_LINK_TYPE_XGMI, @@ -4583,26 +4643,103 @@ amdsmi_get_gpu_memory_partition(amdsmi_processor_handle processor_handle, char * * */ amdsmi_status_t -amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, amdsmi_memory_partition_type_t memory_partition); +amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, + amdsmi_memory_partition_type_t memory_partition); +/** + * @brief Version 2.0: Returns current gpu memory partition capabilities + * + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} + * + * @param[in] processor_handle a processor handle + * + * @param[out] config reference to the accelerator partition profile. + * Must be allocated by user. 
+ * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle, + amdsmi_memory_partition_config_t *config); + +/** + * @brief Version 2.0: Set accelerator partition setting based on profile_index from amdsmi_get_gpu_accelerator_partition_profile_config + * + * @platform{gpu_bm_linux} @platform{host} + * + * @param[in] processor_handle a processor handle + * + * @param[in] mode Enum representing memory partition to set to + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_set_gpu_memory_partition_mode(amdsmi_processor_handle processor_handle, + amdsmi_memory_partition_type_t mode); /** @} */ // end of memory_partition /*****************************************************************************/ -/** @defgroup accelerator_partition_profile Accelerator Partition Profile Functions +/** @defgroup accelerator_partition Accelerator Partition Profile Functions * These functions are used to configure and query the device's * accelerator parition profile setting. * @{ */ -// TODO: declare rest of partition profile functions and complete doc commentary. -/* - Get the current accelerator partition profile. The function will return current profile. -*/ +/** + * @brief Version 2.0: Returns gpu accelerator partition caps as currently configured in the system + * User must use admin/sudo privledges to run this API, or API will not be able to + * read resources. Otherwise, API will fill in the structure with as much information as + * it can. + * + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} + * + * @param[in] processor_handle Device which to query + * + * @param[out] profile_config reference to the accelerator partition config. + * Must be allocated by user. 
+ * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle processor_handle, + amdsmi_accelerator_partition_profile_config_t *profile_config); + +/** + * @brief Version 2.0: Returns current gpu accelerator partition capabilities + * + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} + * + * @param[in] processor_handle Device which to query + * + * @param[out] profile reference to the accelerator partition profile. + * Must be allocated by user. + * + * @param[inout] partition_id array of ids for current accelerator profile. + * Must be allocated by user. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ amdsmi_status_t amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, amdsmi_accelerator_partition_profile_t *profile, uint32_t *partition_id); -/** @} */ // end of accelerator_partition_profile +/** + * @brief Version 2.0: Set accelerator partition setting based on profile_index + * from amdsmi_get_gpu_accelerator_partition_profile_config + * + * @platform{gpu_bm_linux} @platform{host} + * + * @param[in] processor_handle Device which to query + * + * @param[in] profile_index Represents index of a partition user wants to set + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_set_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, + uint32_t profile_index); + +/** @} End accelerator_partition */ /*****************************************************************************/ /** @defgroup EvntNotif Event Notification Functions diff --git a/projects/amdsmi/py-interface/__init__.py b/projects/amdsmi/py-interface/__init__.py index b52765571e..db8df1eb87 100644 --- a/projects/amdsmi/py-interface/__init__.py +++ 
b/projects/amdsmi/py-interface/__init__.py @@ -224,6 +224,10 @@ from .amdsmi_interface import amdsmi_set_gpu_compute_partition from .amdsmi_interface import amdsmi_get_gpu_memory_partition from .amdsmi_interface import amdsmi_set_gpu_memory_partition from .amdsmi_interface import amdsmi_get_gpu_accelerator_partition_profile +from .amdsmi_interface import amdsmi_get_gpu_accelerator_partition_profile_config +from .amdsmi_interface import amdsmi_get_gpu_memory_partition_config +from .amdsmi_interface import amdsmi_set_gpu_accelerator_partition_profile +from .amdsmi_interface import amdsmi_set_gpu_memory_partition_mode # # Individual GPU Metrics Functions from .amdsmi_interface import amdsmi_get_gpu_metrics_header_info diff --git a/projects/amdsmi/py-interface/amdsmi_exception.py b/projects/amdsmi/py-interface/amdsmi_exception.py index 7cd94e75ad..958786a603 100644 --- a/projects/amdsmi/py-interface/amdsmi_exception.py +++ b/projects/amdsmi/py-interface/amdsmi_exception.py @@ -87,6 +87,8 @@ class AmdSmiLibraryException(AmdSmiException): amdsmi_wrapper.AMDSMI_STATUS_ARG_PTR_NULL : "AMDSMI_STATUS_ARG_PTR_NULL - Parsed argument is invalid", amdsmi_wrapper.AMDSMI_STATUS_MAP_ERROR : "AMDSMI_STATUS_MAP_ERROR - The internal library error did not map to a status code", amdsmi_wrapper.AMDSMI_STATUS_AMDGPU_RESTART_ERR: "AMDSMI_STATUS_AMDGPU_RESTART_ERR - AMDGPU restart failed, please check dmsg for errors", + amdsmi_wrapper.AMDSMI_STATUS_SETTING_UNAVAILABLE: "AMDSMI_STATUS_SETTING_UNAVAILABLE - Setting is not available", + amdsmi_wrapper.AMDSMI_STATUS_CORRUPTED_EEPROM: "AMDSMI_STATUS_CORRUPTED_EEPROM - Setting is not available", amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR : "AMDSMI_STATUS_UNKNOWN_ERROR - An unknown error occurred" } diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index a6e74f890d..10c5d0d81e 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ 
b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -19,6 +19,8 @@ import ctypes import re +import json +import logging from typing import Union, Any, Dict, List from enum import IntEnum from collections.abc import Iterable @@ -288,13 +290,30 @@ class AmdSmiVoltageType(IntEnum): VDDGFX = amdsmi_wrapper.AMDSMI_VOLT_TYPE_VDDGFX INVALID = amdsmi_wrapper.AMDSMI_VOLT_TYPE_INVALID +class AmdSmiAcceleratorPartitionResourceType(IntEnum): + XCC = amdsmi_wrapper.AMDSMI_ACCELERATOR_XCC + ENCODER = amdsmi_wrapper.AMDSMI_ACCELERATOR_ENCODER + DECODER = amdsmi_wrapper.AMDSMI_ACCELERATOR_DECODER + DMA = amdsmi_wrapper.AMDSMI_ACCELERATOR_DMA + JPEG = amdsmi_wrapper.AMDSMI_ACCELERATOR_JPEG + MAX = amdsmi_wrapper.AMDSMI_ACCELERATOR_MAX + + +class AmdSmiAcceleratorPartitionType(IntEnum): + SPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_SPX + DPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_DPX + TPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_TPX + QPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_QPX + CPX = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_CPX + INVALID = amdsmi_wrapper.AMDSMI_ACCELERATOR_PARTITION_INVALID + class AmdSmiComputePartitionType(IntEnum): - CPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_CPX SPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_SPX DPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_DPX TPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_TPX QPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_QPX + CPX = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_CPX INVALID = amdsmi_wrapper.AMDSMI_COMPUTE_PARTITION_INVALID @@ -2729,6 +2748,7 @@ def amdsmi_get_gpu_compute_partition(processor_handle: amdsmi_wrapper.amdsmi_pro def amdsmi_set_gpu_compute_partition(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, compute_partition: AmdSmiComputePartitionType): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle @@ -2743,6 +2763,21 @@ def 
amdsmi_set_gpu_compute_partition(processor_handle: amdsmi_wrapper.amdsmi_pro ) ) +def amdsmi_set_gpu_accelerator_partition_profile(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + profile_index: int): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + if not isinstance(profile_index, int): + raise AmdSmiParameterException(profile_index, int) + + _check_res( + amdsmi_wrapper.amdsmi_set_gpu_accelerator_partition_profile( + processor_handle, profile_index + ) + ) def amdsmi_get_gpu_memory_partition(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): @@ -2763,6 +2798,39 @@ def amdsmi_get_gpu_memory_partition(processor_handle: amdsmi_wrapper.amdsmi_proc return memory_partition.value.decode("utf-8") +def amdsmi_get_gpu_memory_partition_config(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + config = amdsmi_wrapper.amdsmi_memory_partition_config_t() + + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_memory_partition_config( + processor_handle, config + ) + ) + mem_caps_list = [] + if config.partition_caps.amdsmi_nps_flags_t.nps1_cap == 1: + mem_caps_list.append("NPS1") + if config.partition_caps.amdsmi_nps_flags_t.nps2_cap == 1: + mem_caps_list.append("NPS2") + if config.partition_caps.amdsmi_nps_flags_t.nps4_cap == 1: + mem_caps_list.append("NPS4") + if config.partition_caps.amdsmi_nps_flags_t.nps8_cap == 1: + mem_caps_list.append("NPS8") + + return_dict = { + "partition_caps": mem_caps_list, + "mp_mode": amdsmi_wrapper.amdsmi_memory_partition_type_t__enumvalues[ + config.mp_mode].replace("AMDSMI_MEMORY_PARTITION_", "").replace("UNKNOWN", "N/A"), + "num_numa_ranges": 
"N/A", + "numa_range": "N/A", + } + logging.debug("amdsmi_interface.py | amdsmi_get_gpu_memory_partition_config | return_dictionary = \n" + str(json.dumps(return_dict, indent=4))) + return return_dict + def amdsmi_set_gpu_memory_partition(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, memory_partition: AmdSmiMemoryPartitionType): @@ -2780,6 +2848,21 @@ def amdsmi_set_gpu_memory_partition(processor_handle: amdsmi_wrapper.amdsmi_proc ) ) +def amdsmi_set_gpu_memory_partition_mode(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + memory_partition: AmdSmiMemoryPartitionType): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + if not isinstance(memory_partition, AmdSmiMemoryPartitionType): + raise AmdSmiParameterException(memory_partition, AmdSmiMemoryPartitionType) + + _check_res( + amdsmi_wrapper.amdsmi_set_gpu_memory_partition( + processor_handle, memory_partition + ) + ) def amdsmi_get_gpu_accelerator_partition_profile( processor_handle: amdsmi_wrapper.amdsmi_processor_handle @@ -2788,29 +2871,129 @@ def amdsmi_get_gpu_accelerator_partition_profile( raise AmdSmiParameterException( processor_handle, amdsmi_wrapper.amdsmi_processor_handle ) - partition_id = ctypes.c_uint32() + length = 8 + partition_id = [0, 0, 0, 0, 0, 0, 0, 0] + partition_id_list = (ctypes.c_uint32 * length)(*partition_id) profile = amdsmi_wrapper.amdsmi_accelerator_partition_profile_t() _check_res( amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle, - ctypes.byref(profile), - ctypes.byref(partition_id)) + ctypes.byref(profile), partition_id_list) ) + profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[profile.profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "") + profile_type_ret = profile_type_ret.replace("INVALID", "N/A") + + length = profile.num_partitions + partition_ids = [] + for i in 
range(profile.num_partitions): + partition_ids.append(partition_id_list[i]) + + last_element = 0 + if length > 0: + last_element = length - 1 + if ((partition_ids[last_element] == 0) + and not((profile_type_ret == str("SPX")) or (profile_type_ret == str("N/A")))): + partition_ids = "N/A" + + mem_caps_list = [] + if profile.memory_caps.amdsmi_nps_flags_t.nps1_cap == 1: + mem_caps_list.append("NPS1") + if profile.memory_caps.amdsmi_nps_flags_t.nps2_cap == 1: + mem_caps_list.append("NPS2") + if profile.memory_caps.amdsmi_nps_flags_t.nps4_cap == 1: + mem_caps_list.append("NPS4") + if profile.memory_caps.amdsmi_nps_flags_t.nps8_cap == 1: + mem_caps_list.append("NPS8") partition_profile_dict = { - "profile_type" : profile.profile_type, + "profile_type" : profile_type_ret, "num_partitions" : profile.num_partitions, "profile_index" : profile.profile_index, - "memory_caps" : profile.memory_caps, + "memory_caps": mem_caps_list, "num_resources" : profile.num_resources, "resources" : "N/A" } - - return { - "partition_id" : partition_id.value, + return_dictionary = { + "partition_id" : partition_ids, "partition_profile" : partition_profile_dict } + logging.debug("amdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile | return_dictionary = \n" + str(json.dumps(return_dictionary, indent=4))) + return return_dictionary + +def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: amdsmi_wrapper.amdsmi_processor_handle) -> Dict: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + config = amdsmi_wrapper.amdsmi_accelerator_partition_profile_config_t() + + _check_res(amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle, + ctypes.byref(config))) + logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | START - " + + "config.num_profiles = " + 
str(config.num_profiles) + + "\n; config.num_resource_profiles = " + str(config.num_resource_profiles) + + "\n; config.resource_profiles = " + str(config.resource_profiles) + + "\n; config.default_profile_index = " + str(config.default_profile_index) + + "\n; config.profiles = " + str(config.profiles)) + + profiles = [] + resource_idx = 0 + for i in range(config.num_profiles): + profile = config.profiles[i] + logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | profile = " + str(profile)) + profile_type_ret = amdsmi_wrapper.amdsmi_accelerator_partition_type_t__enumvalues[ + config.profiles[i].profile_type].replace("AMDSMI_ACCELERATOR_PARTITION_", "") + profile_type_ret = profile_type_ret.replace("INVALID", "N/A") + resources = [] + + + mem_caps_list = [] + if profile.memory_caps.amdsmi_nps_flags_t.nps1_cap == 1: + mem_caps_list.append("NPS1") + if profile.memory_caps.amdsmi_nps_flags_t.nps2_cap == 1: + mem_caps_list.append("NPS2") + if profile.memory_caps.amdsmi_nps_flags_t.nps4_cap == 1: + mem_caps_list.append("NPS4") + if profile.memory_caps.amdsmi_nps_flags_t.nps8_cap == 1: + mem_caps_list.append("NPS8") + + for r in range(config.num_resource_profiles): + logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | i = " + str(i) + "; r = " + str(r) + "; resource_idx = " + str(resource_idx)) + res_profile = config.resource_profiles[resource_idx] + resource_profiles_ret = amdsmi_wrapper.amdsmi_accelerator_partition_resource_type_t__enumvalues[ + res_profile.resource_type].replace("AMDSMI_ACCELERATOR_", "") + resource_profile_dict = { + "profile_index": res_profile.profile_index, + "resource_type": resource_profiles_ret, + "partition_resource": res_profile.partition_resource, + "num_partitions_share_resource": res_profile.num_partitions_share_resource, + } + logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | resource_profile_dict = " + 
str(resource_profile_dict)) + resources.append(resource_profile_dict) + resource_idx += 1 + + profile_dict = { + "profile_type": profile_type_ret, + "num_partitions": profile.num_partitions, + "profile_index": profile.profile_index, + "memory_caps": mem_caps_list, + "num_resources": profile.num_resources, + "resources": resources + } + profiles.append(profile_dict) + + config_dict = { + "num_profiles": config.num_profiles, + "num_resource_profiles": config.num_resource_profiles, + "resource_profiles": resources, + "default_profile_index": config.default_profile_index, + "profiles": profiles, + } + logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | END - config_dict = \n" + str(json.dumps(config_dict, indent=4))) + + return config_dict def amdsmi_get_xgmi_info(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 9075461da4..50fb8c752c 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -285,6 +285,7 @@ amdsmi_status_t__enumvalues = { 53: 'AMDSMI_STATUS_ARG_PTR_NULL', 54: 'AMDSMI_STATUS_AMDGPU_RESTART_ERR', 55: 'AMDSMI_STATUS_SETTING_UNAVAILABLE', + 56: 'AMDSMI_STATUS_CORRUPTED_EEPROM', 4294967294: 'AMDSMI_STATUS_MAP_ERROR', 4294967295: 'AMDSMI_STATUS_UNKNOWN_ERROR', } @@ -329,6 +330,7 @@ AMDSMI_STATUS_FILE_NOT_FOUND = 52 AMDSMI_STATUS_ARG_PTR_NULL = 53 AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54 AMDSMI_STATUS_SETTING_UNAVAILABLE = 55 +AMDSMI_STATUS_CORRUPTED_EEPROM = 56 AMDSMI_STATUS_MAP_ERROR = 4294967294 AMDSMI_STATUS_UNKNOWN_ERROR = 4294967295 amdsmi_status_t = ctypes.c_uint32 # enum @@ -372,6 +374,7 @@ amdsmi_accelerator_partition_type_t__enumvalues = { 3: 'AMDSMI_ACCELERATOR_PARTITION_TPX', 4: 'AMDSMI_ACCELERATOR_PARTITION_QPX', 5: 'AMDSMI_ACCELERATOR_PARTITION_CPX', + 
6: 'AMDSMI_ACCELERATOR_PARTITION_MAX', } AMDSMI_ACCELERATOR_PARTITION_INVALID = 0 AMDSMI_ACCELERATOR_PARTITION_SPX = 1 @@ -379,8 +382,26 @@ AMDSMI_ACCELERATOR_PARTITION_DPX = 2 AMDSMI_ACCELERATOR_PARTITION_TPX = 3 AMDSMI_ACCELERATOR_PARTITION_QPX = 4 AMDSMI_ACCELERATOR_PARTITION_CPX = 5 +AMDSMI_ACCELERATOR_PARTITION_MAX = 6 amdsmi_accelerator_partition_type_t = ctypes.c_uint32 # enum +# values for enumeration 'amdsmi_accelerator_partition_resource_type_t' +amdsmi_accelerator_partition_resource_type_t__enumvalues = { + 0: 'AMDSMI_ACCELERATOR_XCC', + 1: 'AMDSMI_ACCELERATOR_ENCODER', + 2: 'AMDSMI_ACCELERATOR_DECODER', + 3: 'AMDSMI_ACCELERATOR_DMA', + 4: 'AMDSMI_ACCELERATOR_JPEG', + 5: 'AMDSMI_ACCELERATOR_MAX', +} +AMDSMI_ACCELERATOR_XCC = 0 +AMDSMI_ACCELERATOR_ENCODER = 1 +AMDSMI_ACCELERATOR_DECODER = 2 +AMDSMI_ACCELERATOR_DMA = 3 +AMDSMI_ACCELERATOR_JPEG = 4 +AMDSMI_ACCELERATOR_MAX = 5 +amdsmi_accelerator_partition_resource_type_t = ctypes.c_uint32 # enum + # values for enumeration 'amdsmi_compute_partition_type_t' amdsmi_compute_partition_type_t__enumvalues = { 0: 'AMDSMI_COMPUTE_PARTITION_INVALID', @@ -403,14 +424,14 @@ amdsmi_memory_partition_type_t__enumvalues = { 0: 'AMDSMI_MEMORY_PARTITION_UNKNOWN', 1: 'AMDSMI_MEMORY_PARTITION_NPS1', 2: 'AMDSMI_MEMORY_PARTITION_NPS2', - 3: 'AMDSMI_MEMORY_PARTITION_NPS4', - 4: 'AMDSMI_MEMORY_PARTITION_NPS8', + 4: 'AMDSMI_MEMORY_PARTITION_NPS4', + 8: 'AMDSMI_MEMORY_PARTITION_NPS8', } AMDSMI_MEMORY_PARTITION_UNKNOWN = 0 AMDSMI_MEMORY_PARTITION_NPS1 = 1 AMDSMI_MEMORY_PARTITION_NPS2 = 2 -AMDSMI_MEMORY_PARTITION_NPS4 = 3 -AMDSMI_MEMORY_PARTITION_NPS8 = 4 +AMDSMI_MEMORY_PARTITION_NPS4 = 4 +AMDSMI_MEMORY_PARTITION_NPS8 = 8 amdsmi_memory_partition_type_t = ctypes.c_uint32 # enum # values for enumeration 'amdsmi_temperature_type_t' @@ -979,6 +1000,31 @@ union_amdsmi_nps_caps_t._fields_ = [ ] amdsmi_nps_caps_t = union_amdsmi_nps_caps_t +class struct_amdsmi_memory_partition_config_t(Structure): + pass + +class 
struct_numa_range_(Structure): + pass + +struct_numa_range_._pack_ = 1 # source:False +struct_numa_range_._fields_ = [ + ('memory_type', amdsmi_vram_type_t), + ('PADDING_0', ctypes.c_ubyte * 4), + ('start', ctypes.c_uint64), + ('end', ctypes.c_uint64), +] + +struct_amdsmi_memory_partition_config_t._pack_ = 1 # source:False +struct_amdsmi_memory_partition_config_t._fields_ = [ + ('partition_caps', amdsmi_nps_caps_t), + ('mp_mode', amdsmi_memory_partition_type_t), + ('num_numa_ranges', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('numa_range', struct_numa_range_ * 32), + ('reserved', ctypes.c_uint64 * 11), +] + +amdsmi_memory_partition_config_t = struct_amdsmi_memory_partition_config_t class struct_amdsmi_accelerator_partition_profile_t(Structure): pass @@ -995,6 +1041,34 @@ struct_amdsmi_accelerator_partition_profile_t._fields_ = [ ] amdsmi_accelerator_partition_profile_t = struct_amdsmi_accelerator_partition_profile_t +class struct_amdsmi_accelerator_partition_resource_profile_t(Structure): + pass + +struct_amdsmi_accelerator_partition_resource_profile_t._pack_ = 1 # source:False +struct_amdsmi_accelerator_partition_resource_profile_t._fields_ = [ + ('profile_index', ctypes.c_uint32), + ('resource_type', amdsmi_accelerator_partition_resource_type_t), + ('partition_resource', ctypes.c_uint32), + ('num_partitions_share_resource', ctypes.c_uint32), + ('reserved', ctypes.c_uint64 * 6), +] + +amdsmi_accelerator_partition_resource_profile_t = struct_amdsmi_accelerator_partition_resource_profile_t +class struct_amdsmi_accelerator_partition_profile_config_t(Structure): + pass + +struct_amdsmi_accelerator_partition_profile_config_t._pack_ = 1 # source:False +struct_amdsmi_accelerator_partition_profile_config_t._fields_ = [ + ('num_profiles', ctypes.c_uint32), + ('num_resource_profiles', ctypes.c_uint32), + ('resource_profiles', struct_amdsmi_accelerator_partition_resource_profile_t * 32), + ('default_profile_index', ctypes.c_uint32), + ('PADDING_0', 
ctypes.c_ubyte * 4), + ('profiles', struct_amdsmi_accelerator_partition_profile_t * 32), + ('reserved', ctypes.c_uint64 * 30), +] + +amdsmi_accelerator_partition_profile_config_t = struct_amdsmi_accelerator_partition_profile_config_t # values for enumeration 'amdsmi_link_type_t' amdsmi_link_type_t__enumvalues = { @@ -2181,6 +2255,12 @@ amdsmi_get_gpu_memory_usage.argtypes = [amdsmi_processor_handle, amdsmi_memory_t amdsmi_get_gpu_bad_page_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_bad_page_info amdsmi_get_gpu_bad_page_info.restype = amdsmi_status_t amdsmi_get_gpu_bad_page_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(struct_amdsmi_retired_page_record_t)] +amdsmi_get_gpu_bad_page_threshold = _libraries['libamd_smi.so'].amdsmi_get_gpu_bad_page_threshold +amdsmi_get_gpu_bad_page_threshold.restype = amdsmi_status_t +amdsmi_get_gpu_bad_page_threshold.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32)] +amdsmi_gpu_validate_ras_eeprom = _libraries['libamd_smi.so'].amdsmi_gpu_validate_ras_eeprom +amdsmi_gpu_validate_ras_eeprom.restype = amdsmi_status_t +amdsmi_gpu_validate_ras_eeprom.argtypes = [amdsmi_processor_handle] amdsmi_get_gpu_ras_feature_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_ras_feature_info amdsmi_get_gpu_ras_feature_info.restype = amdsmi_status_t amdsmi_get_gpu_ras_feature_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_ras_feature_t)] @@ -2382,9 +2462,21 @@ amdsmi_get_gpu_memory_partition.argtypes = [amdsmi_processor_handle, ctypes.POIN amdsmi_set_gpu_memory_partition = _libraries['libamd_smi.so'].amdsmi_set_gpu_memory_partition amdsmi_set_gpu_memory_partition.restype = amdsmi_status_t amdsmi_set_gpu_memory_partition.argtypes = [amdsmi_processor_handle, amdsmi_memory_partition_type_t] +amdsmi_get_gpu_memory_partition_config = _libraries['libamd_smi.so'].amdsmi_get_gpu_memory_partition_config +amdsmi_get_gpu_memory_partition_config.restype = amdsmi_status_t 
+amdsmi_get_gpu_memory_partition_config.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_memory_partition_config_t)] +amdsmi_set_gpu_memory_partition_mode = _libraries['libamd_smi.so'].amdsmi_set_gpu_memory_partition_mode +amdsmi_set_gpu_memory_partition_mode.restype = amdsmi_status_t +amdsmi_set_gpu_memory_partition_mode.argtypes = [amdsmi_processor_handle, amdsmi_memory_partition_type_t] +amdsmi_get_gpu_accelerator_partition_profile_config = _libraries['libamd_smi.so'].amdsmi_get_gpu_accelerator_partition_profile_config +amdsmi_get_gpu_accelerator_partition_profile_config.restype = amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile_config.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_accelerator_partition_profile_config_t)] amdsmi_get_gpu_accelerator_partition_profile = _libraries['libamd_smi.so'].amdsmi_get_gpu_accelerator_partition_profile amdsmi_get_gpu_accelerator_partition_profile.restype = amdsmi_status_t amdsmi_get_gpu_accelerator_partition_profile.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_accelerator_partition_profile_t), ctypes.POINTER(ctypes.c_uint32)] +amdsmi_set_gpu_accelerator_partition_profile = _libraries['libamd_smi.so'].amdsmi_set_gpu_accelerator_partition_profile +amdsmi_set_gpu_accelerator_partition_profile.restype = amdsmi_status_t +amdsmi_set_gpu_accelerator_partition_profile.argtypes = [amdsmi_processor_handle, uint32_t] amdsmi_init_gpu_event_notification = _libraries['libamd_smi.so'].amdsmi_init_gpu_event_notification amdsmi_init_gpu_event_notification.restype = amdsmi_status_t amdsmi_init_gpu_event_notification.argtypes = [amdsmi_processor_handle] @@ -2591,13 +2683,17 @@ amdsmi_get_esmi_err_msg = _libraries['libamd_smi.so'].amdsmi_get_esmi_err_msg amdsmi_get_esmi_err_msg.restype = amdsmi_status_t amdsmi_get_esmi_err_msg.argtypes = [amdsmi_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))] __all__ = \ - ['AGG_BW0', 'AMDSMI_ACCELERATOR_PARTITION_CPX', + ['AGG_BW0', 
'AMDSMI_ACCELERATOR_DECODER', + 'AMDSMI_ACCELERATOR_DMA', 'AMDSMI_ACCELERATOR_ENCODER', + 'AMDSMI_ACCELERATOR_JPEG', 'AMDSMI_ACCELERATOR_MAX', + 'AMDSMI_ACCELERATOR_PARTITION_CPX', 'AMDSMI_ACCELERATOR_PARTITION_DPX', 'AMDSMI_ACCELERATOR_PARTITION_INVALID', + 'AMDSMI_ACCELERATOR_PARTITION_MAX', 'AMDSMI_ACCELERATOR_PARTITION_QPX', 'AMDSMI_ACCELERATOR_PARTITION_SPX', - 'AMDSMI_ACCELERATOR_PARTITION_TPX', 'AMDSMI_AVERAGE_POWER', - 'AMDSMI_CACHE_PROPERTY_CPU_CACHE', + 'AMDSMI_ACCELERATOR_PARTITION_TPX', 'AMDSMI_ACCELERATOR_XCC', + 'AMDSMI_AVERAGE_POWER', 'AMDSMI_CACHE_PROPERTY_CPU_CACHE', 'AMDSMI_CACHE_PROPERTY_DATA_CACHE', 'AMDSMI_CACHE_PROPERTY_ENABLED', 'AMDSMI_CACHE_PROPERTY_INST_CACHE', @@ -2737,6 +2833,7 @@ __all__ = \ 'AMDSMI_REG_XGMI', 'AMDSMI_STATUS_ADDRESS_FAULT', 'AMDSMI_STATUS_AMDGPU_RESTART_ERR', 'AMDSMI_STATUS_API_FAILED', 'AMDSMI_STATUS_ARG_PTR_NULL', 'AMDSMI_STATUS_BUSY', + 'AMDSMI_STATUS_CORRUPTED_EEPROM', 'AMDSMI_STATUS_DRIVER_NOT_LOADED', 'AMDSMI_STATUS_DRM_ERROR', 'AMDSMI_STATUS_FAIL_LOAD_MODULE', 'AMDSMI_STATUS_FAIL_LOAD_SYMBOL', 'AMDSMI_STATUS_FILE_ERROR', @@ -2801,7 +2898,10 @@ __all__ = \ 'AMDSMI_XGMI_STATUS_ERROR', 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS', 'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN', 'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t', + 'amdsmi_accelerator_partition_profile_config_t', 'amdsmi_accelerator_partition_profile_t', + 'amdsmi_accelerator_partition_resource_profile_t', + 'amdsmi_accelerator_partition_resource_type_t', 'amdsmi_accelerator_partition_type_t', 'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t', 'amdsmi_cache_property_type_t', 'amdsmi_card_form_factor_t', @@ -2849,9 +2949,11 @@ __all__ = \ 'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg', 'amdsmi_get_fw_info', 'amdsmi_get_gpu_accelerator_partition_profile', + 'amdsmi_get_gpu_accelerator_partition_profile_config', 'amdsmi_get_gpu_activity', 'amdsmi_get_gpu_asic_info', 'amdsmi_get_gpu_available_counters', - 
'amdsmi_get_gpu_bad_page_info', 'amdsmi_get_gpu_bdf_id', + 'amdsmi_get_gpu_bad_page_info', + 'amdsmi_get_gpu_bad_page_threshold', 'amdsmi_get_gpu_bdf_id', 'amdsmi_get_gpu_board_info', 'amdsmi_get_gpu_cache_info', 'amdsmi_get_gpu_compute_partition', 'amdsmi_get_gpu_compute_process_gpus', @@ -2865,6 +2967,7 @@ __all__ = \ 'amdsmi_get_gpu_id', 'amdsmi_get_gpu_kfd_info', 'amdsmi_get_gpu_mem_overdrive_level', 'amdsmi_get_gpu_memory_partition', + 'amdsmi_get_gpu_memory_partition_config', 'amdsmi_get_gpu_memory_reserved_pages', 'amdsmi_get_gpu_memory_total', 'amdsmi_get_gpu_memory_usage', 'amdsmi_get_gpu_metrics_header_info', @@ -2905,15 +3008,16 @@ __all__ = \ 'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter', 'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter', 'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t', - 'amdsmi_gpu_read_counter', 'amdsmi_gpu_xcp_metrics_t', - 'amdsmi_gpu_xgmi_error_status', 'amdsmi_hsmp_freqlimit_src_names', - 'amdsmi_hsmp_metrics_table_t', 'amdsmi_init', - 'amdsmi_init_flags_t', 'amdsmi_init_gpu_event_notification', - 'amdsmi_io_bw_encoding_t', 'amdsmi_io_link_type_t', - 'amdsmi_is_P2P_accessible', + 'amdsmi_gpu_read_counter', 'amdsmi_gpu_validate_ras_eeprom', + 'amdsmi_gpu_xcp_metrics_t', 'amdsmi_gpu_xgmi_error_status', + 'amdsmi_hsmp_freqlimit_src_names', 'amdsmi_hsmp_metrics_table_t', + 'amdsmi_init', 'amdsmi_init_flags_t', + 'amdsmi_init_gpu_event_notification', 'amdsmi_io_bw_encoding_t', + 'amdsmi_io_link_type_t', 'amdsmi_is_P2P_accessible', 'amdsmi_is_gpu_power_management_enabled', 'amdsmi_kfd_info_t', 'amdsmi_link_id_bw_type_t', 'amdsmi_link_metrics_t', 'amdsmi_link_type_t', 'amdsmi_memory_page_status_t', + 'amdsmi_memory_partition_config_t', 'amdsmi_memory_partition_type_t', 'amdsmi_memory_type_t', 'amdsmi_mm_ip_t', 'amdsmi_name_value_t', 'amdsmi_nps_caps_t', 'amdsmi_od_vddc_point_t', 'amdsmi_od_volt_curve_t', @@ -2936,10 +3040,12 @@ __all__ = \ 'amdsmi_set_cpu_socket_boostlimit', 
'amdsmi_set_cpu_socket_lclk_dpm_level', 'amdsmi_set_cpu_socket_power_cap', 'amdsmi_set_cpu_xgmi_width', + 'amdsmi_set_gpu_accelerator_partition_profile', 'amdsmi_set_gpu_clk_limit', 'amdsmi_set_gpu_clk_range', 'amdsmi_set_gpu_compute_partition', 'amdsmi_set_gpu_event_notification_mask', 'amdsmi_set_gpu_fan_speed', 'amdsmi_set_gpu_memory_partition', + 'amdsmi_set_gpu_memory_partition_mode', 'amdsmi_set_gpu_od_clk_info', 'amdsmi_set_gpu_od_volt_info', 'amdsmi_set_gpu_overdrive_level', 'amdsmi_set_gpu_pci_bandwidth', 'amdsmi_set_gpu_perf_determinism_mode', @@ -2962,7 +3068,9 @@ __all__ = \ 'amdsmi_xgmi_link_status_t', 'amdsmi_xgmi_link_status_type_t', 'amdsmi_xgmi_status_t', 'processor_type_t', 'size_t', 'struct__links', 'struct_amd_metrics_table_header_t', + 'struct_amdsmi_accelerator_partition_profile_config_t', 'struct_amdsmi_accelerator_partition_profile_t', + 'struct_amdsmi_accelerator_partition_resource_profile_t', 'struct_amdsmi_asic_info_t', 'struct_amdsmi_board_info_t', 'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t', 'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t', @@ -2977,6 +3085,7 @@ __all__ = \ 'struct_amdsmi_gpu_xcp_metrics_t', 'struct_amdsmi_hsmp_metrics_table_t', 'struct_amdsmi_kfd_info_t', 'struct_amdsmi_link_id_bw_type_t', 'struct_amdsmi_link_metrics_t', + 'struct_amdsmi_memory_partition_config_t', 'struct_amdsmi_name_value_t', 'struct_amdsmi_od_vddc_point_t', 'struct_amdsmi_od_volt_curve_t', 'struct_amdsmi_od_volt_freq_data_t', @@ -2996,7 +3105,7 @@ __all__ = \ 'struct_amdsmi_vram_usage_t', 'struct_amdsmi_xgmi_info_t', 'struct_amdsmi_xgmi_link_status_t', 'struct_cache_', 'struct_engine_usage_', 'struct_fw_info_list_', - 'struct_memory_usage_', 'struct_nps_flags_', + 'struct_memory_usage_', 'struct_nps_flags_', 'struct_numa_range_', 'struct_pcie_metric_', 'struct_pcie_static_', 'struct_amdsmi_bdf_t', 'uint32_t', 'uint64_t', 'uint8_t', 'union_amdsmi_bdf_t', 'union_amdsmi_nps_caps_t'] diff --git 
a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 497a08e2f4..ecb299dec2 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -440,6 +440,31 @@ typedef enum { typedef rsmi_memory_partition_type_t rsmi_memory_partition_type; /// \endcond +/** + * @brief XCP resources. + * This enum is used to identify + * various accelerator resource types. + */ +typedef enum { + RSMI_ACCELERATOR_XCC, + RSMI_ACCELERATOR_ENCODER, + RSMI_ACCELERATOR_DECODER, + RSMI_ACCELERATOR_DMA, + RSMI_ACCELERATOR_JPEG, + RSMI_ACCELERATOR_MAX +} rsmi_accelerator_partition_resource_type_t; + +/** + * @brief Accelerator Partition Resources. + * This struct is used to identify various partition resource profiles. + */ +typedef struct { + rsmi_accelerator_partition_resource_type_t resource_type; + uint32_t partition_resource; //!< Resources a partition can use, which may be shared + uint32_t num_partitions_share_resource; //!< If it is greater than 1, then resource is shared. + uint64_t reserved[6]; +} rsmi_accelerator_partition_resource_profile_t; + /** * @brief Temperature Metrics. This enum is used to identify various * temperature metrics. Corresponding values will be in millidegress @@ -4625,6 +4650,192 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id); +/** + * @brief Retrieves the available compute partition capabilities + * for a desired device + * + * @details + * Given a device index @p dv_ind and a string @p compute_partition_caps , + * and uint32 @p len , this function will attempt to obtain the device's + * available compute partition capabilities string. Upon successful + * retrieval, the obtained device's available compute partition capabilities + * string shall be stored in the passed @p compute_partition_caps + * char string variable.
+ * + * @param[in] dv_ind a device index + * + * @param[inout] compute_partition_caps a pointer to a char string variable, + * which the device's available compute partition capabilities will be written to. + * + * @param[in] len the length of the caller provided buffer @p len , + * suggested length is 30 or greater. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire memory partition value. In this case, + * only @p len bytes will be written. + * + */ +rsmi_status_t +rsmi_dev_compute_partition_capabilities_get(uint32_t dv_ind, char *compute_partition_caps, + uint32_t len); + +/** + * @brief Retrieves the compute partition supported xcp configs + * for a desired device + * + * @details + * Given a device index @p dv_ind and a string @p supported_configs , + * and uint32 @p len , this function will attempt to obtain the device's + * compute partition supported xcp configs string. Upon successful + * retrieval, the obtained device's available compute partition supported xcp configs + * string shall be stored in the passed @p supported_configs + * char string variable. + * + * @param[in] dv_ind a device index + * + * @param[inout] supported_configs a pointer to a char string variable, + * which the device's compute partition supported xcp configs will be written to. + * + * @param[in] len the length of the caller provided buffer @p len , + * suggested length is 30 or greater.
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire memory partition value. In this case, + * only @p len bytes will be written. + * + */ +rsmi_status_t +rsmi_dev_compute_partition_supported_xcp_configs_get(uint32_t dv_ind, char *supported_configs, + uint32_t len); + +/** + * @brief Retrieves the compute partition supported NPS configs + * for a desired device + * + * @details + * Given a device index @p dv_ind and a string @p supported_configs , + * and uint32 @p len , this function will attempt to obtain the device's + * compute partition supported NPS configs string. Upon successful + * retrieval, the obtained device's available compute partition supported NPS configs + * string shall be stored in the passed @p supported_configs + * char string variable. + * + * @param[in] dv_ind a device index + * + * @param[inout] supported_configs a pointer to a char string variable, + * which the device's compute partition supported NPS configs will be written to. + * + * @param[in] len the length of the caller provided buffer @p len , + * suggested length is 30 or greater. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire memory partition value. In this case, + * only @p len bytes will be written.
+ * + */ +rsmi_status_t +rsmi_dev_compute_partition_supported_nps_configs_get(uint32_t dv_ind, char *supported_configs, + uint32_t len); + +/** + * @brief Retrieves the current compute partition xcp config + * for a desired device + * + * @details + * Given a device index @p dv_ind and a string @p current_xcp_config , + * and uint32 @p len , this function will attempt to obtain the device's + * current compute partition xcp config string. Upon successful + * retrieval, the obtained device's current compute partition xcp config + * string shall be stored in the passed @p current_xcp_config + * char string variable. + * + * @param[in] dv_ind a device index + * + * @param[inout] current_xcp_config a pointer to a char string variable, + * which the device's current compute partition xcp config will be written to. + * + * @param[in] len the length of the caller provided buffer @p len , + * suggested length is 30 or greater. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire memory partition value. In this case, + * only @p len bytes will be written. + * + */ +rsmi_status_t rsmi_dev_current_compute_xcp_config_get(uint32_t dv_ind, char *current_xcp_config, + uint32_t len); + +/** + * @brief Modifies a selected device's compute partition XCP config setting. + * + * @details Given a device index @p dv_ind, a type of compute partition + * @p xcp_config, this function will attempt to update the selected + * device's compute partition XCP config.
+ * + * @param[in] dv_ind a device index + * + * @param[in] xcp_config using enum ::rsmi_compute_partition_type_t, + * define what the selected device's compute partition XCP config should be + * updated to. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_PERMISSION function requires root access + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_SETTING_UNAVAILABLE the provided setting is + * unavailable for current device + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired + * because it is already being used - device is busy + * + */ +rsmi_status_t +rsmi_dev_compute_partition_xcp_config_set(uint32_t dv_ind, + rsmi_compute_partition_type_t xcp_config); + +/** + * @brief Retrieves a selected device's compute partition resource profile. + * + * @details Given a device index @p dv_ind, a pointer to a requested resource of + * rsmi_accelerator_partition_resource_type_t @p type, and a rsmi_accelerator_partition_resource_profile_t + * @p profile this function will write the current XCP config's + * resource profile to its @p profile.
+ * + * @param[in] dv_ind a device index + * + * @param[in] type a pointer to a requested resource using enum ::rsmi_accelerator_partition_resource_type_t + * + * @param[inout] profile a pointer to the requested rsmi_accelerator_partition_resource_profile_t details + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + */ +rsmi_status_t rsmi_dev_compute_partition_resource_profile_get(uint32_t dv_ind, + rsmi_accelerator_partition_resource_type_t *type, + rsmi_accelerator_partition_resource_profile_t *profile); + /** @} */ // end of ComputePartition /*****************************************************************************/ diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h index 82a13b373b..a74ff680c2 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -163,6 +163,26 @@ enum DevInfoTypes { kDevComputePartition, kDevMemoryPartition, kDevAvailableMemoryPartition, + kDevSupportedXcpConfigs, + kDevSupportedNpsConfigs, + kDevXcpConfig, + + /** + * Possible xcp config resources start + */ + kDevDecoderInst, + kDevDecoderShared, + kDevEncoderInst, + kDevEncoderShared, + kDevDmaInst, + kDevDmaShared, + kDevJpegInst, + kDevJpegShared, + kDevXccInst, + kDevXccShared, + /** + * Possible xcp config resources end + */ // The information read from pci core sysfs kDevPCieTypeStart = 1000, diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h index 0388b6e7cb..b9c8027696 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -118,6 +118,8 
@@ rsmi_status_t rsmi_dev_number_of_computes_get(uint32_t dv_ind, uint32_t* num_com std::string leftTrim(const std::string &s); std::string rightTrim(const std::string &s); std::string trim(const std::string &s); +std::string trimAllWhiteSpace(const std::string &s); +std::string removeWhitespace(const std::string &s); std::string removeNewLines(const std::string &s); std::string removeString(const std::string origStr, diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index ab7d439130..6e927ccc1b 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -5618,6 +5618,524 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, CATCH } +rsmi_status_t rsmi_dev_compute_partition_capabilities_get( + uint32_t dv_ind, char *compute_partition_caps, uint32_t len) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); + DEVICE_MUTEX + std::string availableComputePartitions; + rsmi_status_t ret = + get_dev_value_line(amd::smi::kDevAvailableComputePartition, + dv_ind, &availableComputePartitions); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); + return ret; + } + + std::size_t length = availableComputePartitions.copy(compute_partition_caps, len-1); + compute_partition_caps[length]='\0'; + + if (len < (availableComputePartitions.size() + 1)) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) + << " | Cause: requested size was insufficient" + << " | Returning = " 
+ << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INSUFFICIENT_SIZE; + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) + << " | Data: " << compute_partition_caps + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_TRACE(ss); + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_compute_partition_supported_xcp_configs_get(uint32_t dv_ind, + char *supported_configs, uint32_t len) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); + DEVICE_MUTEX + std::string supported_xcp_configs; + rsmi_status_t ret = + get_dev_value_line(amd::smi::kDevSupportedXcpConfigs, + dv_ind, &supported_xcp_configs); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); + return ret; + } + + std::size_t length = supported_xcp_configs.copy(supported_configs, len-1); + supported_configs[length]='\0'; + + if (len < (supported_xcp_configs.size() + 1)) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) + << " | Cause: requested size was insufficient" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INSUFFICIENT_SIZE; + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << 
amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) + << " | Data: " << supported_configs + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_TRACE(ss); + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_compute_partition_supported_nps_configs_get(uint32_t dv_ind, + char *supported_configs, uint32_t len) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); + DEVICE_MUTEX + std::string supported_nps_configs; + rsmi_status_t ret = + get_dev_value_line(amd::smi::kDevSupportedNpsConfigs, + dv_ind, &supported_nps_configs); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedNpsConfigs) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); + return ret; + } + + std::size_t length = supported_nps_configs.copy(supported_configs, len-1); + supported_configs[length]='\0'; + + if (len < (supported_nps_configs.size() + 1)) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedNpsConfigs) + << " | Cause: requested size was insufficient" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INSUFFICIENT_SIZE; + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedNpsConfigs) + << " | Data: " << supported_configs + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_TRACE(ss); + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_current_compute_xcp_config_get( + uint32_t dv_ind, char 
*current_xcp_config, uint32_t len) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); + DEVICE_MUTEX + std::string currentXcpConfigStr; + rsmi_status_t ret = + get_dev_value_line(amd::smi::kDevXcpConfig, + dv_ind, ¤tXcpConfigStr); + if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); + return ret; + } + + std::size_t length = currentXcpConfigStr.copy(current_xcp_config, len-1); + current_xcp_config[length]='\0'; + + if (len < (currentXcpConfigStr.size() + 1)) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Cause: requested size was insufficient" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INSUFFICIENT_SIZE) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INSUFFICIENT_SIZE; + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Data: " << currentXcpConfigStr + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_TRACE(ss); + return ret; + CATCH +} + +rsmi_status_t +rsmi_dev_compute_partition_xcp_config_set(uint32_t dv_ind, + rsmi_compute_partition_type_t xcp_config) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); + REQUIRE_ROOT_ACCESS + if (!amd::smi::is_sudo_user()) { + return RSMI_STATUS_PERMISSION; + } + std::string currentXcpConfig = ""; + std::string newXcpConfigStr = ""; + std::string availableXcpConfigsStr 
= ""; + const int kLen30 = 30; + char available_xcp_configs[kLen30]; + available_xcp_configs[0] = '\0'; + const int kLen5 = 5; + char current_xcp_config[kLen5]; + current_xcp_config[0] = '\0'; + + switch (xcp_config) { + case RSMI_COMPUTE_PARTITION_CPX: + case RSMI_COMPUTE_PARTITION_SPX: + case RSMI_COMPUTE_PARTITION_DPX: + case RSMI_COMPUTE_PARTITION_TPX: + case RSMI_COMPUTE_PARTITION_QPX: + newXcpConfigStr = + mapRSMIToStringComputePartitionTypes.at(xcp_config); + break; + case RSMI_COMPUTE_PARTITION_INVALID: + default: + newXcpConfigStr = + mapRSMIToStringComputePartitionTypes.at(RSMI_COMPUTE_PARTITION_INVALID); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Data: " << newXcpConfigStr + << " | Cause: requested setting was invalid" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; + } + + // Confirm what we are trying to set is available, otherwise provide + // RSMI_STATUS_INVALID_ARGS + rsmi_status_t available_ret = + rsmi_dev_compute_partition_supported_xcp_configs_get(dv_ind, available_xcp_configs, kLen30); + if (available_ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) + << " | Data: " << newXcpConfigStr + << " | Cause: could not find an available xcp configs file" + << " | Returning = " + << getRSMIStatusString(available_ret) << " |"; + LOG_ERROR(ss); + return available_ret; + } else { + availableXcpConfigsStr = available_xcp_configs; + } + + bool isXcpConfigAvailable = + amd::smi::containsString(availableXcpConfigsStr, + newXcpConfigStr); + if (!isXcpConfigAvailable) { + ss << __PRETTY_FUNCTION__ + << " | Fail - Detected that the requested xcp config 
is not available" + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Data (newXcpConfigStr): " << newXcpConfigStr + << " | Data (availableXcpConfigsStr): " << availableXcpConfigsStr; + LOG_ERROR(ss); + // We do not return RSMI_STATUS_INVALID_ARGS + // Instead we try setting anyways as requested + // write will provide the correct error code + } + + ss << __PRETTY_FUNCTION__ << " | about to try writing |" + << newXcpConfigStr + << "| size of string = " << newXcpConfigStr.size() + << "| size of c-string = "<< std::dec + << sizeof(newXcpConfigStr.c_str())/sizeof(newXcpConfigStr[0]) + << "| sizeof string = " << std::dec + << sizeof(newXcpConfigStr); + LOG_DEBUG(ss); + GET_DEV_FROM_INDX + DEVICE_MUTEX + int ret = dev->writeDevInfo(amd::smi::kDevXcpConfig, + newXcpConfigStr); + rsmi_status_t returnResponse = amd::smi::ErrnoToRsmiStatus(ret); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Data: " << newXcpConfigStr + << " | Returning = " + << getRSMIStatusString(returnResponse) << " |"; + LOG_TRACE(ss); + + return returnResponse; + CATCH +} + +rsmi_status_t rsmi_dev_compute_partition_resource_profile_get(uint32_t dv_ind, + rsmi_accelerator_partition_resource_type_t *type, + rsmi_accelerator_partition_resource_profile_t *profile) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); + if (type == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Cause: user sent invalid arguments, type was a null ptr" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS, false); + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; + 
} + // initialize the profile + profile->partition_resource = std::numeric_limits::max(); + profile->num_partitions_share_resource = std::numeric_limits::max(); + + DEVICE_MUTEX + rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; + // check if user provided supported resource types + // Note: RSMI_ACCELERATOR_MAX is == largest enum value + bool isAcceleratorTypeValid = false; + for (int i = 0; i <= RSMI_ACCELERATOR_MAX; i++) { + if (*type == i) { + isAcceleratorTypeValid = true; + break; + } + } + if (isAcceleratorTypeValid == false) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevXcpConfig) + << " | Cause: user sent invalid arguments, type was out of range" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS, false); + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; + } + amd::smi::DevInfoTypes dev_info_type_inst; + amd::smi::DevInfoTypes dev_info_type_shared; + if (*type == RSMI_ACCELERATOR_XCC) { + profile->resource_type = RSMI_ACCELERATOR_XCC; + dev_info_type_inst = amd::smi::kDevXccInst; + dev_info_type_shared = amd::smi::kDevXccShared; + std::string val_str; + ret = get_dev_value_str(amd::smi::kDevXccInst, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->partition_resource = static_cast(val_ul); + } + } + + val_str.clear(); + ret = get_dev_value_str(amd::smi::kDevXccShared, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->num_partitions_share_resource = static_cast(val_ul); + } + } + } + + if (*type == RSMI_ACCELERATOR_ENCODER) { + profile->resource_type = RSMI_ACCELERATOR_ENCODER; + dev_info_type_inst = amd::smi::kDevEncoderInst; + dev_info_type_shared = 
amd::smi::kDevEncoderShared; + std::string val_str; + ret = get_dev_value_str(amd::smi::kDevEncoderInst, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->partition_resource = static_cast(val_ul); + } + } + + val_str.clear(); + ret = get_dev_value_str(amd::smi::kDevEncoderShared, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->num_partitions_share_resource = static_cast(val_ul); + } + } + } + + if (*type == RSMI_ACCELERATOR_DECODER) { + profile->resource_type = RSMI_ACCELERATOR_DECODER; + dev_info_type_inst = amd::smi::kDevDecoderInst; + dev_info_type_shared = amd::smi::kDevDecoderShared; + std::string val_str; + ret = get_dev_value_str(amd::smi::kDevDecoderInst, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->partition_resource = static_cast(val_ul); + } + } + + val_str.clear(); + ret = get_dev_value_str(amd::smi::kDevDecoderShared, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->num_partitions_share_resource = static_cast(val_ul); + } + } + } + + if (*type == RSMI_ACCELERATOR_DMA) { + profile->resource_type = RSMI_ACCELERATOR_DMA; + dev_info_type_inst = amd::smi::kDevDmaInst; + dev_info_type_shared = amd::smi::kDevDmaShared; + std::string val_str; + ret = get_dev_value_str(amd::smi::kDevDmaInst, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->partition_resource = static_cast(val_ul); + } + } + + val_str.clear(); + ret = get_dev_value_str(amd::smi::kDevDmaShared, 
dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->num_partitions_share_resource = static_cast(val_ul); + } + } + } + + // RSMI_ACCELERATOR_MAX == RSMI_ACCELERATOR_JPEG + if (*type == RSMI_ACCELERATOR_JPEG) { + profile->resource_type = RSMI_ACCELERATOR_JPEG; + dev_info_type_inst = amd::smi::kDevJpegInst; + dev_info_type_shared = amd::smi::kDevJpegShared; + std::string val_str; + ret = get_dev_value_str(amd::smi::kDevJpegInst, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->partition_resource = static_cast(val_ul); + } + } + + val_str.clear(); + ret = get_dev_value_str(amd::smi::kDevJpegShared, dv_ind, &val_str); + if (ret == RSMI_STATUS_SUCCESS) { + uint64_t val_ul = strtoul(val_str.c_str(), nullptr, 10); + if (val_ul <= std::numeric_limits::max()) { + profile->num_partitions_share_resource = static_cast(val_ul); + } + } + } + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type (partition_resource): " + << amd::smi::Device::get_type_string(dev_info_type_inst) + << " | Data: " << profile->partition_resource + << " | Type (num_partitions_share_resource): " + << amd::smi::Device::get_type_string(dev_info_type_shared) + << " | Data: " << profile->num_partitions_share_resource + << " | Returning = " + << getRSMIStatusString(ret, false) << " |"; + LOG_TRACE(ss); + + return ret; + CATCH +} + static rsmi_status_t get_memory_partition(uint32_t dv_ind, std::string &memory_partition) { TRY diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 6715cfa6d4..590a8f965f 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -120,6 +120,21 @@ static 
const char *kDevAvailableComputePartitionFName = static const char *kDevComputePartitionFName = "current_compute_partition"; static const char *kDevMemoryPartitionFName = "current_memory_partition"; static const char *kDevAvailableMemoryPartitionFName = "available_memory_partition"; +static const char *kDevSupportedXcpConfigsFName = "compute_partition_config/supported_xcp_configs"; +static const char *kDevSupportedNpsConfigsFName = "compute_partition_config/supported_nps_configs"; +static const char *kDevXcpConfigFName = "compute_partition_config/xcp_config"; + +// XCP config resource files - not every file will exist in all ASICs (ex. Decoders vs Encoders) +static const char *kDevDecoderInstFName = "compute_partition_config/dec/num_inst"; +static const char *kDevDecoderSharedFName = "compute_partition_config/dec/num_shared"; +static const char *kDevEncoderInstFName = "compute_partition_config/enc/num_inst"; +static const char *kDevEncoderSharedFName = "compute_partition_config/enc/num_shared"; +static const char *kDevDmaInstFName = "compute_partition_config/dma/num_inst"; +static const char *kDevDmaSharedFName = "compute_partition_config/dma/num_shared"; +static const char *kDevJpegInstFName = "compute_partition_config/jpeg/num_inst"; +static const char *kDevJpegSharedFName = "compute_partition_config/jpeg/num_shared"; +static const char *kDevXccInstFName = "compute_partition_config/xcc/num_inst"; +static const char *kDevXccSharedFName = "compute_partition_config/xcc/num_shared"; // Firmware version files static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version"; @@ -309,6 +324,21 @@ static const std::map kDevAttribNameMap = { {kDevComputePartition, kDevComputePartitionFName}, {kDevMemoryPartition, kDevMemoryPartitionFName}, {kDevAvailableMemoryPartition, kDevAvailableMemoryPartitionFName}, + {kDevSupportedXcpConfigs, kDevSupportedXcpConfigsFName}, + {kDevSupportedNpsConfigs, kDevSupportedNpsConfigsFName}, + {kDevXcpConfig, kDevXcpConfigFName}, + + // 
XCP config resource files + {kDevDecoderInst, kDevDecoderInstFName}, + {kDevDecoderShared, kDevDecoderSharedFName}, + {kDevEncoderInst, kDevEncoderInstFName}, + {kDevEncoderShared, kDevEncoderSharedFName}, + {kDevDmaInst, kDevDmaInstFName}, + {kDevDmaShared, kDevDmaSharedFName}, + {kDevJpegInst, kDevJpegInstFName}, + {kDevJpegShared, kDevJpegSharedFName}, + {kDevXccInst, kDevXccInstFName}, + {kDevXccShared, kDevXccSharedFName}, }; static const std::map kDevPerfLvlMap = { @@ -466,6 +496,20 @@ Device::devInfoTypesStrings = { {kDevXgmiPlpd, "kDevXgmiPlpd"}, {kDevProcessIsolation, "kDevProcessIsolation"}, {kDevShaderClean, "kDevShaderClean"}, + {kDevSupportedXcpConfigs, "kDevSupportedXcpConfigs"}, + {kDevSupportedNpsConfigs, "kDevSupportedNpsConfigs"}, + {kDevXcpConfig, "kDevXcpConfig"}, + + {kDevDecoderInst, "kDevDecoderInst"}, + {kDevDecoderShared, "kDevDecoderShared"}, + {kDevEncoderInst, "kDevEncoderInst"}, + {kDevEncoderShared, "kDevEncoderShared"}, + {kDevDmaInst, "kDevDmaInst"}, + {kDevDmaShared, "kDevDmaShared"}, + {kDevJpegInst, "kDevJpegInst"}, + {kDevJpegShared, "kDevJpegShared"}, + {kDevXccInst, "kDevXccInst"}, + {kDevXccShared, "kDevXccShared"}, }; static const std::map kDevFuncDependsMap = { @@ -946,6 +990,7 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { return writeDevInfoStr(type, val); case kDevComputePartition: case kDevMemoryPartition: + case kDevXcpConfig: return writeDevInfoStr(type, val, true); default: @@ -1292,6 +1337,19 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevXGMIPhysicalID: case kDevAvailableMemoryPartition: case kDevProcessIsolation: + case kDevSupportedXcpConfigs: + case kDevSupportedNpsConfigs: + case kDevXcpConfig: + case kDevDecoderInst: + case kDevDecoderShared: + case kDevEncoderInst: + case kDevEncoderShared: + case kDevDmaInst: + case kDevDmaShared: + case kDevJpegInst: + case kDevJpegShared: + case kDevXccInst: + case kDevXccShared: return readDevInfoStr(type, val); break; diff 
--git a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc index d0d1394221..aa98de89a0 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc @@ -747,6 +747,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { // location_id, bdf, domain, bus, device, // partition_id} std::multimap allSystemNodes; + std::set gpuNodeIdsFound; uint32_t node_id = 0; static const int BYTE = 8; while (true) { @@ -755,9 +756,24 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); int ret_loc_id = read_node_properties(node_id, "location_id", &location_id); - read_node_properties(node_id, "domain", &domain); - if (ret_gpu_id == 0 && - !(ret_unique_id != 0 || ret_loc_id != 0 || ret_unique_id != 0)) { + int ret_domain = read_node_properties(node_id, "domain", &domain); + bool isANode = (ret_gpu_id == 0 && + (ret_domain == 0 && ret_loc_id == 0)); + ss << __PRETTY_FUNCTION__ << " | isAGpuNode: " + << (isANode ? "TRUE" : "FALSE") << "; is_vm_guest(): " + << (is_vm_guest() ? 
"TRUE" : "FALSE") + << "\nret_gpu_id: " << ret_gpu_id + << "; ret_domain: " << ret_domain + << "; ret_loc_id: " << ret_loc_id + << "; ret_unique_id: " << ret_unique_id + << "\n[node_id = " << print_unsigned_hex_and_int(node_id) << "\n" + << "; gpu_id = " << print_unsigned_hex_and_int(gpu_id) << "\n" + << "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n" + << "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n" + << "; domain = " << print_unsigned_hex_and_int(domain) + << "]\n"; + LOG_DEBUG(ss); + if (isANode || (is_vm_guest() && ret_gpu_id == 0)) { // Do not try to build a node if one of these fields // do not exist in KFD (0 as values okay) systemNode myNode; @@ -776,6 +792,24 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { myNode.s_function = myNode.s_location_id & 0x7; myNode.s_partition_id = ((myNode.s_location_id >> 28) & 0xF); if (gpu_id != 0) { // only add gpu nodes, 0 = CPU + auto ret = gpuNodeIdsFound.insert(node_id); + if (ret.second != false) { + // only print out nodes which do not already exist + ss << __PRETTY_FUNCTION__ << " | isAGpuNode: " + << (isANode ? "TRUE" : "FALSE") << "; is_vm_guest(): " + << (is_vm_guest() ? 
"TRUE" : "FALSE") + << "\nret_gpu_id: " << ret_gpu_id + << "; ret_domain: " << ret_domain + << "; ret_loc_id: " << ret_loc_id + << "; ret_unique_id: " << ret_unique_id + << "\n[node_id = " << print_unsigned_hex_and_int(node_id) << "\n" + << "; gpu_id = " << print_unsigned_hex_and_int(gpu_id) << "\n" + << "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n" + << "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n" + << "; domain = " << print_unsigned_hex_and_int(domain) << "\n" + << "]\n"; + LOG_DEBUG(ss); + } allSystemNodes.emplace(unique_id, myNode); } } else { @@ -866,7 +900,9 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; partition_id = " << std::to_string(i->second.s_partition_id) << "], "; LOG_DEBUG(ss); - AddToDeviceList(d_name, primaryBdfId); + ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #1 (secondary node) \n" + << "; bdf: " << print_unsigned_hex_and_int(primaryBdfId) << "\n"; + LOG_DEBUG(ss); } else { ss << __PRETTY_FUNCTION__ << " | primary node add ; " << " BDF = " << std::to_string(UINT64_MAX); @@ -894,6 +930,9 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; partition_id = " << std::to_string(i->second.s_partition_id) << "], "; LOG_DEBUG(ss); + ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #2 (primary node) \n" + << "; bdf: " << print_unsigned_hex_and_int(UINT64_MAX) << "\n"; + LOG_DEBUG(ss); AddToDeviceList(d_name, UINT64_MAX); } @@ -1029,6 +1068,9 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; partition_id = " << std::to_string(it->second.s_partition_id) << "], "; LOG_DEBUG(ss); + ss << __PRETTY_FUNCTION__ << " | AddToDeviceList #3 (secondary node add #2) \n" + << "; bdf: " << print_unsigned_hex_and_int(myBdfId) << "\n"; + LOG_DEBUG(ss); AddToDeviceList(secNode, myBdfId); allSystemNodes.erase(it++); numb_nodes--; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc index bdd3e1d01b..c49a42f4ca 100644 --- 
a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc @@ -383,6 +383,7 @@ std::string removeNewLines(const std::string &s) { return s; } +// Trims white space from both ends of string std::string trim(const std::string &s) { if (!s.empty()) { // remove new lines -> trim white space at ends @@ -392,6 +393,23 @@ std::string trim(const std::string &s) { return s; } +// Trims white space from both ends of string and removes all white space +std::string trimAllWhiteSpace(const std::string &s) { + if (!s.empty()) { + // remove new lines -> trim white space at ends + std::string noNewLines = trim(s); + return removeWhitespace(noNewLines); + } + return s; +} + +std::string removeWhitespace(const std::string &s) { + if (!s.empty()) { + return std::regex_replace(s, std::regex("\\s+"), ""); + } + return s; +} + // Given original string and string to remove (removeMe) // Return will provide the resulting modified string with the removed string(s) std::string removeString(const std::string origStr, @@ -908,18 +926,18 @@ std::string getBuildType() { } const char *my_fname(void) { -std::string emptyRet=""; #ifdef _GNU_SOURCE Dl_info dl_info; - dladdr((void *)my_fname, &dl_info); + dladdr(reinterpret_cast(my_fname), &dl_info); return (dl_info.dli_fname); #else + std::string emptyRet = ""; return emptyRet.c_str(); #endif } std::string getMyLibPath(void) { - std::string libName = "rocm-smi-lib"; + std::string libName = "amd-smi-lib"; std::string path = std::string(my_fname()); if (path.empty()) { path = "Could not find library path for " + libName; diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 2f06f5312e..a390e9796f 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -52,6 +52,7 @@ #include "amd_smi/impl/amd_smi_utils.h" #include "amd_smi/impl/amd_smi_processor.h" #include "rocm_smi/rocm_smi_logger.h" +#include 
"rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi.h" // a global instance of std::mutex to protect data passed during threads @@ -67,6 +68,41 @@ char proc_id[SIZE] = "\0"; } \ } while (0) +static const std::map partition_types_map = { + { AMDSMI_ACCELERATOR_PARTITION_SPX, "SPX" }, + { AMDSMI_ACCELERATOR_PARTITION_DPX, "DPX" }, + { AMDSMI_ACCELERATOR_PARTITION_TPX, "TPX" }, + { AMDSMI_ACCELERATOR_PARTITION_QPX, "QPX" }, + { AMDSMI_ACCELERATOR_PARTITION_CPX, "CPX" }, + { AMDSMI_ACCELERATOR_PARTITION_MAX, "MAX" }, +}; +static const std::map accelerator_to_RSMI = { + { AMDSMI_ACCELERATOR_PARTITION_SPX, RSMI_COMPUTE_PARTITION_SPX }, + { AMDSMI_ACCELERATOR_PARTITION_DPX, RSMI_COMPUTE_PARTITION_DPX }, + { AMDSMI_ACCELERATOR_PARTITION_TPX, RSMI_COMPUTE_PARTITION_TPX }, + { AMDSMI_ACCELERATOR_PARTITION_QPX, RSMI_COMPUTE_PARTITION_QPX }, + { AMDSMI_ACCELERATOR_PARTITION_CPX, RSMI_COMPUTE_PARTITION_CPX } +}; +static const std::map resource_types_map = { + { AMDSMI_ACCELERATOR_XCC, "XCC" }, + { AMDSMI_ACCELERATOR_ENCODER, "ENCODER" }, + { AMDSMI_ACCELERATOR_DECODER, "DECODER" }, + { AMDSMI_ACCELERATOR_DMA, "DMA" }, + { AMDSMI_ACCELERATOR_JPEG, "JPEG" }, + { AMDSMI_ACCELERATOR_MAX, "MAX" }, +}; + +static const std::map nps_amdsmi_to_RSMI = { + { AMDSMI_MEMORY_PARTITION_UNKNOWN, RSMI_MEMORY_PARTITION_UNKNOWN }, + { AMDSMI_MEMORY_PARTITION_NPS1, RSMI_MEMORY_PARTITION_NPS1 }, + { AMDSMI_MEMORY_PARTITION_NPS2, RSMI_MEMORY_PARTITION_NPS2 }, + { AMDSMI_MEMORY_PARTITION_NPS4, RSMI_MEMORY_PARTITION_NPS4 }, + { AMDSMI_MEMORY_PARTITION_NPS8, RSMI_MEMORY_PARTITION_NPS8 } +}; + static amdsmi_status_t get_gpu_device_from_handle(amdsmi_processor_handle processor_handle, amd::smi::AMDSmiGPUDevice** gpudevice) { @@ -90,22 +126,32 @@ static amdsmi_status_t get_gpu_device_from_handle(amdsmi_processor_handle proces template amdsmi_status_t rsmi_wrapper(F && f, - amdsmi_processor_handle processor_handle, Args &&... 
args) { + amdsmi_processor_handle processor_handle, uint32_t increment_gpu_id = 0, Args &&... args) { AMDSMI_CHECK_INIT(); + std::ostringstream ss; amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); if (r != AMDSMI_STATUS_SUCCESS) return r; - uint32_t gpu_index = gpu_device->get_gpu_id(); + uint32_t total_num_gpu_processors = 0; + rsmi_num_monitor_devices(&total_num_gpu_processors); + uint32_t gpu_index = gpu_device->get_gpu_id() + increment_gpu_id; + ss << __PRETTY_FUNCTION__ << " | total_num_gpu_processors: " << total_num_gpu_processors + << "; gpu_index: " << gpu_index; + LOG_DEBUG(ss); + if ((gpu_index + 1) > total_num_gpu_processors) { + ss << __PRETTY_FUNCTION__ << " | returning status = AMDSMI_STATUS_NOT_FOUND"; + LOG_INFO(ss); + return AMDSMI_STATUS_NOT_FOUND; + } + auto rstatus = std::forward(f)(gpu_index, std::forward(args)...); r = amd::smi::rsmi_to_amdsmi_status(rstatus); - std::ostringstream ss; - const char *status_string; - amdsmi_status_code_to_string(r, &status_string); + std::string status_string = smi_amdgpu_get_status_string(r, false); ss << __PRETTY_FUNCTION__ << " | returning status = " << status_string; LOG_INFO(ss); return r; @@ -137,17 +183,146 @@ amdsmi_shut_down() { amdsmi_status_t amdsmi_status_code_to_string(amdsmi_status_t status, const char **status_string) { switch (status) { + case AMDSMI_STATUS_SUCCESS: + *status_string = "AMDSMI_STATUS_SUCCESS: Call succeeded."; + break; + case AMDSMI_STATUS_INVAL: + *status_string = "AMDSMI_STATUS_INVAL: Invalid parameters."; + break; + case AMDSMI_STATUS_NOT_SUPPORTED: + *status_string = "AMDSMI_STATUS_NOT_SUPPORTED: Command not supported."; + break; + case AMDSMI_STATUS_NOT_YET_IMPLEMENTED: + *status_string = "AMDSMI_STATUS_NOT_YET_IMPLEMENTED: Not implemented yet."; + break; case AMDSMI_STATUS_FAIL_LOAD_MODULE: - *status_string = "FAIL_LOAD_MODULE: Fail to load module."; + *status_string = 
"AMDSMI_STATUS_FAIL_LOAD_MODULE: Fail to load lib module."; break; case AMDSMI_STATUS_FAIL_LOAD_SYMBOL: - *status_string = "FAIL_LOAD_SYMBOL: Fail to load symbol."; + *status_string = "AMDSMI_STATUS_FAIL_LOAD_SYMBOL: Fail to load symbol."; break; case AMDSMI_STATUS_DRM_ERROR: - *status_string = "DRM_ERROR: Fail to run function in libdrm."; + *status_string = "AMDSMI_STATUS_DRM_ERROR: Error when calling libdrm function."; + break; + case AMDSMI_STATUS_API_FAILED: + *status_string = "AMDSMI_STATUS_API_FAILED: API call failed."; + break; + case AMDSMI_STATUS_RETRY: + *status_string = "AMDSMI_STATUS_RETRY: Retry operation."; + break; + case AMDSMI_STATUS_NO_PERM: + *status_string = "AMDSMI_STATUS_NO_PERM: Permission Denied."; + break; + case AMDSMI_STATUS_INTERRUPT: + *status_string = "AMDSMI_STATUS_INTERRUPT: An interrupt occurred during" + " execution of function."; + break; + case AMDSMI_STATUS_IO: + *status_string = "AMDSMI_STATUS_IO: I/O Error."; + break; + case AMDSMI_STATUS_ADDRESS_FAULT: + *status_string = "AMDSMI_STATUS_ADDRESS_FAULT: Bad address."; + break; + case AMDSMI_STATUS_FILE_ERROR: + *status_string = "AMDSMI_STATUS_FILE_ERROR: Problem accessing a file."; + break; + case AMDSMI_STATUS_OUT_OF_RESOURCES: + *status_string = "AMDSMI_STATUS_OUT_OF_RESOURCES: Not enough memory."; + break; + case AMDSMI_STATUS_INTERNAL_EXCEPTION: + *status_string = "AMDSMI_STATUS_INTERNAL_EXCEPTION: An internal exception was caught."; + break; + case AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS: + *status_string = "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided" + " input is out of allowable or safe range."; + break; + case AMDSMI_STATUS_INIT_ERROR: + *status_string = "AMDSMI_STATUS_INIT_ERROR: An error occurred when" + " initializing internal data structures."; + break; + case AMDSMI_STATUS_REFCOUNT_OVERFLOW: + *status_string = "AMDSMI_STATUS_REFCOUNT_OVERFLOW: An internal reference" + " counter exceeded INT32_MAX."; + break; + case AMDSMI_STATUS_BUSY: + *status_string = 
"AMDSMI_STATUS_BUSY: Processor busy."; + break; + case AMDSMI_STATUS_NOT_FOUND: + *status_string = "AMDSMI_STATUS_NOT_FOUND: Processor Not found."; + break; + case AMDSMI_STATUS_NOT_INIT: + *status_string = "AMDSMI_STATUS_NOT_INIT: Processor not initialized."; + break; + case AMDSMI_STATUS_NO_SLOT: + *status_string = "AMDSMI_STATUS_NO_SLOT: No more free slot."; + break; + case AMDSMI_STATUS_DRIVER_NOT_LOADED: + *status_string = "AMDSMI_STATUS_DRIVER_NOT_LOADED: Processor driver not loaded."; + break; + case AMDSMI_STATUS_NO_DATA: + *status_string = "AMDSMI_STATUS_NO_DATA: No data was found for a given input."; + break; + case AMDSMI_STATUS_INSUFFICIENT_SIZE: + *status_string = "AMDSMI_STATUS_INSUFFICIENT_SIZE: Not enough resources" + " were available for the operation."; + break; + case AMDSMI_STATUS_UNEXPECTED_SIZE: + *status_string = "AMDSMI_STATUS_UNEXPECTED_SIZE: An unexpected amount of data" + " was read."; + break; + case AMDSMI_STATUS_UNEXPECTED_DATA: + *status_string = "AMDSMI_STATUS_UNEXPECTED_DATA: The data read or provided to" + " function is not what was expected."; + break; + case AMDSMI_STATUS_NON_AMD_CPU: + *status_string = "AMDSMI_STATUS_NON_AMD_CPU: System has different cpu than AMD."; + break; + case AMDSMI_STATUS_NO_ENERGY_DRV: + *status_string = "AMDSMI_STATUS_NO_ENERGY_DRV: Energy driver not found."; + break; + case AMDSMI_STATUS_NO_MSR_DRV: + *status_string = "AMDSMI_STATUS_NO_MSR_DRV: MSR driver not found."; + break; + case AMDSMI_STATUS_NO_HSMP_DRV: + *status_string = "AMDSMI_STATUS_NO_HSMP_DRV: HSMP driver not found."; + break; + case AMDSMI_STATUS_NO_HSMP_SUP: + *status_string = "AMDSMI_STATUS_NO_HSMP_SUP: HSMP not supported."; + break; + case AMDSMI_STATUS_NO_HSMP_MSG_SUP: + *status_string = "AMDSMI_STATUS_NO_HSMP_MSG_SUP: HSMP message/feature not supported."; + break; + case AMDSMI_STATUS_HSMP_TIMEOUT: + *status_string = "AMDSMI_STATUS_HSMP_TIMEOUT: HSMP message timed out."; + break; + case AMDSMI_STATUS_NO_DRV: + *status_string = 
"AMDSMI_STATUS_NO_DRV: No Energy and HSMP driver present."; + break; + case AMDSMI_STATUS_FILE_NOT_FOUND: + *status_string = "AMDSMI_STATUS_FILE_NOT_FOUND: file or directory not found."; + break; + case AMDSMI_STATUS_ARG_PTR_NULL: + *status_string = "AMDSMI_STATUS_ARG_PTR_NULL: Parsed argument is invalid."; + break; + case AMDSMI_STATUS_AMDGPU_RESTART_ERR: + *status_string = "AMDSMI_STATUS_AMDGPU_RESTART_ERR: AMDGPU restart failed."; + break; + case AMDSMI_STATUS_SETTING_UNAVAILABLE: + *status_string = "AMDSMI_STATUS_SETTING_UNAVAILABLE: Setting is not available."; + break; + case AMDSMI_STATUS_CORRUPTED_EEPROM: + *status_string = "AMDSMI_STATUS_CORRUPTED_EEPROM: EEPROM is corrupted."; + break; + case AMDSMI_STATUS_MAP_ERROR: + *status_string = "AMDSMI_STATUS_MAP_ERROR: The internal library error did" + " not map to a status code."; + break; + case AMDSMI_STATUS_UNKNOWN_ERROR: + *status_string = "AMDSMI_STATUS_UNKNOWN_ERROR: An unknown error occurred."; break; default: - // The case above didn't have a match, so look up the amdsmi status in the rsmi status map + // The case above didn't have a match, so look up the amdsmi status in the rsmi + // status map // If found, get the rsmi status string. If not, return unknown error string for (auto& iter : amd::smi::rsmi_status_map) { if (iter.second == status) { @@ -393,10 +568,10 @@ amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_hand } else { // ignore the errors so that it can populate as many fields as possible. 
// call rocm-smi which search multiple places for device name - status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, 0, board_info->product_name, AMDSMI_256_LENGTH); - status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, 0, board_info->product_serial, AMDSMI_NORMAL_STRING_LENGTH); } @@ -411,7 +586,7 @@ amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_hand LOG_INFO(ss); if (board_info->product_serial[0] == '\0') { - status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, 0, board_info->product_serial, AMDSMI_NORMAL_STRING_LENGTH); if (status != AMDSMI_STATUS_SUCCESS) { memset(board_info->product_serial, 0, @@ -423,8 +598,8 @@ amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_hand } if (board_info->product_name[0] == '\0') { - status = rsmi_wrapper(rsmi_dev_name_get, - processor_handle, board_info->product_name, + status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, 0, + board_info->product_name, AMDSMI_256_LENGTH); // Check if the value is in hex format if (status == AMDSMI_STATUS_SUCCESS) { @@ -443,8 +618,8 @@ amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_hand } if (board_info->manufacturer_name[0] == '\0') { - status = rsmi_wrapper(rsmi_dev_vendor_name_get, - processor_handle, board_info->manufacturer_name, + status = rsmi_wrapper(rsmi_dev_vendor_name_get, processor_handle, 0, + board_info->manufacturer_name, AMDSMI_MAX_STRING_LENGTH); if (status != AMDSMI_STATUS_SUCCESS) { memset(board_info->manufacturer_name, 0, @@ -481,8 +656,8 @@ amdsmi_status_t amdsmi_get_gpu_cache_info( return status; rsmi_gpu_cache_info_t rsmi_info; - status = rsmi_wrapper(rsmi_dev_cache_info_get, - processor_handle, &rsmi_info); + status = 
rsmi_wrapper(rsmi_dev_cache_info_get, processor_handle, 0, + &rsmi_info); if (status != AMDSMI_STATUS_SUCCESS) return status; // Sysfs cache type @@ -533,7 +708,7 @@ amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle *temperature = metric_info.temperature_vrsoc; return r_status; } - amdsmi_status_t amdsmi_status = rsmi_wrapper(rsmi_dev_temp_metric_get, processor_handle, + amdsmi_status_t amdsmi_status = rsmi_wrapper(rsmi_dev_temp_metric_get, processor_handle, 0, static_cast(sensor_type), static_cast(metric), temperature); *temperature /= 1000; @@ -670,7 +845,8 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha // default to 0xffffffff as not supported uint32_t partitition_id = std::numeric_limits::max(); auto tmp_partition_id = uint32_t(0); - amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, &(tmp_partition_id)); + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0, + &(tmp_partition_id)); // Do not return early if this value fails // continue to try getting all info if (status == AMDSMI_STATUS_SUCCESS) { @@ -923,41 +1099,44 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha amdsmi_status_t amdsmi_get_gpu_fan_rpms(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, int64_t *speed) { - return rsmi_wrapper(rsmi_dev_fan_rpms_get, processor_handle, sensor_ind, - speed); + return rsmi_wrapper(rsmi_dev_fan_rpms_get, processor_handle, 0, + sensor_ind, speed); } amdsmi_status_t amdsmi_get_gpu_fan_speed(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, int64_t *speed) { - return rsmi_wrapper(rsmi_dev_fan_speed_get, processor_handle, + return rsmi_wrapper(rsmi_dev_fan_speed_get, processor_handle, 0, sensor_ind, speed); } amdsmi_status_t amdsmi_get_gpu_fan_speed_max(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, uint64_t *max_speed) { - return 
rsmi_wrapper(rsmi_dev_fan_speed_max_get, processor_handle, - sensor_ind, max_speed); + return rsmi_wrapper(rsmi_dev_fan_speed_max_get, processor_handle, 0, + sensor_ind, max_speed); } amdsmi_status_t amdsmi_reset_gpu_fan(amdsmi_processor_handle processor_handle, uint32_t sensor_ind) { - return rsmi_wrapper(rsmi_dev_fan_reset, processor_handle, sensor_ind); + return rsmi_wrapper(rsmi_dev_fan_reset, processor_handle, 0, + sensor_ind); } amdsmi_status_t amdsmi_set_gpu_fan_speed(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, uint64_t speed) { - return rsmi_wrapper(rsmi_dev_fan_speed_set, processor_handle, - sensor_ind, speed); + return rsmi_wrapper(rsmi_dev_fan_speed_set, processor_handle, 0, + sensor_ind, speed); } amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle, uint16_t *id) { - return rsmi_wrapper(rsmi_dev_id_get, processor_handle, id); + return rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0, + id); } amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle, uint16_t *revision) { - return rsmi_wrapper(rsmi_dev_revision_get, processor_handle, revision); + return rsmi_wrapper(rsmi_dev_revision_get, processor_handle, 0, + revision); } // TODO(bliu) : add fw info from libdrm @@ -995,7 +1174,7 @@ amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle, // collect all rsmi supported fw block for (auto ite = fw_in_rsmi.begin(); ite != fw_in_rsmi.end(); ite ++) { - auto status = rsmi_wrapper(rsmi_dev_firmware_version_get, processor_handle, + auto status = rsmi_wrapper(rsmi_dev_firmware_version_get, processor_handle, 0, (*ite).second, &(info->fw_info_list[info->num_fw_info].fw_version)); if (status == AMDSMI_STATUS_SUCCESS) { @@ -1040,8 +1219,8 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name); if (status != AMDSMI_STATUS_SUCCESS) { - rsmi_wrapper(rsmi_dev_brand_get, 
processor_handle, - info->market_name, AMDSMI_256_LENGTH); + rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, + info->market_name, AMDSMI_256_LENGTH); } info->device_id = dev_info.device_id; @@ -1050,23 +1229,24 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i } else { uint64_t dv_uid = 0; - status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, &dv_uid); + status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, + &dv_uid); if (status == AMDSMI_STATUS_SUCCESS) snprintf(info->asic_serial, sizeof(info->asic_serial), "%lu", dv_uid); - status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, info->market_name, AMDSMI_256_LENGTH); - status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, - &vendor_id); + status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, 0, + &vendor_id); if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id; } // For other sysfs related information, get from rocm-smi - status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, processor_handle, - &subvendor_id); + status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, processor_handle, 0, + &subvendor_id); if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = subvendor_id; - status = rsmi_wrapper(rsmi_dev_pcie_vendor_name_get, processor_handle, - info->vendor_name, AMDSMI_MAX_STRING_LENGTH); + status = rsmi_wrapper(rsmi_dev_pcie_vendor_name_get, processor_handle, 0, + info->vendor_name, AMDSMI_MAX_STRING_LENGTH); // If vendor name is empty and the vendor id is 0x1002, set vendor name to AMD vendor string if ((info->vendor_name != NULL && info->vendor_name[0] == '\0') && info->vendor_id == 0x1002) { @@ -1078,14 +1258,14 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i // default to 0xffff as not supported info->oam_id = std::numeric_limits::max(); uint16_t tmp_oam_id = 0; - status = 
rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, - &(tmp_oam_id)); + status = rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, 0, + &(tmp_oam_id)); info->oam_id = tmp_oam_id; // default to 0xffffffff as not supported info->num_of_compute_units = std::numeric_limits::max(); auto tmp_num_of_compute_units = uint32_t(0); - status = rsmi_wrapper(amd::smi::rsmi_dev_number_of_computes_get, processor_handle, + status = rsmi_wrapper(amd::smi::rsmi_dev_number_of_computes_get, processor_handle, 0, &(tmp_num_of_compute_units)); if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { info->num_of_compute_units = tmp_num_of_compute_units; @@ -1094,7 +1274,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i // default to 0xffffffffffffffff as not supported info->target_graphics_version = std::numeric_limits::max(); auto tmp_target_gfx_version = uint64_t(0); - status = rsmi_wrapper(rsmi_dev_target_graphics_version_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_target_graphics_version_get, processor_handle, 0, &(tmp_target_gfx_version)); if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { info->target_graphics_version = tmp_target_gfx_version; @@ -1152,7 +1332,8 @@ amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle // default to 0xffffffffffffffff as not supported info->kfd_id = std::numeric_limits::max(); auto tmp_kfd_id = uint64_t(0); - status = rsmi_wrapper(rsmi_dev_guid_get, processor_handle, &(tmp_kfd_id)); + status = rsmi_wrapper(rsmi_dev_guid_get, processor_handle, 0, + &(tmp_kfd_id)); // Do not return early if this value fails // continue to try getting all info if (status == AMDSMI_STATUS_SUCCESS) { @@ -1162,7 +1343,8 @@ amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle // default to 0xffffffff as not supported info->node_id = std::numeric_limits::max(); auto tmp_node_id = uint32_t(0); - status = rsmi_wrapper(rsmi_dev_node_id_get, 
processor_handle, &(tmp_node_id)); + status = rsmi_wrapper(rsmi_dev_node_id_get, processor_handle, 0, + &(tmp_node_id)); // Do not return early if this value fails // continue to try getting all info if (status == AMDSMI_STATUS_SUCCESS) { @@ -1172,7 +1354,8 @@ amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle // default to 0xffffffff as not supported info->current_partition_id = std::numeric_limits::max(); auto tmp_current_partition_id = uint32_t(0); - status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, &(tmp_current_partition_id)); + status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0, + &(tmp_current_partition_id)); // Do not return early if this value fails // continue to try getting all info if (status == AMDSMI_STATUS_SUCCESS) { @@ -1184,23 +1367,27 @@ amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle amdsmi_status_t amdsmi_get_gpu_subsystem_id(amdsmi_processor_handle processor_handle, uint16_t *id) { - return rsmi_wrapper(rsmi_dev_subsystem_id_get, processor_handle, id); + return rsmi_wrapper(rsmi_dev_subsystem_id_get, processor_handle, 0, + id); } amdsmi_status_t amdsmi_get_gpu_subsystem_name( amdsmi_processor_handle processor_handle, char *name, size_t len) { - return rsmi_wrapper(rsmi_dev_subsystem_name_get, processor_handle, name, len); + return rsmi_wrapper(rsmi_dev_subsystem_name_get, processor_handle, 0, + name, len); } amdsmi_status_t amdsmi_get_gpu_vendor_name( amdsmi_processor_handle processor_handle, char *name, size_t len) { - return rsmi_wrapper(rsmi_dev_vendor_name_get, processor_handle, name, len); + return rsmi_wrapper(rsmi_dev_vendor_name_get, processor_handle, 0, + name, len); } amdsmi_status_t amdsmi_get_gpu_vram_vendor(amdsmi_processor_handle processor_handle, char *brand, uint32_t len) { - return rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, brand, len); + return rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, 0, + brand, 
len); } amdsmi_status_t amdsmi_get_gpu_vram_info( @@ -1249,7 +1436,8 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( // map the vendor name to enum char brand[256]; - r = rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, brand, 255); + r = rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, 0, + brand, 255); if (r == AMDSMI_STATUS_SUCCESS) { if (strcasecmp(brand, "SAMSUNG") == 0) info->vram_vendor = AMDSMI_VRAM_VENDOR__SAMSUNG; @@ -1273,7 +1461,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( info->vram_vendor = AMDSMI_VRAM_VENDOR__MICRON; } uint64_t total = 0; - r = rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, + r = rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, 0, RSMI_MEM_TYPE_VRAM, &total); if (r == AMDSMI_STATUS_SUCCESS) { info->vram_size = total / (1024 * 1024); @@ -1284,13 +1472,13 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( amdsmi_status_t amdsmi_init_gpu_event_notification(amdsmi_processor_handle processor_handle) { - return rsmi_wrapper(rsmi_event_notification_init, processor_handle); + return rsmi_wrapper(rsmi_event_notification_init, processor_handle, 0); } amdsmi_status_t amdsmi_set_gpu_event_notification_mask(amdsmi_processor_handle processor_handle, uint64_t mask) { - return rsmi_wrapper(rsmi_event_notification_mask_set, processor_handle, mask); + return rsmi_wrapper(rsmi_event_notification_mask_set, processor_handle, 0, mask); } amdsmi_status_t @@ -1326,18 +1514,18 @@ amdsmi_get_gpu_event_notification(int timeout_ms, amdsmi_status_t amdsmi_stop_gpu_event_notification( amdsmi_processor_handle processor_handle) { - return rsmi_wrapper(rsmi_event_notification_stop, processor_handle); + return rsmi_wrapper(rsmi_event_notification_stop, processor_handle, 0); } amdsmi_status_t amdsmi_gpu_counter_group_supported( amdsmi_processor_handle processor_handle, amdsmi_event_group_t group) { - return rsmi_wrapper(rsmi_dev_counter_group_supported, processor_handle, + return rsmi_wrapper(rsmi_dev_counter_group_supported, 
processor_handle, 0, static_cast(group)); } amdsmi_status_t amdsmi_gpu_create_counter(amdsmi_processor_handle processor_handle, amdsmi_event_type_t type, amdsmi_event_handle_t *evnt_handle) { - return rsmi_wrapper(rsmi_dev_counter_create, processor_handle, + return rsmi_wrapper(rsmi_dev_counter_create, processor_handle, 0, static_cast(type), static_cast(evnt_handle)); } @@ -1368,14 +1556,14 @@ amdsmi_gpu_read_counter(amdsmi_event_handle_t evt_handle, amdsmi_status_t amdsmi_get_gpu_available_counters(amdsmi_processor_handle processor_handle, amdsmi_event_group_t grp, uint32_t *available) { - return rsmi_wrapper(rsmi_counter_available_counters_get, processor_handle, + return rsmi_wrapper(rsmi_counter_available_counters_get, processor_handle, 0, static_cast(grp), available); } amdsmi_status_t amdsmi_topo_get_numa_node_number(amdsmi_processor_handle processor_handle, uint32_t *numa_node) { - return rsmi_wrapper(rsmi_topo_get_numa_node_number, processor_handle, numa_node); + return rsmi_wrapper(rsmi_topo_get_numa_node_number, processor_handle, 0, numa_node); } amdsmi_status_t @@ -1499,7 +1687,7 @@ amdsmi_status_t amdsmi_get_gpu_compute_partition(amdsmi_processor_handle processor_handle, char *compute_partition, uint32_t len) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_compute_partition_get, processor_handle, + return rsmi_wrapper(rsmi_dev_compute_partition_get, processor_handle, 0, compute_partition, len); } @@ -1507,7 +1695,7 @@ amdsmi_status_t amdsmi_set_gpu_compute_partition(amdsmi_processor_handle processor_handle, amdsmi_compute_partition_type_t compute_partition) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_compute_partition_set, processor_handle, + return rsmi_wrapper(rsmi_dev_compute_partition_set, processor_handle, 0, static_cast(compute_partition)); } @@ -1516,7 +1704,7 @@ amdsmi_status_t amdsmi_get_gpu_memory_partition(amdsmi_processor_handle processor_handle, char *memory_partition, uint32_t len) { AMDSMI_CHECK_INIT(); - return 
rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, + return rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, 0, memory_partition, len); } @@ -1526,52 +1714,85 @@ amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, AMDSMI_CHECK_INIT(); std::ostringstream ss; std::lock_guard g(myMutex); + + const uint32_t k256 = 256; + char current_partition[k256]; + std::string current_partition_str = "UNKNOWN"; + std::string req_user_partition = "UNKNOWN"; + // open libdrm connections prevents the ability to unload driver amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); ss << __PRETTY_FUNCTION__ << " | \n" - << "***********************************\n" - << "* Cleaned up - clean_up_drm() *\n" - << "***********************************\n"; + << "**************************************\n" + << "* Cleaned up - clean_up_drm() *\n" + << "**************************************\n"; LOG_INFO(ss); - amdsmi_status_t ret = rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle, - static_cast(memory_partition)); - if (ret == AMDSMI_STATUS_SUCCESS) { - const uint32_t k256 = 256; - char current_partition[k256]; - std::string current_partition_str = "UNKNOWN"; - std::string req_user_partition; - amdsmi_status_t ret_get = rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, - current_partition, k256); - if (ret_get == AMDSMI_STATUS_SUCCESS) { - current_partition_str.clear(); - current_partition_str = current_partition; - } - switch (memory_partition) { - case AMDSMI_MEMORY_PARTITION_NPS1: - req_user_partition = "NPS1"; - break; - case AMDSMI_MEMORY_PARTITION_NPS2: - req_user_partition = "NPS2"; - break; - case AMDSMI_MEMORY_PARTITION_NPS4: - req_user_partition = "NPS4"; - break; - case AMDSMI_MEMORY_PARTITION_NPS8: - req_user_partition = "NPS8"; - break; - default: - req_user_partition = "UNKNOWN"; - break; - } - if (req_user_partition == current_partition_str) { - amd::smi::AMDSmiSystem::getInstance().init_drm(); - ss << 
__PRETTY_FUNCTION__ << " | \n" - << "***********************************\n" - << "* Initialized libdrm - init_drm() *\n" - << "***********************************\n"; - LOG_INFO(ss); - } + req_user_partition.clear(); + switch (memory_partition) { + case AMDSMI_MEMORY_PARTITION_NPS1: + req_user_partition = "NPS1"; + break; + case AMDSMI_MEMORY_PARTITION_NPS2: + req_user_partition = "NPS2"; + break; + case AMDSMI_MEMORY_PARTITION_NPS4: + req_user_partition = "NPS4"; + break; + case AMDSMI_MEMORY_PARTITION_NPS8: + req_user_partition = "NPS8"; + break; + default: + req_user_partition = "UNKNOWN"; + break; } + rsmi_memory_partition_type_t rsmi_type; + auto it = nps_amdsmi_to_RSMI.find(memory_partition); + if (it != nps_amdsmi_to_RSMI.end()) { + rsmi_type = it->second; + } else if (it == nps_amdsmi_to_RSMI.end()) { + amd::smi::AMDSmiSystem::getInstance().init_drm(); + ss << __PRETTY_FUNCTION__ << " | Could not find " << req_user_partition << "\n" + << "**************************************\n" + << "* Re-Initialized libdrm - init_drm() *\n" + << "**************************************\n"; + LOG_INFO(ss); + return AMDSMI_STATUS_INVAL; + } + amdsmi_status_t ret = rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle, 0, + rsmi_type); + + amdsmi_status_t ret_get = rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, 0, + current_partition, k256); + + if (ret_get == AMDSMI_STATUS_SUCCESS) { + current_partition_str.clear(); + current_partition_str = current_partition; + } + + // WORKAROUND: Re-initialize libdrm connection + // Only re-initialize if the memory partition was correctly set + // otherwise, we can re-try through the CLI. + // This is a workaround for cases which we cannot properly remove libdrm + // connection. 
+ bool drm_reinit = (req_user_partition == current_partition_str + || ret == AMDSMI_STATUS_INVAL + || ret == AMDSMI_STATUS_NOT_SUPPORTED); + if (drm_reinit) { + amd::smi::AMDSmiSystem::getInstance().init_drm(); + ss << __PRETTY_FUNCTION__ << " | \n" + << "**************************************\n" + << "* Re-Initialized libdrm - init_drm() *\n" + << "**************************************\n"; + LOG_INFO(ss); + } + + ss << __PRETTY_FUNCTION__ + << " | After attepting to set memory partition to " << req_user_partition << "\n" + << " | Current memory partition is " << current_partition_str << "\n" + << " | " << (drm_reinit ? + "We were successfully able to restart libdrm" : "We are unable to restart libdrm") << "\n" + << " | Returning: " << smi_amdgpu_get_status_string(ret, false); + LOG_INFO(ss); // TODO(amdsmi_team): issue completely closing -> reopening libdrm on 1st try (workaround above) // amd::smi::AMDSmiSystem::getInstance().init_drm(); @@ -1583,16 +1804,416 @@ amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, return ret; } +amdsmi_status_t +amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle, + amdsmi_memory_partition_config_t *config) { + AMDSMI_CHECK_INIT(); + std::ostringstream ss; + + // initialization for devices which do not support partitions + amdsmi_nps_caps_t flags; + flags.amdsmi_nps_flags_t.nps1_cap = 0; + flags.amdsmi_nps_flags_t.nps2_cap = 0; + flags.amdsmi_nps_flags_t.nps4_cap = 0; + flags.amdsmi_nps_flags_t.nps8_cap = 0; + config->partition_caps = flags; + config->mp_mode = AMDSMI_MEMORY_PARTITION_UNKNOWN; + + // current memory partition + constexpr uint32_t kCurrentPartitionSize = 5; + char current_mem_partition[kCurrentPartitionSize]; + std::string current_mem_partition_str = "N/A"; + amdsmi_status_t status = amdsmi_get_gpu_memory_partition(processor_handle, + current_mem_partition, kCurrentPartitionSize); + ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_memory_partition() current_partition 
= |" + << current_mem_partition << "|"; + LOG_DEBUG(ss); + current_mem_partition_str = current_mem_partition; + if (status == AMDSMI_STATUS_SUCCESS) { + if (current_mem_partition_str == "NPS1") { + config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS1; + } else if (current_mem_partition_str == "NPS2") { + config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS2; + } else if (current_mem_partition_str == "NPS4") { + config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS4; + } else if (current_mem_partition_str == "NPS8") { + config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS8; + } + } + + // Add memory partition capabilities here + constexpr uint32_t kLenCapsSize = 30; + char memory_caps[kLenCapsSize]; + status = rsmi_wrapper(rsmi_dev_memory_partition_capabilities_get, + processor_handle, 0, + memory_caps, kLenCapsSize); + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_memory_partition_capabilities_get Returning: " + << smi_amdgpu_get_status_string(status, false) + << " | Type: memory_partition_capabilities" + << " | Data: " << memory_caps; + LOG_DEBUG(ss); + std::string memory_caps_str = "N/A"; + if (status == AMDSMI_STATUS_SUCCESS) { + memory_caps_str = std::string(memory_caps); + if (memory_caps_str.find("NPS1") != std::string::npos) { + flags.amdsmi_nps_flags_t.nps1_cap = 1; + } + if (memory_caps_str.find("NPS2") != std::string::npos) { + flags.amdsmi_nps_flags_t.nps2_cap = 1; + } + if (memory_caps_str.find("NPS4") != std::string::npos) { + flags.amdsmi_nps_flags_t.nps4_cap = 1; + } + if (memory_caps_str.find("NPS8") != std::string::npos) { + flags.amdsmi_nps_flags_t.nps8_cap = 1; + } + } + config->partition_caps = flags; + return status; +} + +amdsmi_status_t +amdsmi_set_gpu_memory_partition_mode(amdsmi_processor_handle processor_handle, + amdsmi_memory_partition_type_t mode) { + AMDSMI_CHECK_INIT(); + return amdsmi_set_gpu_memory_partition(processor_handle, mode); +} + +// Accelerator Partition functions +amdsmi_status_t 
+amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle processor_handle, + amdsmi_accelerator_partition_profile_config_t *profile_config) { + AMDSMI_CHECK_INIT(); + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | START "; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + + if (profile_config == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + // Initialize values + amdsmi_status_t return_status = AMDSMI_STATUS_NOT_SUPPORTED; + amdsmi_status_t status = AMDSMI_STATUS_NOT_SUPPORTED; + profile_config->default_profile_index = 0; + profile_config->num_profiles = 0; + profile_config->num_resource_profiles = 0; + profile_config->resource_profiles->profile_index = 0; + profile_config->resource_profiles->resource_type = AMDSMI_ACCELERATOR_MAX; + profile_config->resource_profiles->partition_resource = 0; + profile_config->resource_profiles->num_partitions_share_resource = 0; + amdsmi_nps_caps_t flags; + flags.amdsmi_nps_flags_t.nps1_cap = 0; + flags.amdsmi_nps_flags_t.nps2_cap = 0; + flags.amdsmi_nps_flags_t.nps4_cap = 0; + flags.amdsmi_nps_flags_t.nps8_cap = 0; + + ss << __PRETTY_FUNCTION__ + << " | 1"; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + + // get supported xcp_configs (this will tell use # of profiles/index's) + // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs + // ex. 
SPX, DPX, QPX, CPX + std::string accelerator_caps_str = "N/A"; + constexpr uint32_t kLenXCPConfigSize = 30; + char supported_xcp_configs[kLenXCPConfigSize]; + bool use_xcp_config = false; + return_status + = rsmi_wrapper(rsmi_dev_compute_partition_supported_xcp_configs_get, processor_handle, 0, + supported_xcp_configs, kLenXCPConfigSize); + if (return_status == AMDSMI_STATUS_SUCCESS) { + accelerator_caps_str.clear(); + accelerator_caps_str = std::string(supported_xcp_configs); + use_xcp_config = true; + } else if (return_status == AMDSMI_STATUS_NO_PERM) { // initialize what we can + ss << __PRETTY_FUNCTION__ + << "\n | rsmi_dev_compute_partition_supported_xcp_configs_get()" + << " failed due to no permission" + << "\n | Defaulting to use rsmi_dev_compute_partition_capabilities_get"; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + return_status = rsmi_wrapper(rsmi_dev_compute_partition_capabilities_get, + processor_handle, 0, + supported_xcp_configs, kLenXCPConfigSize); + if (return_status == AMDSMI_STATUS_SUCCESS) { + accelerator_caps_str.clear(); + accelerator_caps_str = std::string(supported_xcp_configs); + } else { + ss << __PRETTY_FUNCTION__ + << "\n | rsmi_dev_compute_partition_capabilities_get() failed, " + << "likely due to feature not supported" + << "\n | Returning: " << smi_amdgpu_get_status_string(return_status, false); + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + return return_status; + } + } + + ss << __PRETTY_FUNCTION__ + << (use_xcp_config ? "\n | Used rsmi_dev_compute_partition_supported_xcp_configs_get()" : + "\n | Used rsmi_dev_compute_partition_capabilities_get()") + << "\n | Returning: " << smi_amdgpu_get_status_string(return_status, false) + << "\n | Type: " + << (use_xcp_config ? 
amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs): + amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition)) + << "\n | Data: " << accelerator_caps_str; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + if (accelerator_caps_str.find("SPX") != std::string::npos) { + profile_config->profiles[profile_config->num_profiles].profile_type + = AMDSMI_ACCELERATOR_PARTITION_SPX; + profile_config->profiles[profile_config->num_profiles].num_partitions = 1; + profile_config->profiles[profile_config->num_profiles].profile_index + = profile_config->num_profiles; + // default all memory partition caps to 0 + profile_config->profiles[profile_config->num_profiles].memory_caps = flags; + profile_config->num_profiles++; + } + if (accelerator_caps_str.find("DPX") != std::string::npos) { + profile_config->profiles[profile_config->num_profiles].profile_type + = AMDSMI_ACCELERATOR_PARTITION_DPX; + profile_config->profiles[profile_config->num_profiles].num_partitions = 2; + profile_config->profiles[profile_config->num_profiles].profile_index + = profile_config->num_profiles; + // default all memory partition caps to 0 + profile_config->profiles[profile_config->num_profiles].memory_caps = flags; + profile_config->num_profiles++; + } + if (accelerator_caps_str.find("TPX") != std::string::npos) { + profile_config->profiles[profile_config->num_profiles].profile_type + = AMDSMI_ACCELERATOR_PARTITION_TPX; + profile_config->profiles[profile_config->num_profiles].num_partitions = 3; + profile_config->profiles[profile_config->num_profiles].profile_index + = profile_config->num_profiles; + // default all memory partition caps to 0 + profile_config->profiles[profile_config->num_profiles].memory_caps = flags; + profile_config->num_profiles++; + } + if (accelerator_caps_str.find("QPX") != std::string::npos) { + profile_config->profiles[profile_config->num_profiles].profile_type + = AMDSMI_ACCELERATOR_PARTITION_QPX; + 
profile_config->profiles[profile_config->num_profiles].num_partitions = 4; + profile_config->profiles[profile_config->num_profiles].profile_index + = profile_config->num_profiles; + // default all memory partition caps to 0 + profile_config->profiles[profile_config->num_profiles].memory_caps = flags; + profile_config->num_profiles++; + } + if (accelerator_caps_str.find("CPX") != std::string::npos) { + profile_config->profiles[profile_config->num_profiles].profile_type + = AMDSMI_ACCELERATOR_PARTITION_CPX; + // Note: # of XCDs is max # of partitions CPX supports + uint16_t tmp_xcd_count = 0; + status = rsmi_wrapper(rsmi_dev_metrics_xcd_counter_get, + processor_handle, 0, &tmp_xcd_count); + profile_config->profiles[ + profile_config->num_profiles].num_partitions = 0; // default to 0 + if (status == AMDSMI_STATUS_SUCCESS) { + profile_config->profiles[ + profile_config->num_profiles].num_partitions = tmp_xcd_count; + } + profile_config->profiles[profile_config->num_profiles].profile_index + = profile_config->num_profiles; + // default all memory partition caps to 0 + profile_config->profiles[profile_config->num_profiles].memory_caps = flags; + profile_config->num_profiles++; + } + + ss << __PRETTY_FUNCTION__ + << " | 2"; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + auto resource_index = 0; + // get resource info for each profile + for (auto i = 0; i < profile_config->num_profiles; i++) { + auto it = partition_types_map.find(profile_config->profiles[i].profile_type); + std::string partition_type_str = "UNKNOWN"; + if (it != partition_types_map.end()) { + partition_type_str.clear(); + partition_type_str = it->second; + } + auto it3 = accelerator_to_RSMI.find(profile_config->profiles[i].profile_type); + rsmi_compute_partition_type_t rsmi_partition_type = RSMI_COMPUTE_PARTITION_INVALID; + if (it3 == accelerator_to_RSMI.end()) { + ss << __PRETTY_FUNCTION__ << " | reached end of map\n"; + LOG_DEBUG(ss); + continue; + } else { + rsmi_partition_type = 
it3->second; + } + status = rsmi_wrapper(rsmi_dev_compute_partition_xcp_config_set, processor_handle, 0, + rsmi_partition_type); + ss << __PRETTY_FUNCTION__ + << "\n | profile_num: " << i + << "\n | profile_type: " << partition_type_str + << "\n | rsmi_dev_compute_partition_xcp_config_set(" << partition_type_str + << ") Returning: " + << smi_amdgpu_get_status_string(status, false) + << "\n | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) + << "\n | Data: " << "N/A"; + // std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + + // 1) get memory caps for each profile + /** + * rsmi_status_t rsmi_dev_compute_partition_supported_nps_configs_get(uint32_t dv_ind, char *supported_configs, + * uint32_t len); + */ + constexpr uint32_t kLenNPSConfigSize = 30; + char supported_nps_configs[kLenNPSConfigSize]; + std::string supported_nps_caps_str = "N/A"; + status = rsmi_wrapper(rsmi_dev_compute_partition_supported_nps_configs_get, + processor_handle, 0, + supported_nps_configs, kLenNPSConfigSize); + if (status == AMDSMI_STATUS_SUCCESS) { + supported_nps_caps_str.clear(); + supported_nps_caps_str = std::string(supported_nps_configs); + } + if (supported_nps_caps_str.find("NPS1") != std::string::npos) { + profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps1_cap = 1; + } + if (supported_nps_caps_str.find("NPS2") != std::string::npos) { + profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps2_cap = 1; + } + if (supported_nps_caps_str.find("NPS4") != std::string::npos) { + profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps4_cap = 1; + } + if (supported_nps_caps_str.find("NPS8") != std::string::npos) { + profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps8_cap = 1; + } + // 2) get resource profiles + for (auto r = static_cast(RSMI_ACCELERATOR_XCC); + r < static_cast(RSMI_ACCELERATOR_MAX); r++) { + rsmi_accelerator_partition_resource_type_t type + = static_cast(r); + 
rsmi_accelerator_partition_resource_profile_t profile; + status = rsmi_wrapper( + rsmi_dev_compute_partition_resource_profile_get, processor_handle, 0, + &type, &profile); + if (status == AMDSMI_STATUS_SUCCESS) { + uint32_t inc_res_profile = + profile_config->num_resource_profiles + 1; + if (inc_res_profile < static_cast(RSMI_ACCELERATOR_MAX)) { + profile_config->num_resource_profiles = inc_res_profile; + } + profile_config->resource_profiles[resource_index].profile_index = i; + profile_config->resource_profiles[resource_index].resource_type + = static_cast(type); + profile_config->resource_profiles[resource_index].partition_resource + = profile.partition_resource; + profile_config->resource_profiles[resource_index].num_partitions_share_resource + = profile.num_partitions_share_resource; + resource_index += 1; + profile_config->profiles[i].num_resources + = profile_config->profiles[i].num_resources + 1; + } + + it = partition_types_map.find(profile_config->profiles[i].profile_type); + partition_type_str = "UNKNOWN"; + if (it != partition_types_map.end()) { + partition_type_str.clear(); + partition_type_str = it->second; + } + auto it2 = resource_types_map.find( + static_cast(type)); + std::string resource_type_str = "UNKNOWN"; + if (it2 != resource_types_map.end()) { + resource_type_str.clear(); + resource_type_str = it2->second; + } + auto current_resource_idx = (resource_index >= 1) ? 
resource_index - 1 : 0; + std::string nps_caps = "N/A"; + if (profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps1_cap == 1) { + if (nps_caps == "N/A") { + nps_caps.clear(); + nps_caps = "NPS1"; + } else { + nps_caps += ", NPS1"; + } + } + if (profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps2_cap == 1) { + if (nps_caps == "N/A") { + nps_caps.clear(); + nps_caps = "NPS2"; + } else { + nps_caps += ", NPS2"; + } + } + if (profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps4_cap == 1) { + if (nps_caps == "N/A") { + nps_caps.clear(); + nps_caps = "NPS4"; + } else { + nps_caps += ", NPS4"; + } + } + if (profile_config->profiles[i].memory_caps.amdsmi_nps_flags_t.nps8_cap == 1) { + if (nps_caps == "N/A") { + nps_caps.clear(); + nps_caps = "NPS8"; + } else { + nps_caps += ", NPS8"; + } + } + ss << __PRETTY_FUNCTION__ + << "\n | profile_config->num_profiles: " << profile_config->num_profiles + << "\n | profile_num (i): " << i + << "\n | resource_num (r): " << r + << "\n | current_resource_idx: " << current_resource_idx + << "\n | profile_config->resource_profiles[current_resource_idx].profile_index: " + << profile_config->resource_profiles[current_resource_idx].profile_index + << "\n | profile_config->profiles[i].memory_caps: " + << nps_caps + << "\n | profile_config->profiles[i].num_resources: " + << profile_config->profiles[i].num_resources + << "\n | profile_type: " << partition_type_str + << "\n | resource_type: " << resource_type_str + << "\n | partition_resource: " << profile.partition_resource + << "\n | num_partitions_share_resource: " + << profile.num_partitions_share_resource + << "\n | profile_config->num_resource_profiles: " + << profile_config->num_resource_profiles + << "\n | rsmi_dev_compute_partition_resource_profile_get(" + << resource_type_str << ") Returning: " + << smi_amdgpu_get_status_string(status, false) + << "\n | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) + << "\n"; + // 
std::cout << ss.str() << std::endl; + LOG_DEBUG(ss); + } // END resources loop + } // END profile loop + ss << __PRETTY_FUNCTION__ + << " | END returning " << smi_amdgpu_get_status_string(return_status, false); + LOG_INFO(ss); + + return return_status; +} + amdsmi_status_t amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, amdsmi_accelerator_partition_profile_t *profile, uint32_t *partition_id) { + std::ostringstream ss; + AMDSMI_CHECK_INIT(); - if (profile == nullptr) { + if (profile == nullptr || partition_id == nullptr) { return AMDSMI_STATUS_INVAL; } - std::ostringstream ss; - // TODO(amdsmi_team): also fill out profile later + + // initialization for devices which do not support partitions + profile->num_partitions = std::numeric_limits::max(); + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_INVALID; + *partition_id = {0}; + profile->profile_index = std::numeric_limits::max(); + profile->num_resources = 0; + amdsmi_nps_caps_t flags; flags.amdsmi_nps_flags_t.nps1_cap = 0; flags.amdsmi_nps_flags_t.nps2_cap = 0; @@ -1602,17 +2223,136 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h // TODO(amdsmi_team): add resources here ^ auto tmp_partition_id = uint32_t(0); + auto tmp_xcd_count = uint16_t(0); amdsmi_status_t status = AMDSMI_STATUS_NOT_SUPPORTED; - status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, &tmp_partition_id); + // get xcp config info (this will tell use # of profiles/index's) + // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs + // ex. SPX, DPX, QPX, CPX + // Depending on what is available, we can determine the profile index + // ex. 
SPX = 0, DPX = 1, QPX = 2, CPX = 3; other devices may have different values + std::string accelerator_capabilities = "N/A"; + constexpr uint32_t kLenSupportedXCPConfigSize = 30; + char xcp_supported_configs[kLenSupportedXCPConfigSize]; + status = rsmi_wrapper(rsmi_dev_compute_partition_supported_xcp_configs_get, processor_handle, 0, + xcp_supported_configs, kLenSupportedXCPConfigSize); if (status == AMDSMI_STATUS_SUCCESS) { - *partition_id = tmp_partition_id; + accelerator_capabilities.clear(); + accelerator_capabilities = std::string(xcp_supported_configs); + // remove leading/trailing spaces + whitespace + accelerator_capabilities = amd::smi::trimAllWhiteSpace(accelerator_capabilities); } + ss << __PRETTY_FUNCTION__ + << "\n | rsmi_dev_compute_partition_supported_xcp_configs_get Returning: " + << smi_amdgpu_get_status_string(status, false) + << "\n | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs) + << "\n | Data (accelerator_capabilities/supported_xcp_configs): " + << accelerator_capabilities; + LOG_DEBUG(ss); + + // get index by comma and place into a string vector + char delimiter = ','; + std::stringstream ss_obj(accelerator_capabilities); + std::string temp; + std::vector tokens; + while (getline(ss_obj, temp, delimiter)) { + tokens.push_back(temp); + } + + constexpr uint32_t kCurrentPartitionSize = 5; + char current_partition[kCurrentPartitionSize]; + std::string current_partition_str = "N/A"; + status = amdsmi_get_gpu_compute_partition(processor_handle, current_partition, + kCurrentPartitionSize); + ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_compute_partition() current_partition = |" + << current_partition << "|"; + LOG_DEBUG(ss); + current_partition_str = current_partition; + if (status == AMDSMI_STATUS_SUCCESS) { + // 1) get profile index from + // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs + if (current_partition_str == "SPX" || current_partition_str == "DPX" + || current_partition_str 
== "TPX" || current_partition_str == "QPX" + || current_partition_str == "CPX") { + // get index according to supported_xcp_configs, separated by commas + if (accelerator_capabilities.find(current_partition_str) != std::string::npos) { + auto it = std::find(tokens.begin(), tokens.end(), current_partition_str); + if (it != tokens.end()) { + profile->profile_index = std::distance(tokens.begin(), it); + } + } + } + + // 2) get profile type from /sys/class/drm/../device/current_compute_partition + if (current_partition_str == "SPX") { + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_SPX; + } else if (current_partition_str == "DPX") { + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_DPX; + } else if (current_partition_str == "TPX") { + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_TPX; + } else if (current_partition_str == "QPX") { + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_QPX; + } else if (current_partition_str == "CPX") { + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_CPX; + } else { + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_INVALID; + } + } else { + profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_INVALID; + } + + amdsmi_gpu_metrics_t metric_info = {}; + status = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info); + if (status == AMDSMI_STATUS_SUCCESS + && metric_info.num_partition != std::numeric_limits::max()) { + profile->num_partitions = metric_info.num_partition; + } + + bool isPrimaryNode = false; + for (uint32_t partition_num = 0; partition_num < profile->num_partitions; partition_num++) { + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, + partition_num, &tmp_partition_id); + if (status == AMDSMI_STATUS_SUCCESS) { + // only create list from primary partition, rest should be array* = {0} + if ((partition_num == 0 && tmp_partition_id == 0) + || (profile->profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) + || (profile->profile_type == 
AMDSMI_ACCELERATOR_PARTITION_INVALID)) { + isPrimaryNode = true; + partition_id[partition_num] = tmp_partition_id; + ss << __PRETTY_FUNCTION__ + << " | [PRIMARY node confirmed] partition_id[" + << partition_num << "]: " << tmp_partition_id; + LOG_DEBUG(ss); + } else if (isPrimaryNode) { + partition_id[partition_num] = tmp_partition_id; + ss << __PRETTY_FUNCTION__ + << " | [PRIMARY node confirmed - remaining node list] partition_id[" + << partition_num << "]: " << tmp_partition_id; + LOG_DEBUG(ss); + } + } else { + break; + } + } + + std::ostringstream ss_2; + const uint32_t kMaxPartitions = 8; + uint32_t copy_partition_ids[kMaxPartitions] = {0}; // initialize all to 0s + std::copy(partition_id, partition_id + kMaxPartitions, copy_partition_ids); + std::copy(std::begin(copy_partition_ids), + std::end(copy_partition_ids), + amd::smi::make_ostream_joiner(&ss_2, ", ")); + ss << __PRETTY_FUNCTION__ + << " | Num_partitions: " << profile->num_partitions + << "; profile->profile_type: " << profile->profile_type + << "; partition_id: " << ss_2.str() << "\n"; + LOG_DEBUG(ss); // Add memory partition capabilities here constexpr uint32_t kLenCapsSize = 30; char memory_caps[kLenCapsSize]; - status = rsmi_wrapper(rsmi_dev_memory_partition_capabilities_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_memory_partition_capabilities_get, processor_handle, 0, memory_caps, kLenCapsSize); ss << __PRETTY_FUNCTION__ << " | rsmi_dev_memory_partition_capabilities_get Returning: " @@ -1641,6 +2381,46 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h return status; } +amdsmi_status_t +amdsmi_set_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, + uint32_t profile_index) { + AMDSMI_CHECK_INIT(); + std::ostringstream ss; + amdsmi_accelerator_partition_profile_config_t config; + amdsmi_status_t status = amdsmi_get_gpu_accelerator_partition_profile_config( + processor_handle, &config); + + if (status != AMDSMI_STATUS_SUCCESS) { + 
return status; + } + + std::map mp_prof_indx_to_accel_type; + + for (uint32_t i = 0; i < config.num_profiles; i++) { + auto it = partition_types_map.find(config.profiles[i].profile_type); + std::string partition_type_str = "N/A"; + if (it != partition_types_map.end()) { + partition_type_str.clear(); + partition_type_str = it->second; + } + config.profiles[i].profile_index; + ss << __PRETTY_FUNCTION__ << " | " + << "config.profiles[" << i << "].profile_type: " + << static_cast(config.profiles[i].profile_type) << "\n" + << " | config.profiles[" << i << "].profile_type (str): " + << partition_type_str << "\n" + << "| config.profiles[" << i << "].profile_index: " + << static_cast(config.profiles[i].profile_index) + << "\n"; + LOG_DEBUG(ss); + mp_prof_indx_to_accel_type[config.profiles[i].profile_index] + = config.profiles[i].profile_type; + } + auto return_status = amdsmi_set_gpu_compute_partition(processor_handle, + static_cast(mp_prof_indx_to_accel_type[profile_index])); + return return_status; +} + // TODO(bliu) : other xgmi related information amdsmi_status_t amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info) { @@ -1648,19 +2428,19 @@ amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_ if (info == nullptr) return AMDSMI_STATUS_INVAL; - return rsmi_wrapper(rsmi_dev_xgmi_hive_id_get, processor_handle, + return rsmi_wrapper(rsmi_dev_xgmi_hive_id_get, processor_handle, 0, &(info->xgmi_hive_id)); } amdsmi_status_t amdsmi_gpu_xgmi_error_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_status_t *status) { - return rsmi_wrapper(rsmi_dev_xgmi_error_status, processor_handle, + return rsmi_wrapper(rsmi_dev_xgmi_error_status, processor_handle, 0, reinterpret_cast(status)); } amdsmi_status_t amdsmi_reset_gpu_xgmi_error(amdsmi_processor_handle processor_handle) { - return rsmi_wrapper(rsmi_dev_xgmi_error_reset, processor_handle); + return rsmi_wrapper(rsmi_dev_xgmi_error_reset, processor_handle, 0); } 
amdsmi_status_t @@ -1702,7 +2482,7 @@ amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_hand AMDSMI_CHECK_INIT(); // nullptr api supported - return rsmi_wrapper(rsmi_dev_ecc_count_get, processor_handle, + return rsmi_wrapper(rsmi_dev_ecc_count_get, processor_handle, 0, static_cast(block), reinterpret_cast(ec)); } @@ -1711,7 +2491,7 @@ amdsmi_status_t amdsmi_get_gpu_ecc_enabled(amdsmi_processor_handle processor_ha AMDSMI_CHECK_INIT(); // nullptr api supported - return rsmi_wrapper(rsmi_dev_ecc_enabled_get, processor_handle, + return rsmi_wrapper(rsmi_dev_ecc_enabled_get, processor_handle, 0, enabled_blocks); } amdsmi_status_t amdsmi_get_gpu_ecc_status(amdsmi_processor_handle processor_handle, @@ -1720,7 +2500,7 @@ amdsmi_status_t amdsmi_get_gpu_ecc_status(amdsmi_processor_handle processor_han AMDSMI_CHECK_INIT(); // nullptr api supported - return rsmi_wrapper(rsmi_dev_ecc_status_get, processor_handle, + return rsmi_wrapper(rsmi_dev_ecc_status_get, processor_handle, 0, static_cast(block), reinterpret_cast(state)); } @@ -1732,7 +2512,7 @@ amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle, AMDSMI_CHECK_INIT(); // nullptr api supported - return rsmi_wrapper(rsmi_dev_metrics_header_info_get, processor_handle, + return rsmi_wrapper(rsmi_dev_metrics_header_info_get, processor_handle, 0, reinterpret_cast(header_value)); } @@ -1744,7 +2524,7 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info( if (pgpu_metrics != nullptr) { *pgpu_metrics = {}; } - return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, + return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, 0, reinterpret_cast(pgpu_metrics)); } @@ -1755,7 +2535,7 @@ amdsmi_status_t amdsmi_get_gpu_pm_metrics_info( uint32_t *num_of_metrics) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_pm_metrics_info_get, processor_handle, + return rsmi_wrapper(rsmi_dev_pm_metrics_info_get, processor_handle, 0, reinterpret_cast(pm_metrics), num_of_metrics); } @@ 
-1767,7 +2547,7 @@ amdsmi_status_t amdsmi_get_gpu_reg_table_info( uint32_t *num_of_metrics) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_reg_table_info_get, processor_handle, + return rsmi_wrapper(rsmi_dev_reg_table_info_get, processor_handle, 0, static_cast(reg_type), reinterpret_cast(reg_metrics), num_of_metrics); @@ -1816,24 +2596,23 @@ amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; info->dpm_cap = dpm; - } - else { - status = rsmi_wrapper(rsmi_dev_power_cap_get, processor_handle, + } else { + status = rsmi_wrapper(rsmi_dev_power_cap_get, processor_handle, 0, sensor_ind, &(info->power_cap)); if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; } // Get other information from rocm-smi - status = rsmi_wrapper(rsmi_dev_power_cap_default_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_power_cap_default_get, processor_handle, 0, &(info->default_power_cap)); if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) set_ret_success = true; - status = rsmi_wrapper(rsmi_dev_power_cap_range_get, processor_handle, sensor_ind, - &(info->max_power_cap), &(info->min_power_cap)); + status = rsmi_wrapper(rsmi_dev_power_cap_range_get, processor_handle, 0, + sensor_ind, &(info->max_power_cap), &(info->min_power_cap)); if ((status == AMDSMI_STATUS_SUCCESS) && !set_ret_success) @@ -1845,7 +2624,7 @@ amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, amdsmi_status_t amdsmi_set_power_cap(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, uint64_t cap) { - return rsmi_wrapper(rsmi_dev_power_cap_set, processor_handle, + return rsmi_wrapper(rsmi_dev_power_cap_set, processor_handle, 0, sensor_ind, cap); } @@ -1856,21 +2635,20 @@ amdsmi_status_t AMDSMI_CHECK_INIT(); // nullptr api supported - return rsmi_wrapper(rsmi_dev_power_profile_presets_get, processor_handle, - sensor_ind, - reinterpret_cast(status)); + 
return rsmi_wrapper(rsmi_dev_power_profile_presets_get, processor_handle, 0, + sensor_ind, reinterpret_cast(status)); } amdsmi_status_t amdsmi_set_gpu_perf_determinism_mode( amdsmi_processor_handle processor_handle, uint64_t clkvalue) { - return rsmi_wrapper(rsmi_perf_determinism_mode_set, processor_handle, + return rsmi_wrapper(rsmi_perf_determinism_mode_set, processor_handle, 0, clkvalue); } amdsmi_status_t amdsmi_set_gpu_power_profile(amdsmi_processor_handle processor_handle, uint32_t reserved, amdsmi_power_profile_preset_masks_t profile) { - return rsmi_wrapper(rsmi_dev_power_profile_set, processor_handle, + return rsmi_wrapper(rsmi_dev_power_profile_set, processor_handle, 0, reserved, static_cast(profile)); } @@ -1880,26 +2658,26 @@ amdsmi_status_t amdsmi_get_gpu_perf_level(amdsmi_processor_handle processor_hand AMDSMI_CHECK_INIT(); // nullptr api supported - return rsmi_wrapper(rsmi_dev_perf_level_get, processor_handle, + return rsmi_wrapper(rsmi_dev_perf_level_get, processor_handle, 0, reinterpret_cast(perf)); } amdsmi_status_t amdsmi_set_gpu_perf_level(amdsmi_processor_handle processor_handle, amdsmi_dev_perf_level_t perf_lvl) { - return rsmi_wrapper(rsmi_dev_perf_level_set_v1, processor_handle, + return rsmi_wrapper(rsmi_dev_perf_level_set_v1, processor_handle, 0, static_cast(perf_lvl)); } amdsmi_status_t amdsmi_set_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, uint64_t bw_bitmask) { - return rsmi_wrapper(rsmi_dev_pci_bandwidth_set, processor_handle, - bw_bitmask); + return rsmi_wrapper(rsmi_dev_pci_bandwidth_set, processor_handle, 0, + bw_bitmask); } amdsmi_status_t amdsmi_get_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, amdsmi_pcie_bandwidth_t *bandwidth) { - return rsmi_wrapper(rsmi_dev_pci_bandwidth_get, processor_handle, + return rsmi_wrapper(rsmi_dev_pci_bandwidth_get, processor_handle, 0, reinterpret_cast(bandwidth)); } @@ -1972,7 +2750,7 @@ amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, 
return r_status; } - return rsmi_wrapper(rsmi_dev_gpu_clk_freq_get, processor_handle, + return rsmi_wrapper(rsmi_dev_gpu_clk_freq_get, processor_handle, 0, static_cast(clk_type), reinterpret_cast(f)); } @@ -1989,7 +2767,7 @@ amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, return AMDSMI_STATUS_NOT_SUPPORTED; } - return rsmi_wrapper(rsmi_dev_gpu_clk_freq_set, processor_handle, + return rsmi_wrapper(rsmi_dev_gpu_clk_freq_set, processor_handle, 0, static_cast(clk_type), freq_bitmask); } @@ -1997,7 +2775,7 @@ amdsmi_status_t amdsmi_set_soc_pstate(amdsmi_processor_handle processor_handle, uint32_t policy) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_soc_pstate_set, processor_handle, + return rsmi_wrapper(rsmi_dev_soc_pstate_set, processor_handle, 0, policy); } @@ -2005,7 +2783,7 @@ amdsmi_status_t amdsmi_get_soc_pstate(amdsmi_processor_handle processor_handle, amdsmi_dpm_policy_t* policy) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_soc_pstate_get, processor_handle, + return rsmi_wrapper(rsmi_dev_soc_pstate_get, processor_handle, 0, reinterpret_cast(policy)); } @@ -2013,7 +2791,7 @@ amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle, uint32_t policy) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle, + return rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle, 0, policy); } @@ -2021,7 +2799,7 @@ amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle, amdsmi_dpm_policy_t* policy) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle, + return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle, 0, reinterpret_cast(policy)); } @@ -2029,7 +2807,7 @@ amdsmi_status_t amdsmi_get_gpu_process_isolation(amdsmi_processor_handle process uint32_t* pisolate) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_process_isolation_get, processor_handle, + return rsmi_wrapper(rsmi_dev_process_isolation_get, 
processor_handle, 0, pisolate); } @@ -2037,80 +2815,80 @@ amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle process uint32_t pisolate) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_process_isolation_set, processor_handle, + return rsmi_wrapper(rsmi_dev_process_isolation_set, processor_handle, 0, pisolate); } amdsmi_status_t amdsmi_clean_gpu_local_data(amdsmi_processor_handle processor_handle) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_gpu_run_cleaner_shader, processor_handle); + return rsmi_wrapper(rsmi_dev_gpu_run_cleaner_shader, processor_handle, 0); } amdsmi_status_t amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *records) { - return rsmi_wrapper(rsmi_dev_memory_reserved_pages_get, processor_handle, + return rsmi_wrapper(rsmi_dev_memory_reserved_pages_get, processor_handle, 0, num_pages, reinterpret_cast(records)); } amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle, amdsmi_memory_type_t mem_type, uint64_t *total) { - return rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, + return rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, 0, static_cast(mem_type), total); } amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle, amdsmi_memory_type_t mem_type, uint64_t *used) { - return rsmi_wrapper(rsmi_dev_memory_usage_get, processor_handle, + return rsmi_wrapper(rsmi_dev_memory_usage_get, processor_handle, 0, static_cast(mem_type), used); } amdsmi_status_t amdsmi_get_gpu_overdrive_level( amdsmi_processor_handle processor_handle, uint32_t *od) { - return rsmi_wrapper(rsmi_dev_overdrive_level_get, processor_handle, od); + return rsmi_wrapper(rsmi_dev_overdrive_level_get, processor_handle, 0, od); } amdsmi_status_t amdsmi_get_gpu_mem_overdrive_level( amdsmi_processor_handle processor_handle, uint32_t *od) { - return rsmi_wrapper(rsmi_dev_mem_overdrive_level_get, 
processor_handle, od); + return rsmi_wrapper(rsmi_dev_mem_overdrive_level_get, processor_handle, 0, od); } amdsmi_status_t amdsmi_set_gpu_overdrive_level( amdsmi_processor_handle processor_handle, uint32_t od) { - return rsmi_wrapper(rsmi_dev_overdrive_level_set_v1, processor_handle, od); + return rsmi_wrapper(rsmi_dev_overdrive_level_set_v1, processor_handle, 0, od); } amdsmi_status_t amdsmi_get_gpu_pci_replay_counter( amdsmi_processor_handle processor_handle, uint64_t *counter) { - return rsmi_wrapper(rsmi_dev_pci_replay_counter_get, - processor_handle, counter); + return rsmi_wrapper(rsmi_dev_pci_replay_counter_get, processor_handle, 0, + counter); } amdsmi_status_t amdsmi_get_gpu_pci_throughput( amdsmi_processor_handle processor_handle, uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz) { - return rsmi_wrapper(rsmi_dev_pci_throughput_get, processor_handle, + return rsmi_wrapper(rsmi_dev_pci_throughput_get, processor_handle, 0, sent, received, max_pkt_sz); } amdsmi_status_t amdsmi_get_gpu_od_volt_info(amdsmi_processor_handle processor_handle, amdsmi_od_volt_freq_data_t *odv) { - return rsmi_wrapper(rsmi_dev_od_volt_info_get, processor_handle, + return rsmi_wrapper(rsmi_dev_od_volt_info_get, processor_handle, 0, reinterpret_cast(odv)); } amdsmi_status_t amdsmi_get_gpu_od_volt_curve_regions( amdsmi_processor_handle processor_handle, uint32_t *num_regions, amdsmi_freq_volt_region_t *buffer) { - return rsmi_wrapper(rsmi_dev_od_volt_curve_regions_get, processor_handle, + return rsmi_wrapper(rsmi_dev_od_volt_curve_regions_get, processor_handle, 0, num_regions, reinterpret_cast(buffer)); } amdsmi_status_t amdsmi_get_gpu_volt_metric(amdsmi_processor_handle processor_handle, amdsmi_voltage_type_t sensor_type, amdsmi_voltage_metric_t metric, int64_t *voltage) { - return rsmi_wrapper(rsmi_dev_volt_metric_get, processor_handle, + return rsmi_wrapper(rsmi_dev_volt_metric_get, processor_handle, 0, static_cast(sensor_type), static_cast(metric), voltage); } @@ -2119,14 
+2897,14 @@ amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_ha amdsmi_freq_ind_t level, uint64_t clkvalue, amdsmi_clk_type_t clkType) { - return rsmi_wrapper(rsmi_dev_od_clk_info_set, processor_handle, + return rsmi_wrapper(rsmi_dev_od_clk_info_set, processor_handle, 0, static_cast(level), clkvalue, static_cast(clkType)); } amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_handle, uint32_t vpoint, uint64_t clkvalue, uint64_t voltvalue) { - return rsmi_wrapper(rsmi_dev_od_volt_info_set, processor_handle, + return rsmi_wrapper(rsmi_dev_od_volt_info_set, processor_handle, 0, vpoint, clkvalue, voltvalue); } @@ -2134,7 +2912,7 @@ amdsmi_status_t amdsmi_set_gpu_clk_range(amdsmi_processor_handle processor_handl uint64_t minclkvalue, uint64_t maxclkvalue, amdsmi_clk_type_t clkType) { - return rsmi_wrapper(rsmi_dev_clk_range_set, processor_handle, + return rsmi_wrapper(rsmi_dev_clk_range_set, processor_handle, 0, minclkvalue, maxclkvalue, static_cast(clkType)); } @@ -2143,40 +2921,40 @@ amdsmi_status_t amdsmi_set_gpu_clk_limit(amdsmi_processor_handle processor_handl amdsmi_clk_type_t clk_type, amdsmi_clk_limit_type_t limit_type, uint64_t clk_value) { - return rsmi_wrapper(rsmi_dev_clk_extremum_set, processor_handle, + return rsmi_wrapper(rsmi_dev_clk_extremum_set, processor_handle, 0, static_cast(limit_type), clk_value, static_cast(clk_type)); } amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle) { - return rsmi_wrapper(rsmi_dev_gpu_reset, processor_handle); + return rsmi_wrapper(rsmi_dev_gpu_reset, processor_handle, 0); } amdsmi_status_t amdsmi_get_utilization_count(amdsmi_processor_handle processor_handle, amdsmi_utilization_counter_t utilization_counters[], uint32_t count, uint64_t *timestamp) { - return rsmi_wrapper(rsmi_utilization_count_get, processor_handle, + return rsmi_wrapper(rsmi_utilization_count_get, processor_handle, 0, reinterpret_cast(utilization_counters), count, timestamp); } 
amdsmi_status_t amdsmi_get_energy_count(amdsmi_processor_handle processor_handle, uint64_t *energy_accumulator, float *counter_resolution, uint64_t *timestamp) { - return rsmi_wrapper(rsmi_dev_energy_count_get, processor_handle, + return rsmi_wrapper(rsmi_dev_energy_count_get, processor_handle, 0, energy_accumulator, counter_resolution, timestamp); } amdsmi_status_t amdsmi_get_gpu_bdf_id( amdsmi_processor_handle processor_handle, uint64_t *bdfid) { - return rsmi_wrapper(rsmi_dev_pci_id_get, processor_handle, + return rsmi_wrapper(rsmi_dev_pci_id_get, processor_handle, 0, bdfid); } amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity( amdsmi_processor_handle processor_handle, int32_t *numa_node) { - return rsmi_wrapper(rsmi_topo_numa_affinity_get, processor_handle, + return rsmi_wrapper(rsmi_topo_numa_affinity_get, processor_handle, 0, numa_node); } @@ -2224,7 +3002,7 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios else { // get vbios version string from rocm_smi char vbios_version[AMDSMI_NORMAL_STRING_LENGTH]; - status = rsmi_wrapper(rsmi_dev_vbios_version_get, processor_handle, + status = rsmi_wrapper(rsmi_dev_vbios_version_get, processor_handle, 0, vbios_version, AMDSMI_NORMAL_STRING_LENGTH); @@ -2449,7 +3227,7 @@ amdsmi_status_t amdsmi_get_gpu_ras_feature_info( return r; rsmi_ras_feature_info_t rsmi_ras_feature; - r = rsmi_wrapper(rsmi_ras_feature_info_get, processor_handle, + r = rsmi_wrapper(rsmi_ras_feature_info_get, processor_handle, 0, &rsmi_ras_feature); if (r != AMDSMI_STATUS_SUCCESS) @@ -2735,8 +3513,8 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a // default to PCIe info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_PCIE; rsmi_pcie_slot_type_t slot_type; - status = rsmi_wrapper(rsmi_dev_pcie_slot_type_get, - processor_handle, &slot_type); + status = rsmi_wrapper(rsmi_dev_pcie_slot_type_get, processor_handle, 0, + &slot_type); if (status == AMDSMI_STATUS_SUCCESS) { switch 
(slot_type) { case RSMI_PCIE_SLOT_PCIE: diff --git a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc index d0e20fc76a..5ba60a17d0 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc @@ -128,7 +128,7 @@ amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_vbios(void *info) const { amdsmi_status_t ret; uint32_t fd = 0; ret = drm_.get_drm_fd_by_index(gpu_id_, &fd); - if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED;; + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; return drm_.amdgpu_query_vbios(fd, info); }