diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index da234a8302..325d4d7a1e 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -198,7 +198,9 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles - self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True # Handle multiple GPUs handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list) @@ -219,15 +221,6 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: uuid = e.get_error_info() - try: - enumeration_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(args.gpu) - except: - enumeration_info = {"drm_render": "N/A", - "drm_card": "N/A", - "hip_id": "N/A", - "hip_uuid": "N/A", - "hsa_id": "N/A"} - try: kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) kfd_id = kfd_info['kfd_id'] @@ -250,6 +243,14 @@ class AMDSMICommands(): self.logger.store_output(args.gpu, 'partition_id', partition_id) if args.e: + try: + enumeration_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(args.gpu) + except: + enumeration_info = {"drm_render": "N/A", + "drm_card": "N/A", + "hip_id": "N/A", + "hip_uuid": "N/A", + "hsa_id": "N/A"} if enumeration_info['drm_render'] == "N/A": self.logger.store_output(args.gpu, 'render', enumeration_info['drm_render']) else: @@ -404,7 +405,9 @@ class AMDSMICommands(): args.vram, args.cache, args.board, args.process_isolation, args.clock, args.partition] - self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True if self.helpers.is_linux() and self.helpers.is_baremetal(): if limit: @@ -3409,7 +3412,9 @@ class AMDSMICommands(): # Clear the table header self.logger.table_header = ''.rjust(12) - 
self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True # Populate the possible gpus topo_values = [] @@ -4698,7 +4703,9 @@ class AMDSMICommands(): if core: args.core = core - self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True # Check if a GPU argument has been set gpu_args_enabled = False @@ -4844,7 +4851,9 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles - self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True # Handle multiple GPUs handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.reset) @@ -5722,7 +5731,9 @@ class AMDSMICommands(): # Clear the table header self.logger.table_header = ''.rjust(7) - self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True # Populate the possible gpus and their bdfs xgmi_values = [] @@ -5968,7 +5979,9 @@ class AMDSMICommands(): if accelerator: args.accelerator = accelerator - self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True ########################################### # amd-smi partition (no args) # @@ -6296,7 +6309,10 @@ class AMDSMICommands(): if args.gpu == None: args.gpu = self.device_handles - self.helpers.check_required_groups() + if not self.group_check_printed: + self.helpers.check_required_groups() + self.group_check_printed = True + handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras) if handled_multiple_gpus: return diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index 56a4de719c..b6aac0ef9c 100644 --- 
a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -799,6 +799,9 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) { DEVICE_MUTEX std::string str_val; ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val); + if (ret != RSMI_STATUS_SUCCESS){ + return ret; + } *numa_node = std::stoi(str_val, nullptr); return ret; @@ -913,11 +916,41 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) + *id = std::numeric_limits<uint16_t>::max(); + // Get the device ID from KGD ret = get_id(dv_ind, amd::smi::kDevDevID, id); - ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << amd::smi::getRSMIStatusString(ret); LOG_TRACE(ss); + ss << __PRETTY_FUNCTION__ + << (ret == RSMI_STATUS_SUCCESS ? + " | No fall back needed retrieved from KGD" : " | fall back needed") + << " | Device #: " << std::to_string(dv_ind) + << " | Data: device_id = " << std::to_string(*id) + << " | ret = " << getRSMIStatusString(ret, false); + LOG_DEBUG(ss); + // If the device ID is not supported, use KFD's device ID + if (ret != RSMI_STATUS_SUCCESS) { + GET_DEV_AND_KFDNODE_FROM_INDX + uint32_t node_id; + uint64_t kfd_device_id; + int ret_kfd = kfd_node->get_node_id(&node_id); + ret_kfd = amd::smi::read_node_properties(node_id, "device_id", &kfd_device_id); + if (ret_kfd == 0) { + *id = kfd_device_id; + ret = RSMI_STATUS_SUCCESS; + } else { + *id = std::numeric_limits<uint16_t>::max(); + ret = RSMI_STATUS_NOT_SUPPORTED; + } + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read device from sysfs, falling back to KFD" << "\n" + << " ; Device #: " << std::to_string(dv_ind) << "\n" + << " ; ret_kfd: " << std::to_string(ret_kfd) << "\n" + << " ; node: " << std::to_string(node_id) << "\n" + << " ; Data: device_id (from KFD)= " << std::to_string(*id) << "\n" + << " ; ret = " << getRSMIStatusString(ret, false); + LOG_DEBUG(ss); + } return ret; } diff 
--git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index d06e9d53a5..f220965ce9 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -570,31 +570,39 @@ amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle, char *uuid) { AMDSMI_CHECK_INIT(); - if (uuid_length == nullptr || uuid == nullptr || uuid_length == nullptr || *uuid_length < AMDSMI_GPU_UUID_SIZE) { + if (uuid_length == nullptr || uuid == nullptr || *uuid_length < AMDSMI_GPU_UUID_SIZE) { return AMDSMI_STATUS_INVAL; } - amd::smi::AMDSmiGPUDevice* gpu_device = nullptr; - amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device); - if (r != AMDSMI_STATUS_SUCCESS) - return r; + uint64_t device_uuid = 0; + uint16_t device_id = std::numeric_limits<uint16_t>::max(); + amdsmi_status_t status; - amdsmi_status_t status = AMDSMI_STATUS_SUCCESS; - SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) - - amdsmi_asic_info_t asic_info = {}; - const uint8_t fcn = 0xff; - - status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info); + status = rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0, &device_id); if (status != AMDSMI_STATUS_SUCCESS) { - printf("Getting asic info failed. 
Return code: %d", status); + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_id_get(): " + << smi_amdgpu_get_status_string(status, true); + LOG_INFO(ss); + device_id = std::numeric_limits<uint16_t>::max(); + } + + status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, + &device_uuid); + if (status != AMDSMI_STATUS_SUCCESS) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " | rsmi_dev_unique_id_get(): " + << smi_amdgpu_get_status_string(status, true); + LOG_INFO(ss); return status; } + const uint8_t fcn = 0xff; + /* generate random UUID */ - status = amdsmi_uuid_gen(uuid, - strtoull(asic_info.asic_serial, nullptr, 16), - (uint16_t)asic_info.device_id, fcn); + status = amdsmi_uuid_gen(uuid, device_uuid, device_id, fcn); return status; } @@ -648,10 +656,10 @@ amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle, // Retrieve HIP UUID std::string hip_uuid_str = "GPU-"; - amdsmi_asic_info_t asic_info = {}; - status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info); + char asic_serial[AMDSMI_GPU_UUID_SIZE]; + status = amdsmi_get_gpu_device_uuid(processor_handle, 0, asic_serial); if (status == AMDSMI_STATUS_SUCCESS) { - hip_uuid_str += std::string(asic_info.asic_serial).substr(0, sizeof(info->hip_uuid) - hip_uuid_str.size() - 1); + hip_uuid_str += std::string(asic_serial).substr(0, sizeof(info->hip_uuid) - hip_uuid_str.size() - 1); std::strncpy(info->hip_uuid, hip_uuid_str.c_str(), sizeof(info->hip_uuid) - 1); info->hip_uuid[sizeof(info->hip_uuid) - 1] = '\0'; // Ensure null termination }