From 640eba77641bb0937226985964caa9fca8cdbde2 Mon Sep 17 00:00:00 2001 From: gabrpham_amdeng Date: Mon, 28 Apr 2025 14:46:11 -0500 Subject: [PATCH] [SWDEV-529889] Fixed incorrect vendor_id reporting in amdsmi_get_gpu_asic_info Signed-off-by: gabrpham_amdeng [ROCm/amdsmi commit: 1ab57ce7dda677d1ad5363a077c919b7da7dfb97] --- projects/amdsmi/rocm_smi/src/rocm_smi.cc | 28 ++++++++++- projects/amdsmi/src/amd_smi/amd_smi.cc | 63 ++++++++++++++---------- 2 files changed, 63 insertions(+), 28 deletions(-) diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index b6aac0ef9c..eb95a7d8d1 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -1012,11 +1012,37 @@ rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id) { rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id) { + // need to get this to fall back to kfd if sysfs doesn't work std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(id) - return get_id(dv_ind, amd::smi::kDevVendorID, id); + rsmi_status_t status = get_id(dv_ind, amd::smi::kDevVendorID, id); + if (status != RSMI_STATUS_SUCCESS) + { + // /sys/class/kfd/kfd/topology/nodes/*/properties + GET_DEV_AND_KFDNODE_FROM_INDX + uint32_t node_id; + uint64_t kfd_vendor_id; + int ret_kfd = kfd_node->get_node_id(&node_id); + ret_kfd = amd::smi::read_node_properties(node_id, "vendor_id", &kfd_vendor_id); + if (ret_kfd == 0) { + *id = kfd_vendor_id; + status = RSMI_STATUS_SUCCESS; + } else { + *id = std::numeric_limits<uint16_t>::max(); + status = RSMI_STATUS_NOT_SUPPORTED; + } + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read device from sysfs, falling back to KFD" << "\n" + << " ; Device #: " << std::to_string(dv_ind) << "\n" + << " ; ret_kfd: " << std::to_string(ret_kfd) << "\n" + << " ; node: " << std::to_string(node_id) << "\n" + << " ; Data: vendor_id (from KFD)= " << std::to_string(*id) << "\n" + 
<< " ; ret = " << getRSMIStatusString(status, false); + LOG_DEBUG(ss); + } + return status; } rsmi_status_t diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 8df8c10ec7..cf37c1c44b 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1353,35 +1353,44 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i amdsmi_status_t status; amd::smi::AMDSmiSystem::getInstance().init_drm(); - if (gpu_device->check_if_drm_is_supported()) { - status = gpu_device->amdgpu_query_info(AMDGPU_INFO_DEV_INFO, - sizeof(struct drm_amdgpu_info_device), &dev_info); - ss << __PRETTY_FUNCTION__ - << " | amdgpu_query_info(): " - << smi_amdgpu_get_status_string(status, true); - LOG_INFO(ss); - if (status != AMDSMI_STATUS_SUCCESS) { - amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); - return status; - } - SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) - status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name); - if (status != AMDSMI_STATUS_SUCCESS) { - rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, - info->market_name, AMDSMI_256_LENGTH); - } + // removing drm check for now due to drm issues + // if (gpu_device->check_if_drm_is_supported()) { + // status = gpu_device->amdgpu_query_info(AMDGPU_INFO_DEV_INFO, + // sizeof(struct drm_amdgpu_info_device), &dev_info); + // ss << __PRETTY_FUNCTION__ + // << " | amdgpu_query_info(): " + // << smi_amdgpu_get_status_string(status, true); + // LOG_INFO(ss); + // if (status != AMDSMI_STATUS_SUCCESS) { + // amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + // return status; + // } + // SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) + // status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name); + // if (status != AMDSMI_STATUS_SUCCESS) { + // rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, + // info->market_name, AMDSMI_256_LENGTH); + // } - info->device_id = 
dev_info.device_id; - info->rev_id = dev_info.pci_rev; - info->vendor_id = gpu_device->get_vendor_id(); - } else { - status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, - info->market_name, AMDSMI_256_LENGTH); + // info->device_id = dev_info.device_id; + // info->rev_id = dev_info.pci_rev; + // info->vendor_id = gpu_device->get_vendor_id(); + // } else { + uint16_t device_id = std::numeric_limits<uint16_t>::max(); + status = rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0, &device_id); + info->device_id = static_cast<uint64_t>(device_id); - status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, 0, - &vendor_id); - if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id; - } + uint16_t rev_id = std::numeric_limits<uint16_t>::max(); + status = rsmi_wrapper(rsmi_dev_revision_get, processor_handle, 0, &rev_id); + info->rev_id = static_cast<uint32_t>(rev_id); + + status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, + info->market_name, AMDSMI_256_LENGTH); + + status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, 0, + &vendor_id); + if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id; + // } // For other sysfs related information, get from rocm-smi // Ensure asic_serial defaults to an unsupported value