From b58625cafa74f72d04edf1c86cdc1917549f015f Mon Sep 17 00:00:00 2001 From: "Poag, Charis" Date: Tue, 22 Apr 2025 16:27:33 -0500 Subject: [PATCH] [SWDEV-528097] Unique ID fix for missing ID in KGD -> use KFD's (#292) Changes: - Unique Id tries reading from KGD -> falls back to use KFD if not found Change-Id: I05456dd79715e04d83f118b5bb4f1d3612822173 --------- Signed-off-by: Charis Poag Signed-off-by: Arif, Maisam --- include/amd_smi/impl/amd_smi_utils.h | 2 ++ rocm_smi/src/rocm_smi.cc | 36 ++++++++++++++++++++++++++ rocm_smi/src/rocm_smi_kfd.cc | 2 +- src/amd_smi/amd_smi.cc | 38 ++++++++++++++++------------ src/amd_smi/amd_smi_utils.cc | 8 +++--- 5 files changed, 66 insertions(+), 20 deletions(-) diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h index bebdf12a20..4038284730 100644 --- a/include/amd_smi/impl/amd_smi_utils.h +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -54,6 +54,8 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDevice* device, bool *enabled); std::string smi_split_string(std::string str, char delim); std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus); +amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len, + std::string newString); amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask, char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, uint64_t *entry_count, uint64_t *cursor); /** diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 7626b3b64e..56a4de719c 100644 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -4523,11 +4523,47 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) { CHK_SUPPORT_NAME_ONLY(unique_id) DEVICE_MUTEX + if (unique_id == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + *unique_id = std::numeric_limits::max(); ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id); + + ss << __PRETTY_FUNCTION__ + << (ret == RSMI_STATUS_SUCCESS ? + " | No fall back needed retrieved from KGD" : " | fall back needed") + << " | Device #: " << std::to_string(dv_ind) + << " | Data: unique_id = " << std::to_string(*unique_id) + << " | ret = " << getRSMIStatusString(ret, false); + LOG_DEBUG(ss); + // If the unique ID is not supported, use KFD's unique ID + if (ret != RSMI_STATUS_SUCCESS) { + GET_DEV_AND_KFDNODE_FROM_INDX + uint32_t node_id; + uint64_t kfd_unique_id; + int ret_kfd = kfd_node->get_node_id(&node_id); + ret_kfd = amd::smi::read_node_properties(node_id, "unique_id", &kfd_unique_id); + if (ret_kfd == 0) { + *unique_id = kfd_unique_id; + ret = RSMI_STATUS_SUCCESS; + } else { + *unique_id = std::numeric_limits::max(); + ret = RSMI_STATUS_NOT_SUPPORTED; + } + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read unique_id from sysfs, falling back to KFD" << "\n" + << " ; Device #: " << std::to_string(dv_ind) << "\n" + << " ; ret_kfd: " << std::to_string(ret_kfd) << "\n" + << " ; node: " << std::to_string(node_id) << "\n" + << " ; Data: unique_id (from KFD)= " << std::to_string(*unique_id) << "\n" + << " ; ret = " << getRSMIStatusString(ret, false); + LOG_DEBUG(ss); + } return ret; CATCH } + rsmi_status_t rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type, rsmi_event_handle_t *evnt_handle) { diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc index 24c986e75a..070e7a6f7b 100644 --- a/rocm_smi/src/rocm_smi_kfd.cc +++ b/rocm_smi/src/rocm_smi_kfd.cc @@ -1134,6 +1134,7 @@ int KFDNode::get_node_id(uint32_t *node_id) { int ret = 0; std::string nodeid_path = "/sys/class/kfd/kfd/topology/nodes/" + std::to_string(this->node_indx_); + *node_id = this->node_indx_; ss << __PRETTY_FUNCTION__ << " | File: " << nodeid_path << " | Read node #: " << std::to_string(this->node_indx_) @@ -1141,7 +1142,6 @@ int KFDNode::get_node_id(uint32_t *node_id) { << " | Return: " << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false) << " | "; - *node_id = this->node_indx_; LOG_DEBUG(ss); return ret; } diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 9ee0ef4728..d06e9d53a5 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -1341,16 +1341,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); return status; } - SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) - - std::string path = "/sys/class/drm/" + gpu_device->get_gpu_path() + "/device/unique_id"; - FILE *fp = fopen(path.c_str(), "r"); - if (fp) { - fscanf(fp, "%s", info->asic_serial); - fclose(fp); - } - status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name); if (status != AMDSMI_STATUS_SUCCESS) { rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, @@ -1361,13 +1352,6 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i info->rev_id = dev_info.pci_rev; info->vendor_id = gpu_device->get_vendor_id(); } else { - uint64_t dv_uid = 0; - status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, - &dv_uid); - if (status == AMDSMI_STATUS_SUCCESS) { - snprintf(info->asic_serial, sizeof(info->asic_serial), "%lu", dv_uid); - } - status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0, info->market_name, AMDSMI_256_LENGTH); @@ -1376,6 +1360,28 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id; } // For other sysfs related information, get from rocm-smi + + // Ensure asic_serial defaults to an unsupported value + std::string max_uint64_str = "ffffffffffffffff"; + smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH, max_uint64_str); + uint64_t dv_uid = 0; + status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, &dv_uid); + if (status == AMDSMI_STATUS_SUCCESS) { + ss.clear(); + ss << std::hex << dv_uid; + std::string asic_serial_str = ss.str(); + ss.clear(); + smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH, + asic_serial_str); + ss << __PRETTY_FUNCTION__ + << " | Retrieved unique_id from rsmi: " << processor_handle << "\n" + << " ; Successfully fell back to KFD's unique_id... \n" + << " ; info->asic_serial (hex): " << info->asic_serial << "\n" + << " ; info->asic_serial (dec): " << std::dec + << static_cast(std::stoull(asic_serial_str, nullptr, 16)); + LOG_INFO(ss); + } + status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, processor_handle, 0, &subvendor_id); if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = subvendor_id; diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc index ede1948f42..1ae35d4a1b 100644 --- a/src/amd_smi/amd_smi_utils.cc +++ b/src/amd_smi/amd_smi_utils.cc @@ -97,7 +97,8 @@ std::string removeString(const std::string origStr, return modifiedStr; } -static void clearCharBufferAndReinitialize(char buffer[], uint32_t len, std::string newString) { +amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len, + std::string newString) { char *begin = &buffer[0]; char *end = &buffer[len]; std::fill(begin, end, 0); @@ -108,14 +109,15 @@ static void clearCharBufferAndReinitialize(char buffer[], uint32_t len, std::str std::memcpy(buffer, newString.c_str(), copy_len); } buffer[copy_len] = '\0'; - } + return AMDSMI_STATUS_SUCCESS; +} int openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff, bool trim_whitespace = true) { bool errorDiscovered = false; std::ifstream file(path, std::ifstream::in); std::string contents = {std::istreambuf_iterator{file}, std::istreambuf_iterator{}}; - clearCharBufferAndReinitialize(buff, static_cast(sizeOfBuff), contents); + smi_clear_char_and_reinitialize(buff, static_cast(sizeOfBuff), contents); if (!file.is_open()) { errorDiscovered = true; } else {