[SWDEV-528097] Unique ID fix for missing ID in KGD -> use KFD's (#292)
Changes:
- Unique Id tries reading from KGD
-> falls back to use KFD if not found
Change-Id: I05456dd79715e04d83f118b5bb4f1d3612822173
---------
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
bd20b1b150
Коммит
b58625cafa
@@ -54,6 +54,8 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice
|
||||
amdsmi_status_t smi_amdgpu_is_gpu_power_management_enabled(amd::smi::AMDSmiGPUDevice* device, bool *enabled);
|
||||
std::string smi_split_string(std::string str, char delim);
|
||||
std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus);
|
||||
amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len,
|
||||
std::string newString);
|
||||
amdsmi_status_t amdsmi_get_gpu_cper_entries_by_path(const char *amdgpu_ring_cper_file, uint32_t severity_mask, char *cper_data, uint64_t *buf_size, amdsmi_cper_hdr_t **cper_hdrs, uint64_t *entry_count, uint64_t *cursor);
|
||||
|
||||
/**
|
||||
|
||||
@@ -4523,11 +4523,47 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) {
|
||||
CHK_SUPPORT_NAME_ONLY(unique_id)
|
||||
|
||||
DEVICE_MUTEX
|
||||
if (unique_id == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
*unique_id = std::numeric_limits<uint64_t>::max();
|
||||
ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id);
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< (ret == RSMI_STATUS_SUCCESS ?
|
||||
" | No fall back needed retrieved from KGD" : " | fall back needed")
|
||||
<< " | Device #: " << std::to_string(dv_ind)
|
||||
<< " | Data: unique_id = " << std::to_string(*unique_id)
|
||||
<< " | ret = " << getRSMIStatusString(ret, false);
|
||||
LOG_DEBUG(ss);
|
||||
// If the unique ID is not supported, use KFD's unique ID
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
GET_DEV_AND_KFDNODE_FROM_INDX
|
||||
uint32_t node_id;
|
||||
uint64_t kfd_unique_id;
|
||||
int ret_kfd = kfd_node->get_node_id(&node_id);
|
||||
ret_kfd = amd::smi::read_node_properties(node_id, "unique_id", &kfd_unique_id);
|
||||
if (ret_kfd == 0) {
|
||||
*unique_id = kfd_unique_id;
|
||||
ret = RSMI_STATUS_SUCCESS;
|
||||
} else {
|
||||
*unique_id = std::numeric_limits<uint64_t>::max();
|
||||
ret = RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read unique_id from sysfs, falling back to KFD" << "\n"
|
||||
<< " ; Device #: " << std::to_string(dv_ind) << "\n"
|
||||
<< " ; ret_kfd: " << std::to_string(ret_kfd) << "\n"
|
||||
<< " ; node: " << std::to_string(node_id) << "\n"
|
||||
<< " ; Data: unique_id (from KFD)= " << std::to_string(*unique_id) << "\n"
|
||||
<< " ; ret = " << getRSMIStatusString(ret, false);
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
return ret;
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type,
|
||||
rsmi_event_handle_t *evnt_handle) {
|
||||
|
||||
@@ -1134,6 +1134,7 @@ int KFDNode::get_node_id(uint32_t *node_id) {
|
||||
int ret = 0;
|
||||
std::string nodeid_path = "/sys/class/kfd/kfd/topology/nodes/"
|
||||
+ std::to_string(this->node_indx_);
|
||||
*node_id = this->node_indx_;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | File: " << nodeid_path
|
||||
<< " | Read node #: " << std::to_string(this->node_indx_)
|
||||
@@ -1141,7 +1142,6 @@ int KFDNode::get_node_id(uint32_t *node_id) {
|
||||
<< " | Return: "
|
||||
<< getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false)
|
||||
<< " | ";
|
||||
*node_id = this->node_indx_;
|
||||
LOG_DEBUG(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1341,16 +1341,7 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
amd::smi::AMDSmiSystem::getInstance().clean_up_drm();
|
||||
return status;
|
||||
}
|
||||
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
|
||||
|
||||
std::string path = "/sys/class/drm/" + gpu_device->get_gpu_path() + "/device/unique_id";
|
||||
FILE *fp = fopen(path.c_str(), "r");
|
||||
if (fp) {
|
||||
fscanf(fp, "%s", info->asic_serial);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0,
|
||||
@@ -1361,13 +1352,6 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
info->rev_id = dev_info.pci_rev;
|
||||
info->vendor_id = gpu_device->get_vendor_id();
|
||||
} else {
|
||||
uint64_t dv_uid = 0;
|
||||
status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0,
|
||||
&dv_uid);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
snprintf(info->asic_serial, sizeof(info->asic_serial), "%lu", dv_uid);
|
||||
}
|
||||
|
||||
status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0,
|
||||
info->market_name, AMDSMI_256_LENGTH);
|
||||
|
||||
@@ -1376,6 +1360,28 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id;
|
||||
}
|
||||
// For other sysfs related information, get from rocm-smi
|
||||
|
||||
// Ensure asic_serial defaults to an unsupported value
|
||||
std::string max_uint64_str = "ffffffffffffffff";
|
||||
smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH, max_uint64_str);
|
||||
uint64_t dv_uid = 0;
|
||||
status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, &dv_uid);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
ss.clear();
|
||||
ss << std::hex << dv_uid;
|
||||
std::string asic_serial_str = ss.str();
|
||||
ss.clear();
|
||||
smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH,
|
||||
asic_serial_str);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Retrieved unique_id from rsmi: " << processor_handle << "\n"
|
||||
<< " ; Successfully fell back to KFD's unique_id... \n"
|
||||
<< " ; info->asic_serial (hex): " << info->asic_serial << "\n"
|
||||
<< " ; info->asic_serial (dec): " << std::dec
|
||||
<< static_cast<uint64_t>(std::stoull(asic_serial_str, nullptr, 16));
|
||||
LOG_INFO(ss);
|
||||
}
|
||||
|
||||
status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, processor_handle, 0,
|
||||
&subvendor_id);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = subvendor_id;
|
||||
|
||||
@@ -97,7 +97,8 @@ std::string removeString(const std::string origStr,
|
||||
return modifiedStr;
|
||||
}
|
||||
|
||||
static void clearCharBufferAndReinitialize(char buffer[], uint32_t len, std::string newString) {
|
||||
amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len,
|
||||
std::string newString) {
|
||||
char *begin = &buffer[0];
|
||||
char *end = &buffer[len];
|
||||
std::fill(begin, end, 0);
|
||||
@@ -108,14 +109,15 @@ static void clearCharBufferAndReinitialize(char buffer[], uint32_t len, std::str
|
||||
std::memcpy(buffer, newString.c_str(), copy_len);
|
||||
}
|
||||
buffer[copy_len] = '\0';
|
||||
}
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
int openFileAndModifyBuffer(std::string path, char *buff, size_t sizeOfBuff,
|
||||
bool trim_whitespace = true) {
|
||||
bool errorDiscovered = false;
|
||||
std::ifstream file(path, std::ifstream::in);
|
||||
std::string contents = {std::istreambuf_iterator<char>{file}, std::istreambuf_iterator<char>{}};
|
||||
clearCharBufferAndReinitialize(buff, static_cast<uint32_t>(sizeOfBuff), contents);
|
||||
smi_clear_char_and_reinitialize(buff, static_cast<uint32_t>(sizeOfBuff), contents);
|
||||
if (!file.is_open()) {
|
||||
errorDiscovered = true;
|
||||
} else {
|
||||
|
||||
Ссылка в новой задаче
Block a user