Reduce Load times for Partition CLI (#290)

* Reduced Load times for CLI in partition mode
* Changed rsmi_dev_id_get() to fall back to KFD if the KGD interface does not exist
* Made gpu_device_uuid fall back to rsmi_wrapper
* Moved the enumeration info call in `list` under the `-e` option so it only runs when requested, for more speed
* Made the required-groups check run only once, so it is excluded from recursion

---------

Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Co-authored-by: Charis Poag <Charis.Poag@amd.com>
Co-authored-by: gabrpham_amdeng <Gabriel.Pham@amd.com>

[ROCm/amdsmi commit: 63b13ecb05]
Этот коммит содержится в:
Arif, Maisam
2025-04-22 22:54:43 -05:00
коммит произвёл GitHub
родитель 5c59f20f22
Коммит 05c80e7ace
3 изменённых файлов: 95 добавлений и 38 удалений
+33 -17
Просмотреть файл
@@ -198,7 +198,9 @@ class AMDSMICommands():
if args.gpu == None:
args.gpu = self.device_handles
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
# Handle multiple GPUs
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
@@ -219,15 +221,6 @@ class AMDSMICommands():
except amdsmi_exception.AmdSmiLibraryException as e:
uuid = e.get_error_info()
try:
enumeration_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(args.gpu)
except:
enumeration_info = {"drm_render": "N/A",
"drm_card": "N/A",
"hip_id": "N/A",
"hip_uuid": "N/A",
"hsa_id": "N/A"}
try:
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
kfd_id = kfd_info['kfd_id']
@@ -250,6 +243,14 @@ class AMDSMICommands():
self.logger.store_output(args.gpu, 'partition_id', partition_id)
if args.e:
try:
enumeration_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(args.gpu)
except:
enumeration_info = {"drm_render": "N/A",
"drm_card": "N/A",
"hip_id": "N/A",
"hip_uuid": "N/A",
"hsa_id": "N/A"}
if enumeration_info['drm_render'] == "N/A":
self.logger.store_output(args.gpu, 'render', enumeration_info['drm_render'])
else:
@@ -404,7 +405,9 @@ class AMDSMICommands():
args.vram, args.cache, args.board, args.process_isolation,
args.clock, args.partition]
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
if self.helpers.is_linux() and self.helpers.is_baremetal():
if limit:
@@ -3409,7 +3412,9 @@ class AMDSMICommands():
# Clear the table header
self.logger.table_header = ''.rjust(12)
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
# Populate the possible gpus
topo_values = []
@@ -4698,7 +4703,9 @@ class AMDSMICommands():
if core:
args.core = core
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
# Check if a GPU argument has been set
gpu_args_enabled = False
@@ -4844,7 +4851,9 @@ class AMDSMICommands():
if args.gpu == None:
args.gpu = self.device_handles
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
# Handle multiple GPUs
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.reset)
@@ -5722,7 +5731,9 @@ class AMDSMICommands():
# Clear the table header
self.logger.table_header = ''.rjust(7)
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
# Populate the possible gpus and their bdfs
xgmi_values = []
@@ -5968,7 +5979,9 @@ class AMDSMICommands():
if accelerator:
args.accelerator = accelerator
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
###########################################
# amd-smi partition (no args) #
@@ -6296,7 +6309,10 @@ class AMDSMICommands():
if args.gpu == None:
args.gpu = self.device_handles
self.helpers.check_required_groups()
if not self.group_check_printed:
self.helpers.check_required_groups()
self.group_check_printed = True
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras)
if handled_multiple_gpus:
return
+35 -2
Просмотреть файл
@@ -799,6 +799,9 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) {
DEVICE_MUTEX
std::string str_val;
ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val);
if (ret != RSMI_STATUS_SUCCESS){
return ret;
}
*numa_node = std::stoi(str_val, nullptr);
return ret;
@@ -913,11 +916,41 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
LOG_TRACE(ss);
CHK_SUPPORT_NAME_ONLY(id)
*id = std::numeric_limits<uint16_t>::max();
// Get the device ID from KGD
ret = get_id(dv_ind, amd::smi::kDevDevID, id);
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
LOG_TRACE(ss);
ss << __PRETTY_FUNCTION__
<< (ret == RSMI_STATUS_SUCCESS ?
" | No fall back needed retrieved from KGD" : " | fall back needed")
<< " | Device #: " << std::to_string(dv_ind)
<< " | Data: device_id = " << std::to_string(*id)
<< " | ret = " << getRSMIStatusString(ret, false);
LOG_DEBUG(ss);
// If the device ID is not supported, use KFD's device ID
if (ret != RSMI_STATUS_SUCCESS) {
GET_DEV_AND_KFDNODE_FROM_INDX
uint32_t node_id;
uint64_t kfd_device_id;
int ret_kfd = kfd_node->get_node_id(&node_id);
ret_kfd = amd::smi::read_node_properties(node_id, "device_id", &kfd_device_id);
if (ret_kfd == 0) {
*id = kfd_device_id;
ret = RSMI_STATUS_SUCCESS;
} else {
*id = std::numeric_limits<uint16_t>::max();
ret = RSMI_STATUS_NOT_SUPPORTED;
}
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read device from sysfs, falling back to KFD" << "\n"
<< " ; Device #: " << std::to_string(dv_ind) << "\n"
<< " ; ret_kfd: " << std::to_string(ret_kfd) << "\n"
<< " ; node: " << std::to_string(node_id) << "\n"
<< " ; Data: device_id (from KFD)= " << std::to_string(*id) << "\n"
<< " ; ret = " << getRSMIStatusString(ret, false);
LOG_DEBUG(ss);
}
return ret;
}
+27 -19
Просмотреть файл
@@ -570,31 +570,39 @@ amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle,
char *uuid) {
AMDSMI_CHECK_INIT();
if (uuid_length == nullptr || uuid == nullptr || uuid_length == nullptr || *uuid_length < AMDSMI_GPU_UUID_SIZE) {
if (uuid_length == nullptr || uuid == nullptr || *uuid_length < AMDSMI_GPU_UUID_SIZE) {
return AMDSMI_STATUS_INVAL;
}
amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
if (r != AMDSMI_STATUS_SUCCESS)
return r;
uint64_t device_uuid = 0;
uint16_t device_id = std::numeric_limits<uint16_t>::max();
amdsmi_status_t status;
amdsmi_status_t status = AMDSMI_STATUS_SUCCESS;
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
amdsmi_asic_info_t asic_info = {};
const uint8_t fcn = 0xff;
status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
status = rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0, &device_id);
if (status != AMDSMI_STATUS_SUCCESS) {
printf("Getting asic info failed. Return code: %d", status);
std::ostringstream ss;
ss << __PRETTY_FUNCTION__
<< " | rsmi_dev_id_get(): "
<< smi_amdgpu_get_status_string(status, true);
LOG_INFO(ss);
device_id = std::numeric_limits<uint16_t>::max();
}
status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0,
&device_uuid);
if (status != AMDSMI_STATUS_SUCCESS) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__
<< " | rsmi_dev_unique_id_get(): "
<< smi_amdgpu_get_status_string(status, true);
LOG_INFO(ss);
return status;
}
const uint8_t fcn = 0xff;
/* generate random UUID */
status = amdsmi_uuid_gen(uuid,
strtoull(asic_info.asic_serial, nullptr, 16),
(uint16_t)asic_info.device_id, fcn);
status = amdsmi_uuid_gen(uuid, device_uuid, device_id, fcn);
return status;
}
@@ -648,10 +656,10 @@ amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle,
// Retrieve HIP UUID
std::string hip_uuid_str = "GPU-";
amdsmi_asic_info_t asic_info = {};
status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
char asic_serial[AMDSMI_GPU_UUID_SIZE];
status = amdsmi_get_gpu_device_uuid(processor_handle, 0, asic_serial);
if (status == AMDSMI_STATUS_SUCCESS) {
hip_uuid_str += std::string(asic_info.asic_serial).substr(0, sizeof(info->hip_uuid) - hip_uuid_str.size() - 1);
hip_uuid_str += std::string(asic_serial).substr(0, sizeof(info->hip_uuid) - hip_uuid_str.size() - 1);
std::strncpy(info->hip_uuid, hip_uuid_str.c_str(), sizeof(info->hip_uuid) - 1);
info->hip_uuid[sizeof(info->hip_uuid) - 1] = '\0'; // Ensure null termination
}