Reduce Load times for Partition CLI (#290)
* Reduced Load times for CLI in partition mode
* Change rsmi_dev_id_get() to use KFD, if KGD interface does not exist
* Make gpu_device_uuid fallback to rsmi_wrapper
* Moved enumeration info calls in `list` into the `args.e` branch so they run only when requested, for more speed
* Excluded the required-groups check from recursion so that it runs only once
---------
Signed-off-by: Maisam Arif <Maisam.Arif@amd.com>
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Co-authored-by: Charis Poag <Charis.Poag@amd.com>
Co-authored-by: gabrpham_amdeng <Gabriel.Pham@amd.com>
[ROCm/amdsmi commit: 63b13ecb05]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
5c59f20f22
Коммит
05c80e7ace
@@ -198,7 +198,9 @@ class AMDSMICommands():
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
# Handle multiple GPUs
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.list)
|
||||
@@ -219,15 +221,6 @@ class AMDSMICommands():
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
uuid = e.get_error_info()
|
||||
|
||||
try:
|
||||
enumeration_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(args.gpu)
|
||||
except:
|
||||
enumeration_info = {"drm_render": "N/A",
|
||||
"drm_card": "N/A",
|
||||
"hip_id": "N/A",
|
||||
"hip_uuid": "N/A",
|
||||
"hsa_id": "N/A"}
|
||||
|
||||
try:
|
||||
kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)
|
||||
kfd_id = kfd_info['kfd_id']
|
||||
@@ -250,6 +243,14 @@ class AMDSMICommands():
|
||||
self.logger.store_output(args.gpu, 'partition_id', partition_id)
|
||||
|
||||
if args.e:
|
||||
try:
|
||||
enumeration_info = amdsmi_interface.amdsmi_get_gpu_enumeration_info(args.gpu)
|
||||
except:
|
||||
enumeration_info = {"drm_render": "N/A",
|
||||
"drm_card": "N/A",
|
||||
"hip_id": "N/A",
|
||||
"hip_uuid": "N/A",
|
||||
"hsa_id": "N/A"}
|
||||
if enumeration_info['drm_render'] == "N/A":
|
||||
self.logger.store_output(args.gpu, 'render', enumeration_info['drm_render'])
|
||||
else:
|
||||
@@ -404,7 +405,9 @@ class AMDSMICommands():
|
||||
args.vram, args.cache, args.board, args.process_isolation,
|
||||
args.clock, args.partition]
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
if self.helpers.is_linux() and self.helpers.is_baremetal():
|
||||
if limit:
|
||||
@@ -3409,7 +3412,9 @@ class AMDSMICommands():
|
||||
# Clear the table header
|
||||
self.logger.table_header = ''.rjust(12)
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
# Populate the possible gpus
|
||||
topo_values = []
|
||||
@@ -4698,7 +4703,9 @@ class AMDSMICommands():
|
||||
if core:
|
||||
args.core = core
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
# Check if a GPU argument has been set
|
||||
gpu_args_enabled = False
|
||||
@@ -4844,7 +4851,9 @@ class AMDSMICommands():
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
# Handle multiple GPUs
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.reset)
|
||||
@@ -5722,7 +5731,9 @@ class AMDSMICommands():
|
||||
# Clear the table header
|
||||
self.logger.table_header = ''.rjust(7)
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
# Populate the possible gpus and their bdfs
|
||||
xgmi_values = []
|
||||
@@ -5968,7 +5979,9 @@ class AMDSMICommands():
|
||||
if accelerator:
|
||||
args.accelerator = accelerator
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
###########################################
|
||||
# amd-smi partition (no args) #
|
||||
@@ -6296,7 +6309,10 @@ class AMDSMICommands():
|
||||
if args.gpu == None:
|
||||
args.gpu = self.device_handles
|
||||
|
||||
self.helpers.check_required_groups()
|
||||
if not self.group_check_printed:
|
||||
self.helpers.check_required_groups()
|
||||
self.group_check_printed = True
|
||||
|
||||
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.ras)
|
||||
if handled_multiple_gpus:
|
||||
return
|
||||
|
||||
@@ -799,6 +799,9 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) {
|
||||
DEVICE_MUTEX
|
||||
std::string str_val;
|
||||
ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val);
|
||||
if (ret != RSMI_STATUS_SUCCESS){
|
||||
return ret;
|
||||
}
|
||||
*numa_node = std::stoi(str_val, nullptr);
|
||||
|
||||
return ret;
|
||||
@@ -913,11 +916,41 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
CHK_SUPPORT_NAME_ONLY(id)
|
||||
*id = std::numeric_limits<uint16_t>::max();
|
||||
|
||||
// Get the device ID from KGD
|
||||
ret = get_id(dv_ind, amd::smi::kDevDevID, id);
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
|
||||
LOG_TRACE(ss);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< (ret == RSMI_STATUS_SUCCESS ?
|
||||
" | No fall back needed retrieved from KGD" : " | fall back needed")
|
||||
<< " | Device #: " << std::to_string(dv_ind)
|
||||
<< " | Data: device_id = " << std::to_string(*id)
|
||||
<< " | ret = " << getRSMIStatusString(ret, false);
|
||||
LOG_DEBUG(ss);
|
||||
// If the device ID is not supported, use KFD's device ID
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
GET_DEV_AND_KFDNODE_FROM_INDX
|
||||
uint32_t node_id;
|
||||
uint64_t kfd_device_id;
|
||||
int ret_kfd = kfd_node->get_node_id(&node_id);
|
||||
ret_kfd = amd::smi::read_node_properties(node_id, "device_id", &kfd_device_id);
|
||||
if (ret_kfd == 0) {
|
||||
*id = kfd_device_id;
|
||||
ret = RSMI_STATUS_SUCCESS;
|
||||
} else {
|
||||
*id = std::numeric_limits<uint16_t>::max();
|
||||
ret = RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read device from sysfs, falling back to KFD" << "\n"
|
||||
<< " ; Device #: " << std::to_string(dv_ind) << "\n"
|
||||
<< " ; ret_kfd: " << std::to_string(ret_kfd) << "\n"
|
||||
<< " ; node: " << std::to_string(node_id) << "\n"
|
||||
<< " ; Data: device_id (from KFD)= " << std::to_string(*id) << "\n"
|
||||
<< " ; ret = " << getRSMIStatusString(ret, false);
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -570,31 +570,39 @@ amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle,
|
||||
char *uuid) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
|
||||
if (uuid_length == nullptr || uuid == nullptr || uuid_length == nullptr || *uuid_length < AMDSMI_GPU_UUID_SIZE) {
|
||||
if (uuid_length == nullptr || uuid == nullptr || *uuid_length < AMDSMI_GPU_UUID_SIZE) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
|
||||
amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
|
||||
if (r != AMDSMI_STATUS_SUCCESS)
|
||||
return r;
|
||||
uint64_t device_uuid = 0;
|
||||
uint16_t device_id = std::numeric_limits<uint16_t>::max();
|
||||
amdsmi_status_t status;
|
||||
|
||||
amdsmi_status_t status = AMDSMI_STATUS_SUCCESS;
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
|
||||
|
||||
amdsmi_asic_info_t asic_info = {};
|
||||
const uint8_t fcn = 0xff;
|
||||
|
||||
status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
|
||||
status = rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0, &device_id);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
printf("Getting asic info failed. Return code: %d", status);
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_id_get(): "
|
||||
<< smi_amdgpu_get_status_string(status, true);
|
||||
LOG_INFO(ss);
|
||||
device_id = std::numeric_limits<uint16_t>::max();
|
||||
}
|
||||
|
||||
status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0,
|
||||
&device_uuid);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_unique_id_get(): "
|
||||
<< smi_amdgpu_get_status_string(status, true);
|
||||
LOG_INFO(ss);
|
||||
return status;
|
||||
}
|
||||
|
||||
const uint8_t fcn = 0xff;
|
||||
|
||||
/* generate random UUID */
|
||||
status = amdsmi_uuid_gen(uuid,
|
||||
strtoull(asic_info.asic_serial, nullptr, 16),
|
||||
(uint16_t)asic_info.device_id, fcn);
|
||||
status = amdsmi_uuid_gen(uuid, device_uuid, device_id, fcn);
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -648,10 +656,10 @@ amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle,
|
||||
|
||||
// Retrieve HIP UUID
|
||||
std::string hip_uuid_str = "GPU-";
|
||||
amdsmi_asic_info_t asic_info = {};
|
||||
status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
|
||||
char asic_serial[AMDSMI_GPU_UUID_SIZE];
|
||||
status = amdsmi_get_gpu_device_uuid(processor_handle, 0, asic_serial);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
hip_uuid_str += std::string(asic_info.asic_serial).substr(0, sizeof(info->hip_uuid) - hip_uuid_str.size() - 1);
|
||||
hip_uuid_str += std::string(asic_serial).substr(0, sizeof(info->hip_uuid) - hip_uuid_str.size() - 1);
|
||||
std::strncpy(info->hip_uuid, hip_uuid_str.c_str(), sizeof(info->hip_uuid) - 1);
|
||||
info->hip_uuid[sizeof(info->hip_uuid) - 1] = '\0'; // Ensure null termination
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user