Use the same mutex as rocm-smi

Share the same mutex as the rocm-smi implementation. Handle the crash
that occurs when the user is not in the render group.

Change-Id: I486b26569f9b523b41bbdaf95d51f4a730978cfd
Цей коміт міститься в:
Bill(Shuzhou) Liu
2024-01-11 08:35:04 -06:00
зафіксовано Shuzhou Liu
джерело d74be3120e
коміт 5a6b5d2a0a
3 змінених файлів з 11 додано та 9 видалено
-2
Переглянути файл
@@ -63,7 +63,6 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
if (check_if_drm_is_supported()) this->get_drm_data();
}
~AMDSmiGPUDevice() {
if (check_if_drm_is_supported()) shared_mutex_close(mutex_);
}
amdsmi_status_t get_drm_data();
@@ -91,7 +90,6 @@ class AMDSmiGPUDevice: public AMDSmiProcessor {
amdsmi_bdf_t bdf_;
uint32_t vendor_id_;
AMDSmiDrm& drm_;
shared_mutex_t mutex_;
};
+9 -1
Переглянути файл
@@ -131,6 +131,7 @@ amdsmi_status_t AMDSmiDrm::init() {
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
auto devices = smi.devices();
bool has_valid_fds = false;
for (uint32_t i=0; i < devices.size(); i++) {
auto rocm_smi_device = devices[i];
std::string render_file_name;
@@ -171,6 +172,7 @@ amdsmi_status_t AMDSmiDrm::init() {
continue;
}
has_valid_fds = true;
bdf.fields.function_number = device->businfo.pci->func;
bdf.fields.device_number = device->businfo.pci->dev;
bdf.fields.bus_number = device->businfo.pci->bus;
@@ -182,6 +184,12 @@ amdsmi_status_t AMDSmiDrm::init() {
drm_free_device(&device);
}
// cannot find any valid fds.
if (!has_valid_fds) {
drm_bdfs_.clear();
return AMDSMI_STATUS_INIT_ERROR;
}
return AMDSMI_STATUS_SUCCESS;
}
@@ -315,7 +323,7 @@ std::vector<std::string>& AMDSmiDrm::get_drm_paths() {
}
// Reports whether the DRM path is usable for this AMDSmiDrm instance.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so the
// two returns below are presumably the removed (first) and added (second) versions of one
// statement rather than consecutive code — confirm against the real file.
bool AMDSmiDrm::check_if_drm_is_supported() {
// Presumably the pre-change check: only requires drm_cmd_write_ to be non-null.
return drm_cmd_write_ != NULL ? true : false;
// Presumably the post-change check: additionally requires drm_bdfs_ to be non-empty.
return (drm_cmd_write_ != NULL && drm_bdfs_.size() >0) ? true : false;
}
std::vector<amdsmi_bdf_t> AMDSmiDrm::get_bdfs() {
+2 -6
Переглянути файл
@@ -43,6 +43,7 @@
#include <functional>
#include "amd_smi/impl/amd_smi_gpu_device.h"
#include "rocm_smi/rocm_smi_utils.h"
namespace amd {
@@ -80,11 +81,6 @@ amdsmi_status_t AMDSmiGPUDevice::get_drm_data() {
ret = drm_.get_bdf_by_index(gpu_id_, &bdf);
if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED;
mutex_ = shared_mutex_init(path.c_str(), 0777);
if (mutex_.ptr == nullptr) {
printf("Failed to create shared mem. mutex.");
return AMDSMI_STATUS_INIT_ERROR;
}
bdf_ = bdf, path_ = path, fd_ = fd;
vendor_id_ = drm_.get_vendor_id();
@@ -92,7 +88,7 @@ amdsmi_status_t AMDSmiGPUDevice::get_drm_data() {
}
// Returns the pthread mutex associated with this GPU device.
// NOTE(review): this text looks like a rendered diff with the +/- markers stripped, so the
// two returns below are presumably the removed (first) and added (second) versions of one
// statement rather than consecutive code — confirm against the real file.
pthread_mutex_t* AMDSmiGPUDevice::get_mutex() {
// Presumably the pre-change line: hands back the pointer held in the mutex_ member.
return mutex_.ptr;
// Presumably the post-change line: looks the mutex up via amd::smi::GetMutex using gpu_id_.
return amd::smi::GetMutex(gpu_id_);
}
amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_info(unsigned info_id,