Don't use hwmon# as indicator of gpu
Previously, during the rsmi_init discovery process, the existence of an hwmon# directory was used to distinguish between gpus nodes and non-gpu nodes. This isn't reliable in some scenarios. Instead, the existence of the vbios_version file is used as an indicator that the node is indeed a gpu. Change-Id: Icfbe5c42ed0970077b05f25c3d209308a31bec85
This commit is contained in:
@@ -52,13 +52,13 @@
|
||||
|
||||
#define CHECK_DV_IND_RANGE \
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \
|
||||
if (dv_ind >= smi.monitor_devices().size()) { \
|
||||
if (dv_ind >= smi.devices().size()) { \
|
||||
return RSMI_STATUS_INVALID_ARGS; \
|
||||
} \
|
||||
|
||||
#define GET_DEV_FROM_INDX \
|
||||
CHECK_DV_IND_RANGE \
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind]; \
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind]; \
|
||||
assert(dev != nullptr);
|
||||
|
||||
|
||||
|
||||
@@ -76,7 +76,8 @@ class RocmSMI {
|
||||
void Cleanup(void);
|
||||
|
||||
std::vector<std::shared_ptr<amd::smi::Device>>&
|
||||
monitor_devices() {return monitor_devices_;}
|
||||
devices() {return devices_;}
|
||||
|
||||
uint32_t DiscoverAmdgpuDevices(void);
|
||||
int DiscoverAMDPowerMonitors(bool force_update = false);
|
||||
|
||||
@@ -123,9 +124,7 @@ class RocmSMI {
|
||||
std::map<uint32_t, uint32_t> dev_ind_to_node_ind_map_;
|
||||
void AddToDeviceList(std::string dev_name);
|
||||
void GetEnvVariables(void);
|
||||
uint32_t DiscoverAMDMonitors(void);
|
||||
|
||||
std::vector<std::shared_ptr<amd::smi::Device>> monitor_devices_;
|
||||
std::shared_ptr<Monitor> FindMonitor(std::string monitor_path);
|
||||
|
||||
RocmSMI_env_vars env_vars_;
|
||||
uint64_t init_options_;
|
||||
|
||||
+17
-17
@@ -391,7 +391,7 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type,
|
||||
uint32_t dv_ind, uint64_t *val) {
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
|
||||
if (dv_ind >= smi.monitor_devices().size() || val == nullptr) {
|
||||
if (dv_ind >= smi.devices().size() || val == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
|
||||
@@ -400,7 +400,7 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type,
|
||||
return amd::smi::ErrnoToRsmiStatus(ret);
|
||||
}
|
||||
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
|
||||
assert(dev != nullptr);
|
||||
assert(dev->monitor() != nullptr);
|
||||
|
||||
@@ -462,9 +462,9 @@ rsmi_shut_down(void) {
|
||||
#if DEBUG
|
||||
int ret = 0;
|
||||
#endif
|
||||
for (uint32_t i = 0; i < smi.monitor_devices().size(); ++i) {
|
||||
for (uint32_t i = 0; i < smi.devices().size(); ++i) {
|
||||
#if DEBUG
|
||||
ret = pthread_mutex_unlock(smi.monitor_devices()[i]->mutex());
|
||||
ret = pthread_mutex_unlock(smi.devices()[i]->mutex());
|
||||
if (ret != EPERM) { // We expect to get EPERM if the lock has already
|
||||
// been released
|
||||
if (ret == 0) {
|
||||
@@ -476,7 +476,7 @@ rsmi_shut_down(void) {
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)pthread_mutex_unlock(smi.monitor_devices()[i]->mutex());
|
||||
(void)pthread_mutex_unlock(smi.devices()[i]->mutex());
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -499,7 +499,7 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
|
||||
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
|
||||
*num_devices = static_cast<uint32_t>(smi.monitor_devices().size());
|
||||
*num_devices = static_cast<uint32_t>(smi.devices().size());
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
CATCH
|
||||
}
|
||||
@@ -1458,13 +1458,13 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
|
||||
|
||||
// Above call to rsmi_dev_get_gpu_clk_freq should have emitted an error if
|
||||
// assert below is not true
|
||||
assert(dv_ind < smi.monitor_devices().size());
|
||||
assert(dv_ind < smi.devices().size());
|
||||
|
||||
std::string freq_enable_str =
|
||||
bitfield_to_freq_string(freq_bitmask, freqs.num_supported);
|
||||
|
||||
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
|
||||
assert(dev != nullptr);
|
||||
|
||||
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
|
||||
@@ -1912,12 +1912,12 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) {
|
||||
|
||||
// Above call to rsmi_dev_pci_bandwidth_get() should have emitted an error
|
||||
// if assert below is not true
|
||||
assert(dv_ind < smi.monitor_devices().size());
|
||||
assert(dv_ind < smi.devices().size());
|
||||
|
||||
std::string freq_enable_str =
|
||||
bitfield_to_freq_string(bw_bitmask, bws.transfer_rate.num_supported);
|
||||
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
|
||||
assert(dev != nullptr);
|
||||
|
||||
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
|
||||
@@ -2994,7 +2994,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
|
||||
}
|
||||
|
||||
*num_devices = static_cast<uint32_t>(gpu_set.size());
|
||||
if (gpu_set.size() > smi.monitor_devices().size()) {
|
||||
if (gpu_set.size() > smi.devices().size()) {
|
||||
return RSMI_STATUS_UNEXPECTED_SIZE;
|
||||
}
|
||||
|
||||
@@ -3672,11 +3672,11 @@ rsmi_event_notification_get(int timeout_ms,
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
std::vector<uint32_t> fd_indx_to_dev_id;
|
||||
|
||||
for (uint32_t i = 0; i < smi.monitor_devices().size(); ++i) {
|
||||
if (smi.monitor_devices()[i]->evt_notif_anon_fd() == -1) {
|
||||
for (uint32_t i = 0; i < smi.devices().size(); ++i) {
|
||||
if (smi.devices()[i]->evt_notif_anon_fd() == -1) {
|
||||
continue;
|
||||
}
|
||||
fds.push_back({smi.monitor_devices()[i]->evt_notif_anon_fd(),
|
||||
fds.push_back({smi.devices()[i]->evt_notif_anon_fd(),
|
||||
POLLIN | POLLRDNORM, 0});
|
||||
fd_indx_to_dev_id.push_back(i);
|
||||
}
|
||||
@@ -3694,7 +3694,7 @@ rsmi_event_notification_get(int timeout_ms,
|
||||
}
|
||||
|
||||
FILE *anon_fp =
|
||||
smi.monitor_devices()[fd_indx_to_dev_id[i]]->evt_notif_anon_file_ptr();
|
||||
smi.devices()[fd_indx_to_dev_id[i]]->evt_notif_anon_file_ptr();
|
||||
data_item =
|
||||
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);
|
||||
|
||||
@@ -3754,7 +3754,7 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
// close(dev->evt_notif_anon_fd());
|
||||
FILE *anon_fp = smi.monitor_devices()[dv_ind]->evt_notif_anon_file_ptr();
|
||||
FILE *anon_fp = smi.devices()[dv_ind]->evt_notif_anon_file_ptr();
|
||||
fclose(anon_fp);
|
||||
assert(errno == 0 || errno == EAGAIN);
|
||||
dev->set_evt_notif_anon_file_ptr(nullptr);
|
||||
@@ -3800,7 +3800,7 @@ rsmi_test_refcount(uint64_t refcnt_type) {
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex());
|
||||
|
||||
if (smi.ref_count() == 0 && smi.monitor_devices().size() != 0) {
|
||||
if (smi.ref_count() == 0 && smi.devices().size() != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
@@ -175,8 +175,8 @@ Event::Event(rsmi_event_type_t event, uint32_t dev_ind) :
|
||||
|
||||
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
assert(dev_ind < smi.monitor_devices().size());
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dev_ind];
|
||||
assert(dev_ind < smi.devices().size());
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dev_ind];
|
||||
assert(dev != nullptr);
|
||||
|
||||
|
||||
|
||||
@@ -709,7 +709,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
|
||||
size_t num = fread(p_binary_data, b_size, 1, ptr);
|
||||
fclose(ptr);
|
||||
if((num*b_size) != b_size){
|
||||
if ((num*b_size) != b_size) {
|
||||
return ENOENT;
|
||||
}
|
||||
return 0;
|
||||
|
||||
+84
-115
@@ -118,10 +118,6 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
|
||||
return static_cast<uint32_t>(drm_minor);
|
||||
}
|
||||
|
||||
static int SameDevice(const std::string fileA, const std::string fileB) {
|
||||
return SameFile(fileA + "/device", fileB + "/device");
|
||||
}
|
||||
|
||||
// Determine if provided string is a bdfid pci path directory of the form
|
||||
// XXXX:XX:XX.X,
|
||||
// domain:bus:device.function
|
||||
@@ -217,28 +213,6 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
|
||||
|
||||
return 1;
|
||||
}
|
||||
// Call-back function to append to a vector of Devices
|
||||
static uint32_t GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
|
||||
void *p) {
|
||||
std::string val_str;
|
||||
uint64_t bdfid;
|
||||
|
||||
assert(p != nullptr);
|
||||
|
||||
std::vector<std::shared_ptr<amd::smi::Device>> *device_list =
|
||||
reinterpret_cast<std::vector<std::shared_ptr<amd::smi::Device>> *>(p);
|
||||
|
||||
if (d->monitor() != nullptr) {
|
||||
// Calculate BDFID and set for this device
|
||||
if (ConstructBDFID(d->path(), &bdfid) != 0) {
|
||||
std::cerr << "Failed to construct BDFID." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
d->set_bdfid(bdfid);
|
||||
device_list->push_back(d);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
RocmSMI::Initialize(uint64_t flags) {
|
||||
@@ -273,11 +247,14 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
"DiscoverAmdgpuDevices() failed.");
|
||||
}
|
||||
|
||||
// IterateSMIDevices will iterate through all the known devices and apply
|
||||
// the provided call-back to each device found.
|
||||
ret = IterateSMIDevices(GetMonitorDevices,
|
||||
reinterpret_cast<void *>(&monitor_devices_));
|
||||
|
||||
uint64_t bdfid;
|
||||
for (uint32_t i = 0; i < devices_.size(); ++i) {
|
||||
if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) {
|
||||
std::cerr << "Failed to construct BDFID." << std::endl;
|
||||
ret = 1;
|
||||
}
|
||||
devices_[i]->set_bdfid(bdfid);
|
||||
}
|
||||
if (ret != 0) {
|
||||
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
|
||||
"Failed to initialize rocm_smi library (amdgpu node discovery).");
|
||||
@@ -305,11 +282,11 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
|
||||
// Remove any drm nodes that don't have a corresponding readable kfd node.
|
||||
// kfd nodes will not be added if their properties file is not readable.
|
||||
auto dev_iter = monitor_devices_.begin();
|
||||
while (dev_iter != monitor_devices_.end()) {
|
||||
auto dev_iter = devices_.begin();
|
||||
while (dev_iter != devices_.end()) {
|
||||
uint64_t bdfid = (*dev_iter)->bdfid();
|
||||
if (tmp_map.find(bdfid) == tmp_map.end()) {
|
||||
dev_iter = monitor_devices_.erase(dev_iter);
|
||||
dev_iter = devices_.erase(dev_iter);
|
||||
continue;
|
||||
}
|
||||
dev_iter++;
|
||||
@@ -318,8 +295,8 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
// 1. construct kfd_node_map_ with gpu_id as key and *Device as value
|
||||
// 2. for each kfd node, write the corresponding dv_ind
|
||||
// 3. for each amdgpu device, write the corresponding gpu_id
|
||||
for (uint32_t dv_ind = 0; dv_ind < monitor_devices_.size(); ++dv_ind) {
|
||||
dev = monitor_devices_[dv_ind];
|
||||
for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) {
|
||||
dev = devices_[dv_ind];
|
||||
uint64_t bdfid = dev->bdfid();
|
||||
assert(tmp_map.find(bdfid) != tmp_map.end());
|
||||
if (tmp_map.find(bdfid) == tmp_map.end()) {
|
||||
@@ -337,7 +314,6 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
|
||||
void
|
||||
RocmSMI::Cleanup() {
|
||||
monitor_devices_.clear();
|
||||
devices_.clear();
|
||||
monitors_.clear();
|
||||
|
||||
@@ -399,30 +375,80 @@ void RocmSMI::GetEnvVariables(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
std::shared_ptr<Monitor>
|
||||
RocmSMI::FindMonitor(std::string monitor_path) {
|
||||
std::string tmp;
|
||||
std::string err_msg;
|
||||
std::string mon_name;
|
||||
std::shared_ptr<Monitor> m;
|
||||
|
||||
if (!FileExists(monitor_path.c_str())) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto mon_dir = opendir(monitor_path.c_str());
|
||||
|
||||
if (mon_dir == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
auto dentry = readdir(mon_dir);
|
||||
|
||||
while (dentry != nullptr) {
|
||||
if (dentry->d_name[0] == '.') {
|
||||
dentry = readdir(mon_dir);
|
||||
continue;
|
||||
}
|
||||
|
||||
mon_name = monitor_path;
|
||||
mon_name += "/";
|
||||
mon_name += dentry->d_name;
|
||||
tmp = mon_name + "/name";
|
||||
|
||||
if (FileExists(tmp.c_str())) {
|
||||
std::ifstream fs;
|
||||
fs.open(tmp);
|
||||
|
||||
if (!fs.is_open()) {
|
||||
err_msg = "Failed to open monitor file ";
|
||||
err_msg += tmp;
|
||||
err_msg += ".";
|
||||
perror(err_msg.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
std::string mon_type;
|
||||
fs >> mon_type;
|
||||
fs.close();
|
||||
|
||||
if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) {
|
||||
m = std::shared_ptr<Monitor>(new Monitor(mon_name, &env_vars_));
|
||||
m->setTempSensorLabelMap();
|
||||
m->setVoltSensorLabelMap();
|
||||
break;
|
||||
}
|
||||
}
|
||||
dentry = readdir(mon_dir);
|
||||
}
|
||||
|
||||
if (closedir(mon_dir)) {
|
||||
err_msg = "Failed to close monitor directory ";
|
||||
err_msg += kPathHWMonRoot;
|
||||
err_msg += ".";
|
||||
perror(err_msg.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return m;
|
||||
}
|
||||
void
|
||||
RocmSMI::AddToDeviceList(std::string dev_name) {
|
||||
auto ret = 0;
|
||||
|
||||
auto dev_path = std::string(kPathDRMRoot);
|
||||
dev_path += "/";
|
||||
dev_path += dev_name;
|
||||
|
||||
auto dev = std::shared_ptr<Device>(new Device(dev_path, &env_vars_));
|
||||
|
||||
auto m = monitors_.begin();
|
||||
|
||||
while (m != monitors_.end()) {
|
||||
ret = SameDevice(dev->path(), (*m)->path());
|
||||
|
||||
if (ret == 0) {
|
||||
dev->set_monitor(*m);
|
||||
m = monitors_.erase(m);
|
||||
break;
|
||||
} else {
|
||||
assert(ret == 1);
|
||||
++m;
|
||||
}
|
||||
}
|
||||
std::shared_ptr<Monitor> m = FindMonitor(dev_path + "/device/hwmon");
|
||||
dev->set_monitor(m);
|
||||
|
||||
std::string d_name = dev_name;
|
||||
uint32_t card_indx = GetDeviceIndex(d_name);
|
||||
@@ -439,10 +465,15 @@ static const uint32_t kAmdGpuId = 0x1002;
|
||||
|
||||
static bool isAMDGPU(std::string dev_path) {
|
||||
std::string vend_path = dev_path + "/device/vendor";
|
||||
std::string vbios_v_path = dev_path + "/device/vbios_version";
|
||||
if (!FileExists(vend_path.c_str())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!FileExists(vbios_v_path.c_str())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::ifstream fs;
|
||||
fs.open(vend_path);
|
||||
|
||||
@@ -463,7 +494,6 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
}
|
||||
|
||||
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
uint32_t ret = 0;
|
||||
std::string err_msg;
|
||||
uint32_t count = 0;
|
||||
|
||||
@@ -471,12 +501,6 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
devices_.clear();
|
||||
monitors_.clear();
|
||||
|
||||
ret = DiscoverAMDMonitors();
|
||||
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
auto drm_dir = opendir(kPathDRMRoot);
|
||||
if (drm_dir == nullptr) {
|
||||
err_msg = "Failed to open drm root directory ";
|
||||
@@ -521,61 +545,6 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t RocmSMI::DiscoverAMDMonitors(void) {
|
||||
auto mon_dir = opendir(kPathHWMonRoot);
|
||||
|
||||
auto dentry = readdir(mon_dir);
|
||||
|
||||
std::string mon_name;
|
||||
std::string tmp;
|
||||
std::string err_msg;
|
||||
|
||||
while (dentry != nullptr) {
|
||||
if (dentry->d_name[0] == '.') {
|
||||
dentry = readdir(mon_dir);
|
||||
continue;
|
||||
}
|
||||
|
||||
mon_name = kPathHWMonRoot;
|
||||
mon_name += "/";
|
||||
mon_name += dentry->d_name;
|
||||
tmp = mon_name + "/name";
|
||||
|
||||
if (FileExists(tmp.c_str())) {
|
||||
std::ifstream fs;
|
||||
fs.open(tmp);
|
||||
|
||||
if (!fs.is_open()) {
|
||||
err_msg = "Failed to open monitor file ";
|
||||
err_msg += tmp;
|
||||
err_msg += ".";
|
||||
perror(err_msg.c_str());
|
||||
return 1;
|
||||
}
|
||||
std::string mon_type;
|
||||
fs >> mon_type;
|
||||
fs.close();
|
||||
|
||||
if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) {
|
||||
std::shared_ptr<Monitor> m =
|
||||
std::shared_ptr<Monitor>(new Monitor(mon_name, &env_vars_));
|
||||
m->setTempSensorLabelMap();
|
||||
m->setVoltSensorLabelMap();
|
||||
monitors_.push_back(m);
|
||||
}
|
||||
}
|
||||
dentry = readdir(mon_dir);
|
||||
}
|
||||
|
||||
if (closedir(mon_dir)) {
|
||||
err_msg = "Failed to close monitor directory ";
|
||||
err_msg += kPathHWMonRoot;
|
||||
err_msg += ".";
|
||||
perror(err_msg.c_str());
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Since these sysfs files require sudo access, we won't discover them
|
||||
// with rsmi_init() (and thus always require the user to use "sudo".
|
||||
|
||||
@@ -183,10 +183,10 @@ rsmi_status_t handleException() {
|
||||
pthread_mutex_t *GetMutex(uint32_t dv_ind) {
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
|
||||
if (dv_ind >= smi.monitor_devices().size()) {
|
||||
if (dv_ind >= smi.devices().size()) {
|
||||
return nullptr;
|
||||
}
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
|
||||
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
|
||||
assert(dev != nullptr);
|
||||
|
||||
return dev->mutex();
|
||||
|
||||
Reference in New Issue
Block a user