Don't use hwmon# as indicator of gpu

Previously, during the rsmi_init discovery process, the existence
of an hwmon# directory was used to distinguish between gpus nodes
and non-gpu nodes. This isn't reliable in some scenarios. Instead,
the existence of the vbios_version file is used as an
indicator that the node is indeed a gpu.

Change-Id: Icfbe5c42ed0970077b05f25c3d209308a31bec85
This commit is contained in:
Chris Freehill
2021-01-28 18:25:53 -06:00
rodzic 20e2d260fb
commit ff9546aa62
7 zmienionych plików z 111 dodań i 143 usunięć
+2 -2
Wyświetl plik
@@ -52,13 +52,13 @@
#define CHECK_DV_IND_RANGE \
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \
if (dv_ind >= smi.monitor_devices().size()) { \
if (dv_ind >= smi.devices().size()) { \
return RSMI_STATUS_INVALID_ARGS; \
} \
#define GET_DEV_FROM_INDX \
CHECK_DV_IND_RANGE \
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind]; \
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind]; \
assert(dev != nullptr);
+3 -4
Wyświetl plik
@@ -76,7 +76,8 @@ class RocmSMI {
void Cleanup(void);
std::vector<std::shared_ptr<amd::smi::Device>>&
monitor_devices() {return monitor_devices_;}
devices() {return devices_;}
uint32_t DiscoverAmdgpuDevices(void);
int DiscoverAMDPowerMonitors(bool force_update = false);
@@ -123,9 +124,7 @@ class RocmSMI {
std::map<uint32_t, uint32_t> dev_ind_to_node_ind_map_;
void AddToDeviceList(std::string dev_name);
void GetEnvVariables(void);
uint32_t DiscoverAMDMonitors(void);
std::vector<std::shared_ptr<amd::smi::Device>> monitor_devices_;
std::shared_ptr<Monitor> FindMonitor(std::string monitor_path);
RocmSMI_env_vars env_vars_;
uint64_t init_options_;
+17 -17
Wyświetl plik
@@ -391,7 +391,7 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type,
uint32_t dv_ind, uint64_t *val) {
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
if (dv_ind >= smi.monitor_devices().size() || val == nullptr) {
if (dv_ind >= smi.devices().size() || val == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
@@ -400,7 +400,7 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type,
return amd::smi::ErrnoToRsmiStatus(ret);
}
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
assert(dev != nullptr);
assert(dev->monitor() != nullptr);
@@ -462,9 +462,9 @@ rsmi_shut_down(void) {
#if DEBUG
int ret = 0;
#endif
for (uint32_t i = 0; i < smi.monitor_devices().size(); ++i) {
for (uint32_t i = 0; i < smi.devices().size(); ++i) {
#if DEBUG
ret = pthread_mutex_unlock(smi.monitor_devices()[i]->mutex());
ret = pthread_mutex_unlock(smi.devices()[i]->mutex());
if (ret != EPERM) { // We expect to get EPERM if the lock has already
// been released
if (ret == 0) {
@@ -476,7 +476,7 @@ rsmi_shut_down(void) {
}
}
#else
(void)pthread_mutex_unlock(smi.monitor_devices()[i]->mutex());
(void)pthread_mutex_unlock(smi.devices()[i]->mutex());
#endif
}
@@ -499,7 +499,7 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
*num_devices = static_cast<uint32_t>(smi.monitor_devices().size());
*num_devices = static_cast<uint32_t>(smi.devices().size());
return RSMI_STATUS_SUCCESS;
CATCH
}
@@ -1458,13 +1458,13 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind,
// Above call to rsmi_dev_get_gpu_clk_freq should have emitted an error if
// assert below is not true
assert(dv_ind < smi.monitor_devices().size());
assert(dv_ind < smi.devices().size());
std::string freq_enable_str =
bitfield_to_freq_string(freq_bitmask, freqs.num_supported);
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
assert(dev != nullptr);
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
@@ -1912,12 +1912,12 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) {
// Above call to rsmi_dev_pci_bandwidth_get() should have emitted an error
// if assert below is not true
assert(dv_ind < smi.monitor_devices().size());
assert(dv_ind < smi.devices().size());
std::string freq_enable_str =
bitfield_to_freq_string(bw_bitmask, bws.transfer_rate.num_supported);
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
assert(dev != nullptr);
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
@@ -2994,7 +2994,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
}
*num_devices = static_cast<uint32_t>(gpu_set.size());
if (gpu_set.size() > smi.monitor_devices().size()) {
if (gpu_set.size() > smi.devices().size()) {
return RSMI_STATUS_UNEXPECTED_SIZE;
}
@@ -3672,11 +3672,11 @@ rsmi_event_notification_get(int timeout_ms,
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
std::vector<uint32_t> fd_indx_to_dev_id;
for (uint32_t i = 0; i < smi.monitor_devices().size(); ++i) {
if (smi.monitor_devices()[i]->evt_notif_anon_fd() == -1) {
for (uint32_t i = 0; i < smi.devices().size(); ++i) {
if (smi.devices()[i]->evt_notif_anon_fd() == -1) {
continue;
}
fds.push_back({smi.monitor_devices()[i]->evt_notif_anon_fd(),
fds.push_back({smi.devices()[i]->evt_notif_anon_fd(),
POLLIN | POLLRDNORM, 0});
fd_indx_to_dev_id.push_back(i);
}
@@ -3694,7 +3694,7 @@ rsmi_event_notification_get(int timeout_ms,
}
FILE *anon_fp =
smi.monitor_devices()[fd_indx_to_dev_id[i]]->evt_notif_anon_file_ptr();
smi.devices()[fd_indx_to_dev_id[i]]->evt_notif_anon_file_ptr();
data_item =
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);
@@ -3754,7 +3754,7 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) {
return RSMI_STATUS_INVALID_ARGS;
}
// close(dev->evt_notif_anon_fd());
FILE *anon_fp = smi.monitor_devices()[dv_ind]->evt_notif_anon_file_ptr();
FILE *anon_fp = smi.devices()[dv_ind]->evt_notif_anon_file_ptr();
fclose(anon_fp);
assert(errno == 0 || errno == EAGAIN);
dev->set_evt_notif_anon_file_ptr(nullptr);
@@ -3800,7 +3800,7 @@ rsmi_test_refcount(uint64_t refcnt_type) {
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
std::lock_guard<std::mutex> guard(*smi.bootstrap_mutex());
if (smi.ref_count() == 0 && smi.monitor_devices().size() != 0) {
if (smi.ref_count() == 0 && smi.devices().size() != 0) {
return -1;
}
+2 -2
Wyświetl plik
@@ -175,8 +175,8 @@ Event::Event(rsmi_event_type_t event, uint32_t dev_ind) :
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
assert(dev_ind < smi.monitor_devices().size());
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dev_ind];
assert(dev_ind < smi.devices().size());
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dev_ind];
assert(dev != nullptr);
+1 -1
Wyświetl plik
@@ -709,7 +709,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
size_t num = fread(p_binary_data, b_size, 1, ptr);
fclose(ptr);
if((num*b_size) != b_size){
if ((num*b_size) != b_size) {
return ENOENT;
}
return 0;
+84 -115
Wyświetl plik
@@ -118,10 +118,6 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
return static_cast<uint32_t>(drm_minor);
}
static int SameDevice(const std::string fileA, const std::string fileB) {
return SameFile(fileA + "/device", fileB + "/device");
}
// Determine if provided string is a bdfid pci path directory of the form
// XXXX:XX:XX.X,
// domain:bus:device.function
@@ -217,28 +213,6 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
return 1;
}
// Call-back function to append to a vector of Devices
static uint32_t GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
void *p) {
std::string val_str;
uint64_t bdfid;
assert(p != nullptr);
std::vector<std::shared_ptr<amd::smi::Device>> *device_list =
reinterpret_cast<std::vector<std::shared_ptr<amd::smi::Device>> *>(p);
if (d->monitor() != nullptr) {
// Calculate BDFID and set for this device
if (ConstructBDFID(d->path(), &bdfid) != 0) {
std::cerr << "Failed to construct BDFID." << std::endl;
return 1;
}
d->set_bdfid(bdfid);
device_list->push_back(d);
}
return 0;
}
void
RocmSMI::Initialize(uint64_t flags) {
@@ -273,11 +247,14 @@ RocmSMI::Initialize(uint64_t flags) {
"DiscoverAmdgpuDevices() failed.");
}
// IterateSMIDevices will iterate through all the known devices and apply
// the provided call-back to each device found.
ret = IterateSMIDevices(GetMonitorDevices,
reinterpret_cast<void *>(&monitor_devices_));
uint64_t bdfid;
for (uint32_t i = 0; i < devices_.size(); ++i) {
if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) {
std::cerr << "Failed to construct BDFID." << std::endl;
ret = 1;
}
devices_[i]->set_bdfid(bdfid);
}
if (ret != 0) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"Failed to initialize rocm_smi library (amdgpu node discovery).");
@@ -305,11 +282,11 @@ RocmSMI::Initialize(uint64_t flags) {
// Remove any drm nodes that don't have a corresponding readable kfd node.
// kfd nodes will not be added if their properties file is not readable.
auto dev_iter = monitor_devices_.begin();
while (dev_iter != monitor_devices_.end()) {
auto dev_iter = devices_.begin();
while (dev_iter != devices_.end()) {
uint64_t bdfid = (*dev_iter)->bdfid();
if (tmp_map.find(bdfid) == tmp_map.end()) {
dev_iter = monitor_devices_.erase(dev_iter);
dev_iter = devices_.erase(dev_iter);
continue;
}
dev_iter++;
@@ -318,8 +295,8 @@ RocmSMI::Initialize(uint64_t flags) {
// 1. construct kfd_node_map_ with gpu_id as key and *Device as value
// 2. for each kfd node, write the corresponding dv_ind
// 3. for each amdgpu device, write the corresponding gpu_id
for (uint32_t dv_ind = 0; dv_ind < monitor_devices_.size(); ++dv_ind) {
dev = monitor_devices_[dv_ind];
for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) {
dev = devices_[dv_ind];
uint64_t bdfid = dev->bdfid();
assert(tmp_map.find(bdfid) != tmp_map.end());
if (tmp_map.find(bdfid) == tmp_map.end()) {
@@ -337,7 +314,6 @@ RocmSMI::Initialize(uint64_t flags) {
void
RocmSMI::Cleanup() {
monitor_devices_.clear();
devices_.clear();
monitors_.clear();
@@ -399,30 +375,80 @@ void RocmSMI::GetEnvVariables(void) {
#endif
}
std::shared_ptr<Monitor>
RocmSMI::FindMonitor(std::string monitor_path) {
std::string tmp;
std::string err_msg;
std::string mon_name;
std::shared_ptr<Monitor> m;
if (!FileExists(monitor_path.c_str())) {
return nullptr;
}
auto mon_dir = opendir(monitor_path.c_str());
if (mon_dir == nullptr) {
return nullptr;
}
auto dentry = readdir(mon_dir);
while (dentry != nullptr) {
if (dentry->d_name[0] == '.') {
dentry = readdir(mon_dir);
continue;
}
mon_name = monitor_path;
mon_name += "/";
mon_name += dentry->d_name;
tmp = mon_name + "/name";
if (FileExists(tmp.c_str())) {
std::ifstream fs;
fs.open(tmp);
if (!fs.is_open()) {
err_msg = "Failed to open monitor file ";
err_msg += tmp;
err_msg += ".";
perror(err_msg.c_str());
return nullptr;
}
std::string mon_type;
fs >> mon_type;
fs.close();
if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) {
m = std::shared_ptr<Monitor>(new Monitor(mon_name, &env_vars_));
m->setTempSensorLabelMap();
m->setVoltSensorLabelMap();
break;
}
}
dentry = readdir(mon_dir);
}
if (closedir(mon_dir)) {
err_msg = "Failed to close monitor directory ";
err_msg += kPathHWMonRoot;
err_msg += ".";
perror(err_msg.c_str());
return nullptr;
}
return m;
}
void
RocmSMI::AddToDeviceList(std::string dev_name) {
auto ret = 0;
auto dev_path = std::string(kPathDRMRoot);
dev_path += "/";
dev_path += dev_name;
auto dev = std::shared_ptr<Device>(new Device(dev_path, &env_vars_));
auto m = monitors_.begin();
while (m != monitors_.end()) {
ret = SameDevice(dev->path(), (*m)->path());
if (ret == 0) {
dev->set_monitor(*m);
m = monitors_.erase(m);
break;
} else {
assert(ret == 1);
++m;
}
}
std::shared_ptr<Monitor> m = FindMonitor(dev_path + "/device/hwmon");
dev->set_monitor(m);
std::string d_name = dev_name;
uint32_t card_indx = GetDeviceIndex(d_name);
@@ -439,10 +465,15 @@ static const uint32_t kAmdGpuId = 0x1002;
static bool isAMDGPU(std::string dev_path) {
std::string vend_path = dev_path + "/device/vendor";
std::string vbios_v_path = dev_path + "/device/vbios_version";
if (!FileExists(vend_path.c_str())) {
return false;
}
if (!FileExists(vbios_v_path.c_str())) {
return false;
}
std::ifstream fs;
fs.open(vend_path);
@@ -463,7 +494,6 @@ static bool isAMDGPU(std::string dev_path) {
}
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
uint32_t ret = 0;
std::string err_msg;
uint32_t count = 0;
@@ -471,12 +501,6 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
devices_.clear();
monitors_.clear();
ret = DiscoverAMDMonitors();
if (ret) {
return ret;
}
auto drm_dir = opendir(kPathDRMRoot);
if (drm_dir == nullptr) {
err_msg = "Failed to open drm root directory ";
@@ -521,61 +545,6 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
return 0;
}
uint32_t RocmSMI::DiscoverAMDMonitors(void) {
auto mon_dir = opendir(kPathHWMonRoot);
auto dentry = readdir(mon_dir);
std::string mon_name;
std::string tmp;
std::string err_msg;
while (dentry != nullptr) {
if (dentry->d_name[0] == '.') {
dentry = readdir(mon_dir);
continue;
}
mon_name = kPathHWMonRoot;
mon_name += "/";
mon_name += dentry->d_name;
tmp = mon_name + "/name";
if (FileExists(tmp.c_str())) {
std::ifstream fs;
fs.open(tmp);
if (!fs.is_open()) {
err_msg = "Failed to open monitor file ";
err_msg += tmp;
err_msg += ".";
perror(err_msg.c_str());
return 1;
}
std::string mon_type;
fs >> mon_type;
fs.close();
if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) {
std::shared_ptr<Monitor> m =
std::shared_ptr<Monitor>(new Monitor(mon_name, &env_vars_));
m->setTempSensorLabelMap();
m->setVoltSensorLabelMap();
monitors_.push_back(m);
}
}
dentry = readdir(mon_dir);
}
if (closedir(mon_dir)) {
err_msg = "Failed to close monitor directory ";
err_msg += kPathHWMonRoot;
err_msg += ".";
perror(err_msg.c_str());
return 1;
}
return 0;
}
// Since these sysfs files require sudo access, we won't discover them
// with rsmi_init() (and thus always require the user to use "sudo".
+2 -2
Wyświetl plik
@@ -183,10 +183,10 @@ rsmi_status_t handleException() {
pthread_mutex_t *GetMutex(uint32_t dv_ind) {
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
if (dv_ind >= smi.monitor_devices().size()) {
if (dv_ind >= smi.devices().size()) {
return nullptr;
}
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind];
std::shared_ptr<amd::smi::Device> dev = smi.devices()[dv_ind];
assert(dev != nullptr);
return dev->mutex();