From ff9546aa62f2cbfbfe75dc30cad0c0f85a5198ca Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Thu, 28 Jan 2021 18:25:53 -0600 Subject: [PATCH] Don't use hwmon# as indicator of gpu Previously, during the rsmi_init discovery process, the existence of an hwmon# directory was used to distinguish between gpus nodes and non-gpu nodes. This isn't reliable in some scenarios. Instead, the existence of the vbios_version file is used as an indicator that the node is indeed a gpu. Change-Id: Icfbe5c42ed0970077b05f25c3d209308a31bec85 --- include/rocm_smi/rocm_smi_common.h | 4 +- include/rocm_smi/rocm_smi_main.h | 7 +- src/rocm_smi.cc | 34 ++--- src/rocm_smi_counters.cc | 4 +- src/rocm_smi_device.cc | 2 +- src/rocm_smi_main.cc | 199 ++++++++++++----------------- src/rocm_smi_utils.cc | 4 +- 7 files changed, 111 insertions(+), 143 deletions(-) diff --git a/include/rocm_smi/rocm_smi_common.h b/include/rocm_smi/rocm_smi_common.h index c488e595f6..98a2df5c62 100755 --- a/include/rocm_smi/rocm_smi_common.h +++ b/include/rocm_smi/rocm_smi_common.h @@ -52,13 +52,13 @@ #define CHECK_DV_IND_RANGE \ amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \ - if (dv_ind >= smi.monitor_devices().size()) { \ + if (dv_ind >= smi.devices().size()) { \ return RSMI_STATUS_INVALID_ARGS; \ } \ #define GET_DEV_FROM_INDX \ CHECK_DV_IND_RANGE \ - std::shared_ptr dev = smi.monitor_devices()[dv_ind]; \ + std::shared_ptr dev = smi.devices()[dv_ind]; \ assert(dev != nullptr); diff --git a/include/rocm_smi/rocm_smi_main.h b/include/rocm_smi/rocm_smi_main.h index b6ebd07c76..b8141b4568 100755 --- a/include/rocm_smi/rocm_smi_main.h +++ b/include/rocm_smi/rocm_smi_main.h @@ -76,7 +76,8 @@ class RocmSMI { void Cleanup(void); std::vector>& - monitor_devices() {return monitor_devices_;} + devices() {return devices_;} + uint32_t DiscoverAmdgpuDevices(void); int DiscoverAMDPowerMonitors(bool force_update = false); @@ -123,9 +124,7 @@ class RocmSMI { std::map dev_ind_to_node_ind_map_; void AddToDeviceList(std::string dev_name); void GetEnvVariables(void); - uint32_t DiscoverAMDMonitors(void); - - std::vector> monitor_devices_; + std::shared_ptr FindMonitor(std::string monitor_path); RocmSMI_env_vars env_vars_; uint64_t init_options_; diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index b2d11ff425..d843127f7a 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -391,7 +391,7 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type, uint32_t dv_ind, uint64_t *val) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); - if (dv_ind >= smi.monitor_devices().size() || val == nullptr) { + if (dv_ind >= smi.devices().size() || val == nullptr) { return RSMI_STATUS_INVALID_ARGS; } @@ -400,7 +400,7 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type, return amd::smi::ErrnoToRsmiStatus(ret); } - std::shared_ptr dev = smi.monitor_devices()[dv_ind]; + std::shared_ptr dev = smi.devices()[dv_ind]; assert(dev != nullptr); assert(dev->monitor() != nullptr); @@ -462,9 +462,9 @@ rsmi_shut_down(void) { #if DEBUG int ret = 0; #endif - for (uint32_t i = 0; i < smi.monitor_devices().size(); ++i) { + for (uint32_t i = 0; i < smi.devices().size(); ++i) { #if DEBUG - ret = pthread_mutex_unlock(smi.monitor_devices()[i]->mutex()); + ret = pthread_mutex_unlock(smi.devices()[i]->mutex()); if (ret != EPERM) { // We expect to get EPERM if the lock has already // been released if (ret == 0) { @@ -476,7 +476,7 @@ rsmi_shut_down(void) { } } #else - (void)pthread_mutex_unlock(smi.monitor_devices()[i]->mutex()); + (void)pthread_mutex_unlock(smi.devices()[i]->mutex()); #endif } @@ -499,7 +499,7 @@ rsmi_num_monitor_devices(uint32_t *num_devices) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); - *num_devices = static_cast(smi.monitor_devices().size()); + *num_devices = static_cast(smi.devices().size()); return RSMI_STATUS_SUCCESS; CATCH } @@ -1458,13 +1458,13 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, // Above call to rsmi_dev_get_gpu_clk_freq should have emitted an error if // assert below is not true - assert(dv_ind < smi.monitor_devices().size()); + assert(dv_ind < smi.devices().size()); std::string freq_enable_str = bitfield_to_freq_string(freq_bitmask, freqs.num_supported); - std::shared_ptr dev = smi.monitor_devices()[dv_ind]; + std::shared_ptr dev = smi.devices()[dv_ind]; assert(dev != nullptr); ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL); @@ -1912,12 +1912,12 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) { // Above call to rsmi_dev_pci_bandwidth_get() should have emitted an error // if assert below is not true - assert(dv_ind < smi.monitor_devices().size()); + assert(dv_ind < smi.devices().size()); std::string freq_enable_str = bitfield_to_freq_string(bw_bitmask, bws.transfer_rate.num_supported); - std::shared_ptr dev = smi.monitor_devices()[dv_ind]; + std::shared_ptr dev = smi.devices()[dv_ind]; assert(dev != nullptr); ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL); @@ -2994,7 +2994,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, } *num_devices = static_cast(gpu_set.size()); - if (gpu_set.size() > smi.monitor_devices().size()) { + if (gpu_set.size() > smi.devices().size()) { return RSMI_STATUS_UNEXPECTED_SIZE; } @@ -3672,11 +3672,11 @@ rsmi_event_notification_get(int timeout_ms, amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); std::vector fd_indx_to_dev_id; - for (uint32_t i = 0; i < smi.monitor_devices().size(); ++i) { - if (smi.monitor_devices()[i]->evt_notif_anon_fd() == -1) { + for (uint32_t i = 0; i < smi.devices().size(); ++i) { + if (smi.devices()[i]->evt_notif_anon_fd() == -1) { continue; } - fds.push_back({smi.monitor_devices()[i]->evt_notif_anon_fd(), + fds.push_back({smi.devices()[i]->evt_notif_anon_fd(), POLLIN | POLLRDNORM, 0}); fd_indx_to_dev_id.push_back(i); } @@ -3694,7 +3694,7 @@ rsmi_event_notification_get(int timeout_ms, } FILE *anon_fp = - smi.monitor_devices()[fd_indx_to_dev_id[i]]->evt_notif_anon_file_ptr(); + smi.devices()[fd_indx_to_dev_id[i]]->evt_notif_anon_file_ptr(); data_item = reinterpret_cast(&data[*num_elem]); @@ -3754,7 +3754,7 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { return RSMI_STATUS_INVALID_ARGS; } // close(dev->evt_notif_anon_fd()); - FILE *anon_fp = smi.monitor_devices()[dv_ind]->evt_notif_anon_file_ptr(); + FILE *anon_fp = smi.devices()[dv_ind]->evt_notif_anon_file_ptr(); fclose(anon_fp); assert(errno == 0 || errno == EAGAIN); dev->set_evt_notif_anon_file_ptr(nullptr); @@ -3800,7 +3800,7 @@ rsmi_test_refcount(uint64_t refcnt_type) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); std::lock_guard guard(*smi.bootstrap_mutex()); - if (smi.ref_count() == 0 && smi.monitor_devices().size() != 0) { + if (smi.ref_count() == 0 && smi.devices().size() != 0) { return -1; } diff --git a/src/rocm_smi_counters.cc b/src/rocm_smi_counters.cc index 60ab72c58b..cb58cc3d32 100755 --- a/src/rocm_smi_counters.cc +++ b/src/rocm_smi_counters.cc @@ -175,8 +175,8 @@ Event::Event(rsmi_event_type_t event, uint32_t dev_ind) : amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); - assert(dev_ind < smi.monitor_devices().size()); - std::shared_ptr dev = smi.monitor_devices()[dev_ind]; + assert(dev_ind < smi.devices().size()); + std::shared_ptr dev = smi.devices()[dev_ind]; assert(dev != nullptr); diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 6b26660374..6d81e39a0d 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -709,7 +709,7 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, size_t num = fread(p_binary_data, b_size, 1, ptr); fclose(ptr); - if((num*b_size) != b_size){ + if ((num*b_size) != b_size) { return ENOENT; } return 0; diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index f1a3ec95cb..d760d55171 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -118,10 +118,6 @@ static uint32_t GetDrmRenderMinor(const std::string s) { return static_cast(drm_minor); } -static int SameDevice(const std::string fileA, const std::string fileB) { - return SameFile(fileA + "/device", fileB + "/device"); -} - // Determine if provided string is a bdfid pci path directory of the form // XXXX:XX:XX.X, // domain:bus:device.function @@ -217,28 +213,6 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { return 1; } -// Call-back function to append to a vector of Devices -static uint32_t GetMonitorDevices(const std::shared_ptr &d, - void *p) { - std::string val_str; - uint64_t bdfid; - - assert(p != nullptr); - - std::vector> *device_list = - reinterpret_cast> *>(p); - - if (d->monitor() != nullptr) { - // Calculate BDFID and set for this device - if (ConstructBDFID(d->path(), &bdfid) != 0) { - std::cerr << "Failed to construct BDFID." << std::endl; - return 1; - } - d->set_bdfid(bdfid); - device_list->push_back(d); - } - return 0; -} void RocmSMI::Initialize(uint64_t flags) { @@ -273,11 +247,14 @@ RocmSMI::Initialize(uint64_t flags) { "DiscoverAmdgpuDevices() failed."); } - // IterateSMIDevices will iterate through all the known devices and apply - // the provided call-back to each device found. - ret = IterateSMIDevices(GetMonitorDevices, - reinterpret_cast(&monitor_devices_)); - + uint64_t bdfid; + for (uint32_t i = 0; i < devices_.size(); ++i) { + if (ConstructBDFID(devices_[i]->path(), &bdfid) != 0) { + std::cerr << "Failed to construct BDFID." << std::endl; + ret = 1; + } + devices_[i]->set_bdfid(bdfid); + } if (ret != 0) { throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, "Failed to initialize rocm_smi library (amdgpu node discovery)."); @@ -305,11 +282,11 @@ RocmSMI::Initialize(uint64_t flags) { // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. - auto dev_iter = monitor_devices_.begin(); - while (dev_iter != monitor_devices_.end()) { + auto dev_iter = devices_.begin(); + while (dev_iter != devices_.end()) { uint64_t bdfid = (*dev_iter)->bdfid(); if (tmp_map.find(bdfid) == tmp_map.end()) { - dev_iter = monitor_devices_.erase(dev_iter); + dev_iter = devices_.erase(dev_iter); continue; } dev_iter++; @@ -318,8 +295,8 @@ RocmSMI::Initialize(uint64_t flags) { // 1. construct kfd_node_map_ with gpu_id as key and *Device as value // 2. for each kfd node, write the corresponding dv_ind // 3. for each amdgpu device, write the corresponding gpu_id - for (uint32_t dv_ind = 0; dv_ind < monitor_devices_.size(); ++dv_ind) { - dev = monitor_devices_[dv_ind]; + for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) { + dev = devices_[dv_ind]; uint64_t bdfid = dev->bdfid(); assert(tmp_map.find(bdfid) != tmp_map.end()); if (tmp_map.find(bdfid) == tmp_map.end()) { @@ -337,7 +314,6 @@ RocmSMI::Initialize(uint64_t flags) { void RocmSMI::Cleanup() { - monitor_devices_.clear(); devices_.clear(); monitors_.clear(); @@ -399,30 +375,80 @@ void RocmSMI::GetEnvVariables(void) { #endif } +std::shared_ptr +RocmSMI::FindMonitor(std::string monitor_path) { + std::string tmp; + std::string err_msg; + std::string mon_name; + std::shared_ptr m; + + if (!FileExists(monitor_path.c_str())) { + return nullptr; + } + + auto mon_dir = opendir(monitor_path.c_str()); + + if (mon_dir == nullptr) { + return nullptr; + } + auto dentry = readdir(mon_dir); + + while (dentry != nullptr) { + if (dentry->d_name[0] == '.') { + dentry = readdir(mon_dir); + continue; + } + + mon_name = monitor_path; + mon_name += "/"; + mon_name += dentry->d_name; + tmp = mon_name + "/name"; + + if (FileExists(tmp.c_str())) { + std::ifstream fs; + fs.open(tmp); + + if (!fs.is_open()) { + err_msg = "Failed to open monitor file "; + err_msg += tmp; + err_msg += "."; + perror(err_msg.c_str()); + return nullptr; + } + std::string mon_type; + fs >> mon_type; + fs.close(); + + if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) { + m = std::shared_ptr(new Monitor(mon_name, &env_vars_)); + m->setTempSensorLabelMap(); + m->setVoltSensorLabelMap(); + break; + } + } + dentry = readdir(mon_dir); + } + + if (closedir(mon_dir)) { + err_msg = "Failed to close monitor directory "; + err_msg += kPathHWMonRoot; + err_msg += "."; + perror(err_msg.c_str()); + return nullptr; + } + + return m; +} void RocmSMI::AddToDeviceList(std::string dev_name) { - auto ret = 0; - auto dev_path = std::string(kPathDRMRoot); dev_path += "/"; dev_path += dev_name; auto dev = std::shared_ptr(new Device(dev_path, &env_vars_)); - auto m = monitors_.begin(); - - while (m != monitors_.end()) { - ret = SameDevice(dev->path(), (*m)->path()); - - if (ret == 0) { - dev->set_monitor(*m); - m = monitors_.erase(m); - break; - } else { - assert(ret == 1); - ++m; - } - } + std::shared_ptr m = FindMonitor(dev_path + "/device/hwmon"); + dev->set_monitor(m); std::string d_name = dev_name; uint32_t card_indx = GetDeviceIndex(d_name); @@ -439,10 +465,15 @@ static const uint32_t kAmdGpuId = 0x1002; static bool isAMDGPU(std::string dev_path) { std::string vend_path = dev_path + "/device/vendor"; + std::string vbios_v_path = dev_path + "/device/vbios_version"; if (!FileExists(vend_path.c_str())) { return false; } + if (!FileExists(vbios_v_path.c_str())) { + return false; + } + std::ifstream fs; fs.open(vend_path); @@ -463,7 +494,6 @@ static bool isAMDGPU(std::string dev_path) { } uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { - uint32_t ret = 0; std::string err_msg; uint32_t count = 0; @@ -471,12 +501,6 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { devices_.clear(); monitors_.clear(); - ret = DiscoverAMDMonitors(); - - if (ret) { - return ret; - } - auto drm_dir = opendir(kPathDRMRoot); if (drm_dir == nullptr) { err_msg = "Failed to open drm root directory "; @@ -521,61 +545,6 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { return 0; } -uint32_t RocmSMI::DiscoverAMDMonitors(void) { - auto mon_dir = opendir(kPathHWMonRoot); - - auto dentry = readdir(mon_dir); - - std::string mon_name; - std::string tmp; - std::string err_msg; - - while (dentry != nullptr) { - if (dentry->d_name[0] == '.') { - dentry = readdir(mon_dir); - continue; - } - - mon_name = kPathHWMonRoot; - mon_name += "/"; - mon_name += dentry->d_name; - tmp = mon_name + "/name"; - - if (FileExists(tmp.c_str())) { - std::ifstream fs; - fs.open(tmp); - - if (!fs.is_open()) { - err_msg = "Failed to open monitor file "; - err_msg += tmp; - err_msg += "."; - perror(err_msg.c_str()); - return 1; - } - std::string mon_type; - fs >> mon_type; - fs.close(); - - if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) { - std::shared_ptr m = - std::shared_ptr(new Monitor(mon_name, &env_vars_)); - m->setTempSensorLabelMap(); - m->setVoltSensorLabelMap(); - monitors_.push_back(m); - } - } - dentry = readdir(mon_dir); - } - - if (closedir(mon_dir)) { - err_msg = "Failed to close monitor directory "; - err_msg += kPathHWMonRoot; - err_msg += "."; - perror(err_msg.c_str()); - return 1; - } - return 0; -} // Since these sysfs files require sudo access, we won't discover them // with rsmi_init() (and thus always require the user to use "sudo". diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 0044f22b0c..473d1e2b67 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -183,10 +183,10 @@ rsmi_status_t handleException() { pthread_mutex_t *GetMutex(uint32_t dv_ind) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); - if (dv_ind >= smi.monitor_devices().size()) { + if (dv_ind >= smi.devices().size()) { return nullptr; } - std::shared_ptr dev = smi.monitor_devices()[dv_ind]; + std::shared_ptr dev = smi.devices()[dv_ind]; assert(dev != nullptr); return dev->mutex();