From 95d3da04b95a5ad8cd22ddb4efb359e313efa990 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Sat, 15 Feb 2020 20:39:07 -0600 Subject: [PATCH] Add rsmi_compute_process_gpus_get() Given a process ID, give the device indices that process is currently using. Also: * made corrections to how RSMI, amdgpu (ie, "card#") and KFD indicies translate from one another * add a few missing error codes to rsmi_status_string() * fix some formatting Change-Id: Icd2cae66bb4fec768da96af7cf9cf8b8b66ec7f9 [ROCm/rocm_smi_lib commit: 2d6e15190c7c45f191aa1c80234b0bcd15bf89e9] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 48 ++- .../include/rocm_smi/rocm_smi_device.h | 14 +- .../include/rocm_smi/rocm_smi_kfd.h | 32 ++ .../include/rocm_smi/rocm_smi_main.h | 11 +- projects/rocm-smi-lib/src/rocm_smi.cc | 96 ++++- .../rocm-smi-lib/src/rocm_smi_counters.cc | 2 +- projects/rocm-smi-lib/src/rocm_smi_device.cc | 47 --- projects/rocm-smi-lib/src/rocm_smi_kfd.cc | 378 +++++++++++++++++- projects/rocm-smi-lib/src/rocm_smi_main.cc | 46 ++- projects/rocm-smi-lib/src/rocm_smi_monitor.cc | 5 + .../rocm_smi_test/functional/id_info_read.cc | 3 +- .../functional/process_info_read.cc | 31 +- 12 files changed, 604 insertions(+), 109 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 8f6cae797d..c906cdbd8c 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -2301,7 +2301,7 @@ rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group); * * @param[in] dv_ind a device index * - * @param[in] type the type of performance event to create + * @param[in] type the ::rsmi_event_type_t of performance event to create * * @param[inout] evnt_handle A pointer to a ::rsmi_event_handle_t which will be * associated with a newly allocated counter @@ -2448,24 +2448,64 @@ rsmi_compute_process_info_get(rsmi_process_info_t *procs, uint32_t *num_items); /** 
* @brief Get process information about a specific process * - * @details Given a pointer to an ::rsmi_process_info_t @p proc and a process id + * @details Given a pointer to an ::rsmi_process_info_t @p proc and a process + * id * @p pid, this function will write the process information for @p pid, if * available, to the memory pointed to by @p proc. * - * @param[in] pid The process ID for which process information is being requested + * @param[in] pid The process ID for which process information is being + * requested * * @param[inout] proc a pointer to a ::rsmi_process_info_t to which * process information for @p pid will be written if it is found. * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid - * @retval ::RSMI_STATUS_NOT_FOUND is returned if there was no process information + * @retval ::RSMI_STATUS_NOT_FOUND is returned if there was no process + * information * found for the provided @p pid * */ rsmi_status_t rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_process_info_t *proc); +/** + * @brief Get the device indices currently being used by a process + * + * @details Given a process id @p pid, a non-NULL pointer to an array of + * uint32_t's @p dv_indices of length *@p num_devices, this function will + * write up to @p num_devices device indices to the memory pointed to by + * @p dv_indices. If @p dv_indices is not NULL, @p num_devices will be + * updated with the number of gpus currently being used by process @p pid. + * If @p dv_indices is NULL, @p num_devices will be updated with the number of + * gpus currently being used by @p pid. Calling this function with @p + * dv_indices being NULL is a way to determine how much memory is required + * for when @p dv_indices is not NULL. 
+ * + * @param[in] pid The process id of the process for which the number of gpus + * currently being used is requested + * + * @param[inout] dv_indices a pointer to memory provided by the caller to + * which indices of devices currently being used by the process will be + * written. This may be NULL in which case only @p num_devices will be + * updated with the number of devices being used. + * + * @param[inout] num_devices A pointer to a uint32_t, which on input, should + * contain the amount of memory in uint32_t's which have been provided by the + * @p dv_indices argument. On output, if @p dv_indices is non-NULL, this will + * be updated with the number of uint32_t's actually written. If @p dv_indices + * is NULL, this argument will be updated with the number of devices being used. + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if there were more + * gpu indices that could have been written, but not enough space was + * provided as indicated by @p dv_indices and @p num_devices, on input. 
+ * + */ +rsmi_status_t +rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, + uint32_t *num_devices); /** @} */ // end of SysInfo diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index 42e13c2ca6..3d02422d55 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -178,11 +178,9 @@ class Device { int readDevInfo(DevInfoTypes type, std::vector *retVec); int writeDevInfo(DevInfoTypes type, uint64_t val); int writeDevInfo(DevInfoTypes type, std::string val); - int populateKFDNodeProperties(bool force_update = false); - int getKFDNodeProperty(DevKFDNodePropTypes prop, uint64_t *val); - uint32_t index(void) const {return index_;} - void set_index(uint32_t index) {index_ = index;} + uint32_t index(void) const {return card_indx_;} + void set_card_index(uint32_t index) {card_indx_ = index;} uint32_t drm_render_minor(void) const {return drm_render_minor_;} void set_drm_render_minor(uint32_t minor) {drm_render_minor_ = minor;} static rsmi_dev_perf_level perfLvlStrToEnum(std::string s); @@ -192,6 +190,8 @@ class Device { evt::dev_evt_grp_set_t* supported_event_groups(void) { return &supported_event_groups_;} SupportedFuncMap *supported_funcs(void) {return &supported_funcs_;} + uint64_t kfd_gpu_id(void) const {return kfd_gpu_id_;} + void set_kfd_gpu_id(uint64_t id) {kfd_gpu_id_ = id;} void fillSupportedFuncs(void); void DumpSupportedFunctions(void); bool DeviceAPISupported(std::string name, uint64_t variant, @@ -202,20 +202,20 @@ class Device { std::shared_ptr power_monitor_; std::string path_; shared_mutex_t mutex_; - uint32_t index_; + uint32_t card_indx_; // This index corresponds to the drm index (ie, card#) uint32_t drm_render_minor_; const RocmSMI_env_vars *env_; template int openSysfsFileStream(DevInfoTypes type, T *fs, const char *str = nullptr); - int readDevInfoStr(DevInfoTypes type, std::string 
*retStr); int readDevInfoMultiLineStr(DevInfoTypes type, std::vector *retVec); int writeDevInfoStr(DevInfoTypes type, std::string valStr); uint64_t bdfid_; + uint64_t kfd_gpu_id_; std::unordered_set supported_event_groups_; - std::map kfdNodePropMap_; + // std::map kfdNodePropMap_; SupportedFuncMap supported_funcs_; }; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h index 0ae429d5a7..1301b6729c 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h @@ -45,18 +45,50 @@ #include #include +#include +#include +#include #include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_device.h" namespace amd { namespace smi { +class KFDNode { + public: + explicit KFDNode(uint32_t node_ind) : node_indx_(node_ind) {} + ~KFDNode(); + + int Initialize(); + int ReadProperties(void); + int get_property_value(std::string property, uint64_t *value); + uint64_t gpu_id(void) const {return gpu_id_;} + std::string name(void) const {return name_;} + std::shared_ptr amdgpu_device(void) const {return amdgpu_device_;} + uint32_t amdgpu_dev_index(void) const {return amdgpu_dev_index_;} + void set_amdgpu_dev_index(uint32_t val) {amdgpu_dev_index_ = val;} + + private: + uint32_t node_indx_; + uint32_t amdgpu_dev_index_; + uint64_t gpu_id_; + std::string name_; + std::map properties_; + std::shared_ptr amdgpu_device_; +}; + +int +DiscoverKFDNodes(std::map> *nodes); + int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, uint32_t *num_procs_found); int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc); +int +GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_count); int ReadKFDDeviceProperties(uint32_t dev_id, std::vector *retVec); diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index 284f41698f..e7bca3db88 100755 --- 
a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -51,7 +51,10 @@ #include #include #include +#include +#include +#include "rocm_smi/rocm_smi_kfd.h" #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_power_mon.h" @@ -71,7 +74,7 @@ class RocmSMI { static std::vector>& monitor_devices() {return s_monitor_devices;} - uint32_t DiscoverDevices(void); + uint32_t DiscoverAmdgpuDevices(void); uint32_t DiscoverAMDPowerMonitors(bool force_update = false); // Will execute "func" for every Device object known about, or until func @@ -84,17 +87,21 @@ class RocmSMI { uint32_t euid() const {return euid_;} + std::map> & kfd_node_map(void) { + return kfd_node_map_;} + private: std::vector> devices_; + std::map> kfd_node_map_; std::vector> monitors_; std::vector> power_mons_; - std::set amd_monitor_types_; void AddToDeviceList(std::string dev_name); void GetEnvVariables(void); uint32_t DiscoverAMDMonitors(void); static std::vector> s_monitor_devices; + RocmSMI_env_vars env_vars_; uint64_t init_options_; uint32_t euid_; diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index e6f6d4a95e..d1ab6b57ab 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,16 @@ static rsmi_status_t handleException() { std::shared_ptr dev = smi.monitor_devices()[dv_ind]; \ assert(dev != nullptr); + +#define GET_DEV_AND_KFDNODE_FROM_INDX \ + GET_DEV_FROM_INDX \ + std::shared_ptr kfd_node; \ + if (smi.kfd_node_map().find(dev->kfd_gpu_id()) == \ + smi.kfd_node_map().end()) { \ + return RSMI_INITIALIZATION_ERROR; \ + } \ + kfd_node = smi.kfd_node_map()[dev->kfd_gpu_id()]; + #define REQUIRE_ROOT_ACCESS \ if (amd::smi::RocmSMI::getInstance().euid()) { \ return RSMI_STATUS_PERMISSION; \ @@ -168,6 +179,7 @@ static 
rsmi_status_t errno_to_rsmi_status(uint32_t err) { case EISDIR: return RSMI_STATUS_FILE_ERROR; case EINTR: return RSMI_STATUS_INTERRUPT; case EIO: return RSMI_STATUS_UNEXPECTED_SIZE; + case ENXIO: return RSMI_STATUS_UNEXPECTED_DATA; default: return RSMI_STATUS_UNKNOWN_ERROR; } } @@ -208,7 +220,6 @@ static uint64_t get_multiplier_from_str(char units_char) { */ static uint64_t freq_string_to_int(const std::vector &freq_lines, bool *is_curr, uint32_t lanes[], uint32_t i) { - assert(i < freq_lines.size()); if (i >= freq_lines.size()) { throw amd::smi::rsmi_exception(RSMI_STATUS_INPUT_OUT_OF_BOUNDS, @@ -696,26 +707,15 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { TRY - CHK_SUPPORT_NAME_ONLY(bdfid) + GET_DEV_AND_KFDNODE_FROM_INDX + CHK_API_SUPPORT_ONLY(bdfid, RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) DEVICE_MUTEX *bdfid = dev->bdfid(); - int32_t ret = dev->populateKFDNodeProperties(); - - if (ret) { - return errno_to_rsmi_status(errno); - } - uint64_t domain = 0; - ret = dev->getKFDNodeProperty(amd::smi::kDevKFDNodePropDomain, &domain); - - if (ret == EINVAL) { - // "domain" is not found in properties file; just go with the 16 bit - // domain already found - return RSMI_STATUS_SUCCESS; - } + kfd_node->get_property_value("domain", &domain); // Replace the 16 bit domain originally set like this: // BDFID = (( & 0xffff) << 32) | (( & 0xff) << 8) | @@ -2329,15 +2329,30 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) { " the call"; break; + case RSMI_STATUS_INTERRUPT: + *status_string = "An interrupt occurred while executing the function"; + break; + + case RSMI_STATUS_UNEXPECTED_SIZE: + *status_string = "Data (usually from reading a file) was out of" + " range from what was expected"; + break; + + case RSMI_STATUS_NO_DATA: + *status_string = "No data was found (usually from reading a file) " + "where data was expected"; + break; + + case RSMI_STATUS_UNEXPECTED_DATA: + *status_string = "Data (usually from reading a file) 
was not of the " + "type that was expected"; + break; + case RSMI_STATUS_UNKNOWN_ERROR: *status_string = "An unknown error prevented the call from completing" " successfully"; break; - case RSMI_STATUS_INTERRUPT: - *status_string = "An interrupt occurred while executing the function"; - break; - default: *status_string = "An unknown error occurred"; return RSMI_STATUS_UNKNOWN_ERROR; @@ -2696,6 +2711,49 @@ rsmi_compute_process_info_get(rsmi_process_info_t *procs, CATCH } +rsmi_status_t +rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, + uint32_t *num_devices) { + TRY + + if (num_devices == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + std::unordered_set gpu_set; + int err = amd::smi::GetProcessGPUs(pid, &gpu_set); + + if (err) { + return errno_to_rsmi_status(err); + } + + uint32_t i = 0; + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + + if (dv_indices != nullptr) { + for (auto it = gpu_set.begin(); i < *num_devices && it != gpu_set.end(); + ++it, ++i) { + uint64_t gpu_id_val = (*it); + dv_indices[i] = smi.kfd_node_map()[gpu_id_val]->amdgpu_dev_index(); + } + } + + if (dv_indices && *num_devices < gpu_set.size()) { + // In this case, *num_devices should already hold the number of items + // written to dv_devices. We just have to let the caller know there's more. 
+ return RSMI_STATUS_INSUFFICIENT_SIZE; + } + + *num_devices = static_cast(gpu_set.size()); + if (gpu_set.size() > smi.monitor_devices().size()) { + return RSMI_STATUS_UNEXPECTED_SIZE; + } + + return RSMI_STATUS_SUCCESS; + + CATCH +} + rsmi_status_t rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages, rsmi_retired_page_record_t *records) { diff --git a/projects/rocm-smi-lib/src/rocm_smi_counters.cc b/projects/rocm-smi-lib/src/rocm_smi_counters.cc index b575741f7b..6541d956be 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_counters.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_counters.cc @@ -368,7 +368,7 @@ amd::smi::evt::Event::stopCounter(void) { return 0; } -static long +static ssize_t readn(int fd, void *buf, size_t n) { ssize_t left = n; ssize_t bytes; diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index 5a52548ad5..fa57a60fb8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -777,53 +777,6 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { return 0; } -int Device::populateKFDNodeProperties(bool force_update) { - int ret; - - std::vector propVec; - - if (kfdNodePropMap_.size() > 0 && !force_update) { - return 0; - } - - ret = ReadKFDDeviceProperties(index_, &propVec); - - if (ret) { - return ret; - } - - std::string key_str; - // std::string val_str; - uint64_t val_int; // Assume all properties are unsigned integers for now - std::istringstream fs; - - for (uint32_t i = 0; i < propVec.size(); ++i) { - fs.str(propVec[i]); - fs >> key_str; - fs >> val_int; - - kfdNodePropMap_[key_str] = val_int; - - fs.str(""); - fs.clear(); - } - - return 0; -} - -int Device::getKFDNodeProperty(DevKFDNodePropTypes prop, uint64_t *val) { - assert(val != nullptr); - assert(kDevKFDPropNameMap.find(prop) != kDevKFDPropNameMap.end()); - - const char *prop_name = kDevKFDPropNameMap.at(prop); - if (kfdNodePropMap_.find(prop_name) == 
kfdNodePropMap_.end()) { - return EINVAL; - } - - *val = kfdNodePropMap_.at(prop_name); - return 0; -} - void Device::DumpSupportedFunctions(void) { SupportedFuncMapIt func_iter = supported_funcs_.begin(); diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index ad3c6bfaea..2292d3ad82 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include @@ -57,6 +57,7 @@ #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_device.h" namespace amd { namespace smi { @@ -66,25 +67,75 @@ static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes"; // Sysfs file names static const char *kKFDPasidFName = "pasid"; + + +// KFD Node Property strings +// static const char *kKFDNodePropCPU_CORES_COUNTStr = "cpu_cores_count"; +// static const char *kKFDNodePropSIMD_COUNTStr = "simd_count"; +// static const char *kKFDNodePropMEM_BANKS_COUNTStr = "mem_banks_count"; +// static const char *kKFDNodePropCACHES_COUNTStr = "caches_count"; +// static const char *kKFDNodePropIO_LINKS_COUNTStr = "io_links_count"; +// static const char *kKFDNodePropCPU_CORE_ID_BASEStr = "cpu_core_id_base"; +// static const char *kKFDNodePropSIMD_ID_BASEStr = "simd_id_base"; +// static const char *kKFDNodePropMAX_WAVES_PER_SIMDStr = "max_waves_per_simd"; +// static const char *kKFDNodePropLDS_SIZE_IN_KBStr = "lds_size_in_kb"; +// static const char *kKFDNodePropGDS_SIZE_IN_KBStr = "gds_size_in_kb"; +// static const char *kKFDNodePropNUM_GWSStr = "num_gws"; +// static const char *kKFDNodePropWAVE_FRONT_SIZEStr = "wave_front_size"; +// static const char *kKFDNodePropARRAY_COUNTStr = "array_count"; +// static const char *kKFDNodePropSIMD_ARRAYS_PER_ENGINEStr = +// "simd_arrays_per_engine"; +// static const char *kKFDNodePropCU_PER_SIMD_ARRAYStr = 
"cu_per_simd_array"; +// static const char *kKFDNodePropSIMD_PER_CUStr = "simd_per_cu"; +// static const char *kKFDNodePropMAX_SLOTS_SCRATCH_CUStr = +// "max_slots_scratch_cu"; +// static const char *kKFDNodePropVENDOR_IDStr = "vendor_id"; +// static const char *kKFDNodePropDEVICE_IDStr = "device_id"; +static const char *kKFDNodePropLOCATION_IDStr = "location_id"; +static const char *kKFDNodePropDOMAINStr = "domain"; +// static const char *kKFDNodePropDRM_RENDER_MINORStr = "drm_render_minor"; +// static const char *kKFDNodePropHIVE_IDStr = "hive_id"; +// static const char *kKFDNodePropNUM_SDMA_ENGINESStr = "num_sdma_engines"; +// static const char *kKFDNodePropNUM_SDMA_XGMI_ENGINESStr = +// "num_sdma_xgmi_engines"; +// static const char *kKFDNodePropNUM_SDMA_QUEUES_PER_ENGINEStr = +// "num_sdma_queues_per_engine"; +// static const char *kKFDNodePropNUM_CP_QUEUESStr = "num_cp_queues"; +// static const char *kKFDNodePropMAX_ENGINE_CLK_FCOMPUTEStr = +// "max_engine_clk_fcompute"; +// static const char *kKFDNodePropLOCAL_MEM_SIZEStr = "local_mem_size"; +// static const char *kKFDNodePropFW_VERSIONStr = "fw_version"; +// static const char *kKFDNodePropCAPABILITYStr = "capability"; +// static const char *kKFDNodePropDEBUG_PROPStr = "debug_prop"; +// static const char *kKFDNodePropSDMA_FW_VERSIOStr = "sdma_fw_versio"; +// static const char *kKFDNodePropMAX_ENGINE_CLK_CCOMPUTEStr = +// "max_engine_clk_ccompute"; + static bool is_number(const std::string &s) { return !s.empty() && std::all_of(s.begin(), s.end(), ::isdigit); } -int ReadKFDDeviceProperties(uint32_t dev_id, - std::vector *retVec) { +static std::string KFDDevicePath(uint32_t dev_id) { + std::string node_path = kKFDNodesPathRoot; + node_path += '/'; + node_path += std::to_string(dev_id); + return node_path; +} + +static int OpenKFDNodeFile(uint32_t dev_id, std::string node_file, + std::ifstream *fs) { std::string line; int ret; - std::ifstream fs; - std::string properties_path = kKFDNodesPathRoot; + std::string 
f_path; bool reg_file; - assert(retVec != nullptr); + assert(fs != nullptr); - properties_path += '/'; - properties_path += std::to_string(dev_id); - properties_path += "/properties"; + f_path = KFDDevicePath(dev_id); + f_path += "/"; + f_path += node_file; - ret = isRegularFile(properties_path, ®_file); + ret = isRegularFile(f_path, ®_file); if (ret != 0) { return ret; @@ -93,26 +144,104 @@ int ReadKFDDeviceProperties(uint32_t dev_id, return ENOENT; } - fs.open(properties_path); + fs->open(f_path); - if (!fs.is_open()) { + if (!fs->is_open()) { return errno; } + return 0; +} + +int ReadKFDDeviceProperties(uint32_t kfd_node_id, + std::vector *retVec) { + std::string line; + int ret; + std::ifstream fs; + std::string properties_path; + + assert(retVec != nullptr); + + ret = OpenKFDNodeFile(kfd_node_id, "properties", &fs); + + if (ret) { + return ret; + } + while (std::getline(fs, line)) { retVec->push_back(line); } if (retVec->size() == 0) { + fs.close(); return 0; } // Remove any *trailing* empty (whitespace) lines while (retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) { retVec->pop_back(); } + + fs.close(); return 0; } +static int ReadKFDGpuId(uint32_t kfd_node_id, uint64_t *gpu_id) { + std::string line; + int ret; + std::ifstream fs; + std::string gpu_id_str; + + assert(gpu_id != nullptr); + + ret = OpenKFDNodeFile(kfd_node_id, "gpu_id", &fs); + + if (ret) { + fs.close(); + return ret; + } + + std::stringstream ss; + ss << fs.rdbuf(); + fs.close(); + + gpu_id_str = ss.str(); + + gpu_id_str.erase(std::remove(gpu_id_str.begin(), gpu_id_str.end(), '\n'), + gpu_id_str.end()); + + if (!is_number(gpu_id_str)) { + return ENXIO; + } + + *gpu_id = std::stoi(gpu_id_str); + return 0; +} + +static int ReadKFDGpuName(uint32_t kfd_node_id, std::string *gpu_name) { + std::string line; + int ret; + std::ifstream fs; + + assert(gpu_name != nullptr); + + ret = OpenKFDNodeFile(kfd_node_id, "name", &fs); + + if (ret) { + fs.close(); + return ret; + } + + 
std::stringstream ss; + ss << fs.rdbuf(); + fs.close(); + + *gpu_name = ss.str(); + + gpu_name->erase(std::remove(gpu_name->begin(), gpu_name->end(), '\n'), + gpu_name->end()); + + return 0; +} int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, uint32_t *num_procs_found) { @@ -128,7 +257,7 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, } auto dentry = readdir(proc_dir); - std::string prod_id_str; + std::string proc_id_str; std::string tmp; while (dentry != nullptr) { @@ -137,29 +266,32 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, continue; } - prod_id_str = dentry->d_name; - assert(is_number(prod_id_str) && "Unexpected file name in kfd/proc dir"); - if (!is_number(prod_id_str)) { + proc_id_str = dentry->d_name; + assert(is_number(proc_id_str) && "Unexpected file name in kfd/proc dir"); + if (!is_number(proc_id_str)) { + dentry = readdir(proc_dir); continue; } if (procs && *num_procs_found < num_allocated) { int err; std::string tmp; - procs[*num_procs_found].process_id = std::stoi(prod_id_str); + procs[*num_procs_found].process_id = std::stoi(proc_id_str); std::string pasid_str_path = kKFDProcPathRoot; pasid_str_path += "/"; - pasid_str_path += prod_id_str; + pasid_str_path += proc_id_str; pasid_str_path += "/"; pasid_str_path += kKFDPasidFName; err = ReadSysfsStr(pasid_str_path, &tmp); if (err) { - return err; + dentry = readdir(proc_dir); + continue; } assert(is_number(tmp) && "Unexpected value in pasid file"); if (!is_number(tmp)) { + closedir(proc_dir); return EINVAL; } procs[*num_procs_found].pasid = std::stoi(tmp); @@ -176,6 +308,81 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated, return 0; } +// Read the gpuid files found in all the dirs and put them in +// gpus_found. 
+// Directory structure: +// /sys/class/kfd/kfd/proc//queues//gpuid + +int GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_set) { + int err; + + assert(gpu_set != nullptr); + if (gpu_set == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + errno = 0; + + std::string queues_dir = kKFDProcPathRoot; + queues_dir += "/"; + queues_dir += std::to_string(pid); + queues_dir += "/queues"; + + auto queues_dir_hd = opendir(queues_dir.c_str()); + + if (queues_dir_hd == nullptr) { + std::string err_str = "Unable to open queues directory for process "; + err_str += std::to_string(pid); + perror(err_str.c_str()); + return ESRCH; + } + + auto q_dentry = readdir(queues_dir_hd); + + std::string q_gpu_id_str; + std::string q_dir; + + std::string tmp; + + while (q_dentry != nullptr) { + if (q_dentry->d_name[0] == '.') { + q_dentry = readdir(queues_dir_hd); + continue; + } + + if (!is_number(q_dentry->d_name)) { + q_dentry = readdir(queues_dir_hd); + continue; + } + + q_gpu_id_str = queues_dir + '/' + q_dentry->d_name + "/gpuid"; + + err = ReadSysfsStr(q_gpu_id_str, &tmp); + if (err) { + q_dentry = readdir(queues_dir_hd); + continue; + } + + uint64_t val; + try { + val = std::stoi(tmp); + } catch (...) 
{ + std::cerr << "Error; read invalid data: " << tmp << " from " << + q_gpu_id_str << std::endl; + closedir(queues_dir_hd); + return ENXIO; // Return "no such device" if we read an invalid gpu id + } + gpu_set->insert(val); + + q_dentry = readdir(queues_dir_hd); + } + + errno = 0; + if (closedir(queues_dir_hd)) { + return errno; + } + return 0; +} + int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc) { assert(proc != nullptr); int err; @@ -208,5 +415,138 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc) { return 0; } +int DiscoverKFDNodes(std::map> *nodes) { + assert(nodes != nullptr); + + if (nodes == nullptr) { + return EINVAL; + } + assert(nodes->size() == 0); + + nodes->clear(); + + std::shared_ptr node; + uint32_t node_indx; + + auto kfd_node_dir = opendir(kKFDNodesPathRoot); + assert(kfd_node_dir != nullptr); + + auto dentry = readdir(kfd_node_dir); + while (dentry != nullptr) { + if (dentry->d_name[0] == '.') { + dentry = readdir(kfd_node_dir); + continue; + } + + if (!is_number(dentry->d_name)) { + dentry = readdir(kfd_node_dir); + continue; + } + + node_indx = std::stoi(dentry->d_name); + node = std::shared_ptr(new KFDNode(node_indx)); + + node->Initialize(); + + if (node->gpu_id() == 0) { + // Don't add; this is a cpu node. 
+ dentry = readdir(kfd_node_dir); + continue; + } + + uint64_t kfd_gpu_node_bus_fn; + uint64_t kfd_gpu_node_domain; + int ret; + ret = + node->get_property_value(kKFDNodePropLOCATION_IDStr, + &kfd_gpu_node_bus_fn); + if (ret != 0) { + closedir(kfd_node_dir); + return ret; + } + ret = + node->get_property_value(kKFDNodePropDOMAINStr, &kfd_gpu_node_domain); + if (ret != 0) { + closedir(kfd_node_dir); + return ret; + } + + uint64_t kfd_bdfid = + (kfd_gpu_node_domain << 32) | (kfd_gpu_node_bus_fn); + (*nodes)[kfd_bdfid] = node; + + dentry = readdir(kfd_node_dir); + } + + if (closedir(kfd_node_dir)) { + return 1; + } + return 0; +} + +KFDNode::~KFDNode() { +} + +int KFDNode::ReadProperties(void) { + int ret; + + std::vector propVec; + + assert(properties_.size() == 0); + if (properties_.size() > 0) { + return 0; + } + + ret = ReadKFDDeviceProperties(node_indx_, &propVec); + + if (ret) { + return ret; + } + + std::string key_str; + // std::string val_str; + uint64_t val_int; // Assume all properties are unsigned integers for now + std::istringstream fs; + + for (uint32_t i = 0; i < propVec.size(); ++i) { + fs.str(propVec[i]); + fs >> key_str; + fs >> val_int; + + properties_[key_str] = val_int; + + fs.str(""); + fs.clear(); + } + + return 0; +} + +int +KFDNode::Initialize(void) { + int ret = 0; + ret = ReadProperties(); + if (ret) {return ret;} + + ret = ReadKFDGpuId(node_indx_, &gpu_id_); + if (ret) {return ret;} + + ret = ReadKFDGpuName(node_indx_, &name_); + + return ret; +} +int +KFDNode::get_property_value(std::string property, uint64_t *value) { + assert(value != nullptr); + if (value == nullptr) { + return EINVAL; + } + if (properties_.find(property) == properties_.end()) { + return EINVAL; + } + *value = properties_[property]; + return 0; +} + } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 8b3bcd5dd8..45c88e88de 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc 
+++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -55,12 +55,14 @@ #include #include #include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_kfd.h" static const char *kPathDRMRoot = "/sys/class/drm"; static const char *kPathHWMonRoot = "/sys/class/hwmon"; @@ -253,9 +255,9 @@ RocmSMI::Initialize(uint64_t flags) { ++i; } - // DiscoverDevices() will seach for devices and monitors and update internal - // data structures. - DiscoverDevices(); + // DiscoverAmdgpuDevices() will seach for devices and monitors and update + // internal data structures. + DiscoverAmdgpuDevices(); // IterateSMIDevices will iterate through all the known devices and apply // the provided call-back to each device found. @@ -264,7 +266,34 @@ RocmSMI::Initialize(uint64_t flags) { if (ret != 0) { throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, - "Failed to initialize rocm_smi library."); + "Failed to initialize rocm_smi library (amdgpu node discovery."); + } + + std::map> tmp_map; + ret = DiscoverKFDNodes(&tmp_map); + if (ret != 0) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Failed to initialize rocm_smi library (KFD node discovery)."); + } + + std::shared_ptr dev; + + // 1. construct kfd_node_map_ with gpu_id as key and *Device as value + // 2. for each kfd node, write the corresponding dv_ind + // 3. 
for each amdgpu device, write the corresponding gpu_id + for (uint32_t dv_ind = 0; dv_ind < s_monitor_devices.size(); ++dv_ind) { + dev = s_monitor_devices[dv_ind]; + uint64_t bdfid = dev->bdfid(); + assert(tmp_map.find(bdfid) != tmp_map.end()); + if (tmp_map.find(bdfid) == tmp_map.end()) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "amdgpu device bdfid has no KFD matching node"); + } + + tmp_map[bdfid]->set_amdgpu_dev_index(dv_ind); + uint64_t gpu_id = tmp_map[bdfid]->gpu_id(); + dev->set_kfd_gpu_id(gpu_id); + kfd_node_map_[gpu_id] = tmp_map[bdfid]; } } @@ -345,10 +374,11 @@ RocmSMI::AddToDeviceList(std::string dev_name) { } std::string d_name = dev_name; - uint32_t d_index = GetDeviceIndex(d_name); + uint32_t card_indx = GetDeviceIndex(d_name); dev->set_drm_render_minor(GetDrmRenderMinor(dev_path)); - dev->set_index(d_index); - GetSupportedEventGroups(d_index, dev->supported_event_groups()); + dev->set_card_index(card_indx); + GetSupportedEventGroups(card_indx, dev->supported_event_groups()); + devices_.push_back(dev); return; @@ -381,7 +411,7 @@ static bool isAMDGPU(std::string dev_path) { return false; } -uint32_t RocmSMI::DiscoverDevices(void) { +uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { auto ret = 0; // If this gets called more than once, clear previous findings. 
diff --git a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc index a31ca937b3..706db50be7 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc @@ -303,6 +303,7 @@ static int get_supported_sensors(std::string dir_path, std::string fn_reg_ex, std::string::size_type pos = fn_reg_ex.find('#'); if (pos == std::string::npos) { + closedir(hwmon_dir); return -1; } fn_reg_ex.erase(pos, 1); @@ -326,12 +327,16 @@ static int get_supported_sensors(std::string dir_path, std::string fn_reg_ex, assert(errno == 0); assert(*endptr == '\0'); if (errno) { + closedir(hwmon_dir); return -2; } sensors->push_back(mon_val); } dentry = readdir(hwmon_dir); } + if (closedir(hwmon_dir)) { + return errno; + } return 0; } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/id_info_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/id_info_read.cc index 834c76683e..dfd5426075 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/id_info_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/id_info_read.cc @@ -96,7 +96,8 @@ void TestIdInfoRead::Run(void) { for (uint32_t i = 0; i < num_monitor_devs(); ++i) { IF_VERB(STANDARD) { - std::cout << "\t**Device index: " << id << std::endl; + std::cout << "\t*************************" << std::endl; + std::cout << "\t**Device index: " << i << std::endl; } // Get the device ID, name, vendor ID and vendor name for the device diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc index 6010f6a910..2394b8c607 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/process_info_read.cc @@ -98,6 +98,10 @@ void TestProcInfoRead::Run(void) { TestBase::Run(); + uint32_t num_devices; + err = 
rsmi_num_monitor_devices(&num_devices); + CHK_ERR_ASRT(err) + err = rsmi_compute_process_info_get(nullptr, &num_proc_found); if (err != RSMI_STATUS_SUCCESS) { if (err == RSMI_STATUS_NOT_SUPPORTED) { @@ -119,6 +123,7 @@ void TestProcInfoRead::Run(void) { if (num_proc_found == 0) { return; } + procs = new rsmi_process_info_t[num_proc_found]; val_ui32 = num_proc_found; @@ -150,8 +155,32 @@ void TestProcInfoRead::Run(void) { // Reset to the number we actually read num_proc_found = val_ui32; if (num_proc_found) { - rsmi_process_info_t proc_info; + // Allocate the max we expect to get + uint32_t *dev_inds = new uint32_t[num_devices]; + uint32_t amt_allocd = num_devices; + for (uint32_t j = 0; j < num_proc_found; j++) { + err = rsmi_compute_process_gpus_get(procs[j].process_id, dev_inds, + &amt_allocd); + CHK_ERR_ASRT(err) + ASSERT_LE(amt_allocd, num_devices); + + std::cout << "\t**Process " << procs[j].process_id << + " is using devices with indices: "; + uint32_t i; + if (amt_allocd > 0) { + for (i = 0; i < amt_allocd - 1; ++i) { + std::cout << dev_inds[i] << ", "; + } + std::cout << dev_inds[i] << std::endl; + } + // Reset amt_allocd back to the amount acutally allocated + amt_allocd = num_devices; + } + + delete []dev_inds; + + rsmi_process_info_t proc_info; err = rsmi_compute_process_info_by_pid_get(procs[0].process_id, &proc_info); if (err == RSMI_STATUS_NOT_FOUND) {