Add rsmi_compute_process_gpus_get()

Given a process ID, give the device indices that process is
currently using.

Also:
* made corrections to how RSMI, amdgpu (ie, "card#") and
  KFD indices translate from one another
* add a few missing error codes to rsmi_status_string()
* fix some formatting

Change-Id: Icd2cae66bb4fec768da96af7cf9cf8b8b66ec7f9


[ROCm/rocm_smi_lib commit: 2d6e15190c]
Cette révision appartient à :
Chris Freehill
2020-02-15 20:39:07 -06:00
Parent 386bab024e
révision 95d3da04b9
12 fichiers modifiés avec 604 ajouts et 109 suppressions
+44 -4
Voir le fichier
@@ -2301,7 +2301,7 @@ rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group);
*
* @param[in] dv_ind a device index
*
* @param[in] type the type of performance event to create
* @param[in] type the ::rsmi_event_type_t of performance event to create
*
* @param[inout] evnt_handle A pointer to a ::rsmi_event_handle_t which will be
* associated with a newly allocated counter
@@ -2448,24 +2448,64 @@ rsmi_compute_process_info_get(rsmi_process_info_t *procs, uint32_t *num_items);
/**
* @brief Get process information about a specific process
*
* @details Given a pointer to an ::rsmi_process_info_t @p proc and a process id
* @details Given a pointer to an ::rsmi_process_info_t @p proc and a process
* id
* @p pid, this function will write the process information for @p pid, if
* available, to the memory pointed to by @p proc.
*
* @param[in] pid The process ID for which process information is being requested
* @param[in] pid The process ID for which process information is being
* requested
*
* @param[inout] proc a pointer to a ::rsmi_process_info_t to which
* process information for @p pid will be written if it is found.
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
* @retval ::RSMI_STATUS_NOT_FOUND is returned if there was no process information
* @retval ::RSMI_STATUS_NOT_FOUND is returned if there was no process
* information
* found for the provided @p pid
*
*/
rsmi_status_t
rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_process_info_t *proc);
/**
 * @brief Get the device indices currently being used by a process
 *
 * @details Given a process id @p pid and a non-NULL pointer @p dv_indices to
 * an array of uint32_t's of length *@p num_devices, this function will write
 * up to *@p num_devices device indices to the memory pointed to by
 * @p dv_indices. If all of the devices being used by @p pid fit in the
 * provided array, *@p num_devices is updated with the number of indices
 * written. If they do not all fit, ::RSMI_STATUS_INSUFFICIENT_SIZE is
 * returned and *@p num_devices is left at its input value, which is the
 * number of indices that were written.
 *
 * If @p dv_indices is NULL, nothing is written to it and *@p num_devices is
 * updated with the number of gpus currently being used by @p pid. Calling
 * this function with @p dv_indices being NULL is a way to determine how much
 * memory is required for a subsequent call with a non-NULL @p dv_indices.
 *
 * @param[in] pid The process id of the process for which the gpus currently
 * being used are requested
 *
 * @param[inout] dv_indices a pointer to memory provided by the caller to
 * which indices of devices currently being used by the process will be
 * written. This may be NULL, in which case only @p num_devices will be
 * updated with the number of devices being used.
 *
 * @param[inout] num_devices A pointer to a uint32_t which, on input, should
 * contain the number of uint32_t's that can be stored in the memory provided
 * by the @p dv_indices argument. On output, if @p dv_indices is non-NULL,
 * this holds the number of uint32_t's actually written. If @p dv_indices is
 * NULL, this argument is updated with the number of devices being used.
 * This argument must not be NULL.
 *
 * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call
 * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
 * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if there were more
 * gpu indices that could have been written, but not enough space was
 * provided as indicated by @p dv_indices and @p num_devices, on input.
 * @retval ::RSMI_STATUS_UNEXPECTED_SIZE is returned if the process appears
 * to be using more gpus than there are monitored devices.
 *
 */
rsmi_status_t
rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
uint32_t *num_devices);
/** @} */ // end of SysInfo
+7 -7
Voir le fichier
@@ -178,11 +178,9 @@ class Device {
int readDevInfo(DevInfoTypes type, std::vector<std::string> *retVec);
int writeDevInfo(DevInfoTypes type, uint64_t val);
int writeDevInfo(DevInfoTypes type, std::string val);
int populateKFDNodeProperties(bool force_update = false);
int getKFDNodeProperty(DevKFDNodePropTypes prop, uint64_t *val);
uint32_t index(void) const {return index_;}
void set_index(uint32_t index) {index_ = index;}
uint32_t index(void) const {return card_indx_;}
void set_card_index(uint32_t index) {card_indx_ = index;}
uint32_t drm_render_minor(void) const {return drm_render_minor_;}
void set_drm_render_minor(uint32_t minor) {drm_render_minor_ = minor;}
static rsmi_dev_perf_level perfLvlStrToEnum(std::string s);
@@ -192,6 +190,8 @@ class Device {
evt::dev_evt_grp_set_t* supported_event_groups(void) {
return &supported_event_groups_;}
SupportedFuncMap *supported_funcs(void) {return &supported_funcs_;}
uint64_t kfd_gpu_id(void) const {return kfd_gpu_id_;}
void set_kfd_gpu_id(uint64_t id) {kfd_gpu_id_ = id;}
void fillSupportedFuncs(void);
void DumpSupportedFunctions(void);
bool DeviceAPISupported(std::string name, uint64_t variant,
@@ -202,20 +202,20 @@ class Device {
std::shared_ptr<PowerMon> power_monitor_;
std::string path_;
shared_mutex_t mutex_;
uint32_t index_;
uint32_t card_indx_; // This index corresponds to the drm index (ie, card#)
uint32_t drm_render_minor_;
const RocmSMI_env_vars *env_;
template <typename T> int openSysfsFileStream(DevInfoTypes type, T *fs,
const char *str = nullptr);
int readDevInfoStr(DevInfoTypes type, std::string *retStr);
int readDevInfoMultiLineStr(DevInfoTypes type,
std::vector<std::string> *retVec);
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
uint64_t bdfid_;
uint64_t kfd_gpu_id_;
std::unordered_set<rsmi_event_group_t,
evt::RSMIEventGrpHashFunction> supported_event_groups_;
std::map<std::string, uint64_t> kfdNodePropMap_;
// std::map<std::string, uint64_t> kfdNodePropMap_;
SupportedFuncMap supported_funcs_;
};
+32
Voir le fichier
@@ -45,18 +45,50 @@
#include <string>
#include <vector>
#include <unordered_set>
#include <memory>
#include <map>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_device.h"
namespace amd {
namespace smi {
// Describes a single node of the KFD topology, identified by its node index.
// A node caches the key/value pairs of its "properties" file together with
// its gpu id and name, and records the index of the amdgpu device it was
// matched with.
class KFDNode {
 public:
  explicit KFDNode(uint32_t node_ind) : node_indx_(node_ind) {}
  ~KFDNode();

  // Reads the node's properties, gpu id and name. Returns 0 on success or
  // the error code of the first step that failed.
  int Initialize();

  // Reads and caches the node's properties. Returns 0 on success.
  int ReadProperties(void);

  // Looks up a cached property by name; returns EINVAL if value is null or
  // the property is not known, 0 otherwise.
  int get_property_value(std::string property, uint64_t *value);

  // Remains 0 until Initialize() has successfully read the gpu id.
  uint64_t gpu_id(void) const {return gpu_id_;}
  std::string name(void) const {return name_;}
  std::shared_ptr<Device> amdgpu_device(void) const {return amdgpu_device_;}
  uint32_t amdgpu_dev_index(void) const {return amdgpu_dev_index_;}
  void set_amdgpu_dev_index(uint32_t val) {amdgpu_dev_index_ = val;}

 private:
  uint32_t node_indx_;
  // Both members below are given defined values so that the accessors never
  // return indeterminate data when Initialize() fails or
  // set_amdgpu_dev_index() has not been called yet.
  uint32_t amdgpu_dev_index_ = 0;
  uint64_t gpu_id_ = 0;
  std::string name_;
  std::map<std::string, uint64_t> properties_;
  std::shared_ptr<Device> amdgpu_device_;
};
// Enumerates the entries of the KFD topology nodes directory and fills *nodes
// with one KFDNode per GPU node (nodes whose gpu_id is 0 are not added).
// Each entry is keyed by (domain << 32) | location_id, as read from the
// node's properties. Returns 0 on success, non-zero otherwise.
int
DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes);

// Gathers information about the processes listed in the KFD proc directory.
// NOTE(review): presumably fills up to num_allocated entries of procs and
// reports the number of processes via num_procs_found -- confirm against the
// definition.
int
GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated,
uint32_t *num_procs_found);

// NOTE(review): presumably fills *proc with the information for process pid
// -- confirm against the definition.
int
GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc);

// Inserts into *gpu_count the gpu ids read from the "gpuid" file of each
// queue directory of process pid. Despite the parameter name used here, this
// is a set of gpu ids rather than a count (the definition names it gpu_set).
// Returns 0 on success or an errno-style error code.
int
GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_count);

// Appends the lines of the "properties" file of KFD node dev_id to *retVec,
// dropping trailing whitespace-only lines. Returns 0 on success or an
// errno-style error code.
int
ReadKFDDeviceProperties(uint32_t dev_id, std::vector<std::string> *retVec);
+9 -2
Voir le fichier
@@ -51,7 +51,10 @@
#include <set>
#include <string>
#include <cstdint>
#include <unordered_map>
#include <map>
#include "rocm_smi/rocm_smi_kfd.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_monitor.h"
#include "rocm_smi/rocm_smi_power_mon.h"
@@ -71,7 +74,7 @@ class RocmSMI {
static std::vector<std::shared_ptr<amd::smi::Device>>&
monitor_devices() {return s_monitor_devices;}
uint32_t DiscoverDevices(void);
uint32_t DiscoverAmdgpuDevices(void);
uint32_t DiscoverAMDPowerMonitors(bool force_update = false);
// Will execute "func" for every Device object known about, or until func
@@ -84,17 +87,21 @@ class RocmSMI {
uint32_t euid() const {return euid_;}
std::map<uint64_t, std::shared_ptr<KFDNode>> & kfd_node_map(void) {
return kfd_node_map_;}
private:
std::vector<std::shared_ptr<Device>> devices_;
std::map<uint64_t, std::shared_ptr<KFDNode>> kfd_node_map_;
std::vector<std::shared_ptr<Monitor>> monitors_;
std::vector<std::shared_ptr<PowerMon>> power_mons_;
std::set<std::string> amd_monitor_types_;
void AddToDeviceList(std::string dev_name);
void GetEnvVariables(void);
uint32_t DiscoverAMDMonitors(void);
static std::vector<std::shared_ptr<amd::smi::Device>> s_monitor_devices;
RocmSMI_env_vars env_vars_;
uint64_t init_options_;
uint32_t euid_;
+77 -19
Voir le fichier
@@ -53,6 +53,7 @@
#include <bitset>
#include <cstdint>
#include <unordered_map>
#include <unordered_set>
#include <map>
#include <fstream>
#include <iostream>
@@ -108,6 +109,16 @@ static rsmi_status_t handleException() {
std::shared_ptr<amd::smi::Device> dev = smi.monitor_devices()[dv_ind]; \
assert(dev != nullptr);
// Expands GET_DEV_FROM_INDX and then declares a local "kfd_node" holding the
// KFDNode registered in smi.kfd_node_map() under dev->kfd_gpu_id(). If no
// node is registered under that gpu id, the enclosing function returns
// RSMI_INITIALIZATION_ERROR.
// NOTE(review): "smi" and "dev" are presumably introduced by
// GET_DEV_FROM_INDX, which in turn expects "dv_ind" to be in scope -- confirm.
#define GET_DEV_AND_KFDNODE_FROM_INDX \
GET_DEV_FROM_INDX \
std::shared_ptr<amd::smi::KFDNode> kfd_node; \
if (smi.kfd_node_map().find(dev->kfd_gpu_id()) == \
smi.kfd_node_map().end()) { \
return RSMI_INITIALIZATION_ERROR; \
} \
kfd_node = smi.kfd_node_map()[dev->kfd_gpu_id()];
#define REQUIRE_ROOT_ACCESS \
if (amd::smi::RocmSMI::getInstance().euid()) { \
return RSMI_STATUS_PERMISSION; \
@@ -168,6 +179,7 @@ static rsmi_status_t errno_to_rsmi_status(uint32_t err) {
case EISDIR: return RSMI_STATUS_FILE_ERROR;
case EINTR: return RSMI_STATUS_INTERRUPT;
case EIO: return RSMI_STATUS_UNEXPECTED_SIZE;
case ENXIO: return RSMI_STATUS_UNEXPECTED_DATA;
default: return RSMI_STATUS_UNKNOWN_ERROR;
}
}
@@ -208,7 +220,6 @@ static uint64_t get_multiplier_from_str(char units_char) {
*/
static uint64_t freq_string_to_int(const std::vector<std::string> &freq_lines,
bool *is_curr, uint32_t lanes[], uint32_t i) {
assert(i < freq_lines.size());
if (i >= freq_lines.size()) {
throw amd::smi::rsmi_exception(RSMI_STATUS_INPUT_OUT_OF_BOUNDS,
@@ -696,26 +707,15 @@ rsmi_status_t
rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
TRY
CHK_SUPPORT_NAME_ONLY(bdfid)
GET_DEV_AND_KFDNODE_FROM_INDX
CHK_API_SUPPORT_ONLY(bdfid, RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT)
DEVICE_MUTEX
*bdfid = dev->bdfid();
int32_t ret = dev->populateKFDNodeProperties();
if (ret) {
return errno_to_rsmi_status(errno);
}
uint64_t domain = 0;
ret = dev->getKFDNodeProperty(amd::smi::kDevKFDNodePropDomain, &domain);
if (ret == EINVAL) {
// "domain" is not found in properties file; just go with the 16 bit
// domain already found
return RSMI_STATUS_SUCCESS;
}
kfd_node->get_property_value("domain", &domain);
// Replace the 16 bit domain originally set like this:
// BDFID = ((<DOMAIN> & 0xffff) << 32) | ((<BUS> & 0xff) << 8) |
@@ -2329,15 +2329,30 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) {
" the call";
break;
case RSMI_STATUS_INTERRUPT:
*status_string = "An interrupt occurred while executing the function";
break;
case RSMI_STATUS_UNEXPECTED_SIZE:
*status_string = "Data (usually from reading a file) was out of"
" range from what was expected";
break;
case RSMI_STATUS_NO_DATA:
*status_string = "No data was found (usually from reading a file) "
"where data was expected";
break;
case RSMI_STATUS_UNEXPECTED_DATA:
*status_string = "Data (usually from reading a file) was not of the "
"type that was expected";
break;
case RSMI_STATUS_UNKNOWN_ERROR:
*status_string = "An unknown error prevented the call from completing"
" successfully";
break;
case RSMI_STATUS_INTERRUPT:
*status_string = "An interrupt occurred while executing the function";
break;
default:
*status_string = "An unknown error occurred";
return RSMI_STATUS_UNKNOWN_ERROR;
@@ -2696,6 +2711,49 @@ rsmi_compute_process_info_get(rsmi_process_info_t *procs,
CATCH
}
// Reports the RSMI device indices of the gpus currently used by process pid.
//
// pid          process id to query
// dv_indices   caller-provided array receiving up to *num_devices device
//              indices; may be nullptr to only query the number of gpus
// num_devices  in: capacity of dv_indices; out: number of indices written
//              or, when dv_indices is nullptr, the number of gpus in use
//
// Returns RSMI_STATUS_INVALID_ARGS if num_devices is nullptr,
// RSMI_STATUS_INSUFFICIENT_SIZE if dv_indices could not hold every index,
// RSMI_INITIALIZATION_ERROR if a gpu id in use has no known KFD node,
// RSMI_STATUS_UNEXPECTED_SIZE if more gpus were found than there are
// monitored devices, the translated error of GetProcessGPUs() on failure, and
// RSMI_STATUS_SUCCESS otherwise.
rsmi_status_t
rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices,
                                                    uint32_t *num_devices) {
  TRY

  if (num_devices == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::unordered_set<uint64_t> gpu_set;
  int err = amd::smi::GetProcessGPUs(pid, &gpu_set);

  if (err) {
    return errno_to_rsmi_status(err);
  }

  amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();

  if (dv_indices != nullptr) {
    auto &node_map = smi.kfd_node_map();
    uint32_t i = 0;

    for (auto it = gpu_set.begin(); i < *num_devices && it != gpu_set.end();
                                                                 ++it, ++i) {
      // Look the gpu id up with find(); operator[] would silently insert a
      // null entry for an unknown gpu id, which would then be dereferenced.
      auto node_it = node_map.find(*it);
      if (node_it == node_map.end() || node_it->second == nullptr) {
        return RSMI_INITIALIZATION_ERROR;
      }
      dv_indices[i] = node_it->second->amdgpu_dev_index();
    }

    if (*num_devices < gpu_set.size()) {
      // In this case, *num_devices already holds the number of items
      // written to dv_indices. We just have to let the caller know there's
      // more.
      return RSMI_STATUS_INSUFFICIENT_SIZE;
    }
  }

  *num_devices = static_cast<uint32_t>(gpu_set.size());
  if (gpu_set.size() > smi.monitor_devices().size()) {
    return RSMI_STATUS_UNEXPECTED_SIZE;
  }

  return RSMI_STATUS_SUCCESS;
  CATCH
}
rsmi_status_t
rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages,
rsmi_retired_page_record_t *records) {
+1 -1
Voir le fichier
@@ -368,7 +368,7 @@ amd::smi::evt::Event::stopCounter(void) {
return 0;
}
static long
static ssize_t
readn(int fd, void *buf, size_t n) {
ssize_t left = n;
ssize_t bytes;
-47
Voir le fichier
@@ -777,53 +777,6 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
return 0;
}
int Device::populateKFDNodeProperties(bool force_update) {
int ret;
std::vector<std::string> propVec;
if (kfdNodePropMap_.size() > 0 && !force_update) {
return 0;
}
ret = ReadKFDDeviceProperties(index_, &propVec);
if (ret) {
return ret;
}
std::string key_str;
// std::string val_str;
uint64_t val_int; // Assume all properties are unsigned integers for now
std::istringstream fs;
for (uint32_t i = 0; i < propVec.size(); ++i) {
fs.str(propVec[i]);
fs >> key_str;
fs >> val_int;
kfdNodePropMap_[key_str] = val_int;
fs.str("");
fs.clear();
}
return 0;
}
int Device::getKFDNodeProperty(DevKFDNodePropTypes prop, uint64_t *val) {
assert(val != nullptr);
assert(kDevKFDPropNameMap.find(prop) != kDevKFDPropNameMap.end());
const char *prop_name = kDevKFDPropNameMap.at(prop);
if (kfdNodePropMap_.find(prop_name) == kfdNodePropMap_.end()) {
return EINVAL;
}
*val = kfdNodePropMap_.at(prop_name);
return 0;
}
void Device::DumpSupportedFunctions(void) {
SupportedFuncMapIt func_iter = supported_funcs_.begin();
+359 -19
Voir le fichier
@@ -47,7 +47,7 @@
#include <algorithm>
#include <string>
#include <map>
#include <unordered_set>
#include <fstream>
#include <cstdint>
#include <iostream>
@@ -57,6 +57,7 @@
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_device.h"
namespace amd {
namespace smi {
@@ -66,25 +67,75 @@ static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes";
// Sysfs file names
static const char *kKFDPasidFName = "pasid";
// KFD Node Property strings
// static const char *kKFDNodePropCPU_CORES_COUNTStr = "cpu_cores_count";
// static const char *kKFDNodePropSIMD_COUNTStr = "simd_count";
// static const char *kKFDNodePropMEM_BANKS_COUNTStr = "mem_banks_count";
// static const char *kKFDNodePropCACHES_COUNTStr = "caches_count";
// static const char *kKFDNodePropIO_LINKS_COUNTStr = "io_links_count";
// static const char *kKFDNodePropCPU_CORE_ID_BASEStr = "cpu_core_id_base";
// static const char *kKFDNodePropSIMD_ID_BASEStr = "simd_id_base";
// static const char *kKFDNodePropMAX_WAVES_PER_SIMDStr = "max_waves_per_simd";
// static const char *kKFDNodePropLDS_SIZE_IN_KBStr = "lds_size_in_kb";
// static const char *kKFDNodePropGDS_SIZE_IN_KBStr = "gds_size_in_kb";
// static const char *kKFDNodePropNUM_GWSStr = "num_gws";
// static const char *kKFDNodePropWAVE_FRONT_SIZEStr = "wave_front_size";
// static const char *kKFDNodePropARRAY_COUNTStr = "array_count";
// static const char *kKFDNodePropSIMD_ARRAYS_PER_ENGINEStr =
// "simd_arrays_per_engine";
// static const char *kKFDNodePropCU_PER_SIMD_ARRAYStr = "cu_per_simd_array";
// static const char *kKFDNodePropSIMD_PER_CUStr = "simd_per_cu";
// static const char *kKFDNodePropMAX_SLOTS_SCRATCH_CUStr =
// "max_slots_scratch_cu";
// static const char *kKFDNodePropVENDOR_IDStr = "vendor_id";
// static const char *kKFDNodePropDEVICE_IDStr = "device_id";
static const char *kKFDNodePropLOCATION_IDStr = "location_id";
static const char *kKFDNodePropDOMAINStr = "domain";
// static const char *kKFDNodePropDRM_RENDER_MINORStr = "drm_render_minor";
// static const char *kKFDNodePropHIVE_IDStr = "hive_id";
// static const char *kKFDNodePropNUM_SDMA_ENGINESStr = "num_sdma_engines";
// static const char *kKFDNodePropNUM_SDMA_XGMI_ENGINESStr =
// "num_sdma_xgmi_engines";
// static const char *kKFDNodePropNUM_SDMA_QUEUES_PER_ENGINEStr =
// "num_sdma_queues_per_engine";
// static const char *kKFDNodePropNUM_CP_QUEUESStr = "num_cp_queues";
// static const char *kKFDNodePropMAX_ENGINE_CLK_FCOMPUTEStr =
// "max_engine_clk_fcompute";
// static const char *kKFDNodePropLOCAL_MEM_SIZEStr = "local_mem_size";
// static const char *kKFDNodePropFW_VERSIONStr = "fw_version";
// static const char *kKFDNodePropCAPABILITYStr = "capability";
// static const char *kKFDNodePropDEBUG_PROPStr = "debug_prop";
// static const char *kKFDNodePropSDMA_FW_VERSIOStr = "sdma_fw_versio";
// static const char *kKFDNodePropMAX_ENGINE_CLK_CCOMPUTEStr =
// "max_engine_clk_ccompute";
// Returns true if s is non-empty and consists solely of decimal digits.
static bool is_number(const std::string &s) {
  // The character is taken as unsigned char because passing a negative char
  // value (e.g. a non-ASCII byte) to isdigit() is undefined behavior.
  return !s.empty() &&
         std::all_of(s.begin(), s.end(),
                     [](unsigned char c) { return ::isdigit(c) != 0; });
}
int ReadKFDDeviceProperties(uint32_t dev_id,
std::vector<std::string> *retVec) {
// Returns the path of the KFD topology directory of node dev_id, i.e.
// "<kKFDNodesPathRoot>/<dev_id>".
static std::string KFDDevicePath(uint32_t dev_id) {
  return std::string(kKFDNodesPathRoot) + '/' + std::to_string(dev_id);
}
static int OpenKFDNodeFile(uint32_t dev_id, std::string node_file,
std::ifstream *fs) {
std::string line;
int ret;
std::ifstream fs;
std::string properties_path = kKFDNodesPathRoot;
std::string f_path;
bool reg_file;
assert(retVec != nullptr);
assert(fs != nullptr);
properties_path += '/';
properties_path += std::to_string(dev_id);
properties_path += "/properties";
f_path = KFDDevicePath(dev_id);
f_path += "/";
f_path += node_file;
ret = isRegularFile(properties_path, &reg_file);
ret = isRegularFile(f_path, &reg_file);
if (ret != 0) {
return ret;
@@ -93,26 +144,104 @@ int ReadKFDDeviceProperties(uint32_t dev_id,
return ENOENT;
}
fs.open(properties_path);
fs->open(f_path);
if (!fs.is_open()) {
if (!fs->is_open()) {
return errno;
}
return 0;
}
// Reads the "properties" file of KFD node kfd_node_id and appends each of its
// lines to *retVec. Trailing lines that are empty or contain only whitespace
// are removed. Returns 0 on success, or the error code reported by
// OpenKFDNodeFile() if the file could not be opened.
int ReadKFDDeviceProperties(uint32_t kfd_node_id,
                                           std::vector<std::string> *retVec) {
  assert(retVec != nullptr);

  std::ifstream fs;
  const int ret = OpenKFDNodeFile(kfd_node_id, "properties", &fs);
  if (ret) {
    return ret;
  }

  std::string line;
  while (std::getline(fs, line)) {
    retVec->push_back(line);
  }
  fs.close();

  // Remove any *trailing* empty (whitespace) lines. The emptiness check keeps
  // back() from being called on an empty vector when the file is empty or
  // every line consists of whitespace only.
  while (!retVec->empty() &&
         retVec->back().find_first_not_of(" \t\n\v\f\r") ==
                                                         std::string::npos) {
    retVec->pop_back();
  }
  return 0;
}
// Reads the "gpu_id" file of KFD node kfd_node_id and stores its value in
// *gpu_id. Returns 0 on success, the error code reported by
// OpenKFDNodeFile() if the file could not be opened, or ENXIO if the file
// content is not an unsigned decimal number that fits in a uint64_t.
static int ReadKFDGpuId(uint32_t kfd_node_id, uint64_t *gpu_id) {
  assert(gpu_id != nullptr);

  std::ifstream fs;
  const int ret = OpenKFDNodeFile(kfd_node_id, "gpu_id", &fs);
  if (ret) {
    return ret;
  }

  std::stringstream ss;
  ss << fs.rdbuf();
  fs.close();

  std::string gpu_id_str = ss.str();
  gpu_id_str.erase(std::remove(gpu_id_str.begin(), gpu_id_str.end(), '\n'),
                                                           gpu_id_str.end());

  if (!is_number(gpu_id_str)) {
    return ENXIO;
  }

  // Parse as unsigned 64 bit: std::stoi() would throw for any id larger than
  // INT_MAX even though the destination is a uint64_t.
  try {
    *gpu_id = std::stoull(gpu_id_str);
  } catch (...) {
    // Only reachable if the digit string does not fit in 64 bits
    return ENXIO;
  }
  return 0;
}
// Reads the "name" file of KFD node kfd_node_id and stores its content, with
// all newline characters removed, in *gpu_name. Returns 0 on success or the
// error code reported by OpenKFDNodeFile().
static int ReadKFDGpuName(uint32_t kfd_node_id, std::string *gpu_name) {
  assert(gpu_name != nullptr);

  std::ifstream name_file;
  const int status = OpenKFDNodeFile(kfd_node_id, "name", &name_file);
  if (status != 0) {
    name_file.close();
    return status;
  }

  // Slurp the whole file
  std::stringstream contents;
  contents << name_file.rdbuf();
  name_file.close();

  std::string cleaned = contents.str();
  cleaned.erase(std::remove(cleaned.begin(), cleaned.end(), '\n'),
                                                              cleaned.end());
  *gpu_name = cleaned;

  return 0;
}
int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated,
uint32_t *num_procs_found) {
@@ -128,7 +257,7 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated,
}
auto dentry = readdir(proc_dir);
std::string prod_id_str;
std::string proc_id_str;
std::string tmp;
while (dentry != nullptr) {
@@ -137,29 +266,32 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated,
continue;
}
prod_id_str = dentry->d_name;
assert(is_number(prod_id_str) && "Unexpected file name in kfd/proc dir");
if (!is_number(prod_id_str)) {
proc_id_str = dentry->d_name;
assert(is_number(proc_id_str) && "Unexpected file name in kfd/proc dir");
if (!is_number(proc_id_str)) {
dentry = readdir(proc_dir);
continue;
}
if (procs && *num_procs_found < num_allocated) {
int err;
std::string tmp;
procs[*num_procs_found].process_id = std::stoi(prod_id_str);
procs[*num_procs_found].process_id = std::stoi(proc_id_str);
std::string pasid_str_path = kKFDProcPathRoot;
pasid_str_path += "/";
pasid_str_path += prod_id_str;
pasid_str_path += proc_id_str;
pasid_str_path += "/";
pasid_str_path += kKFDPasidFName;
err = ReadSysfsStr(pasid_str_path, &tmp);
if (err) {
return err;
dentry = readdir(proc_dir);
continue;
}
assert(is_number(tmp) && "Unexpected value in pasid file");
if (!is_number(tmp)) {
closedir(proc_dir);
return EINVAL;
}
procs[*num_procs_found].pasid = std::stoi(tmp);
@@ -176,6 +308,81 @@ int GetProcessInfo(rsmi_process_info_t *procs, uint32_t num_allocated,
return 0;
}
// Read the gpuid files found in all the <queue id> dirs of process pid and
// put the gpu ids in gpu_set.
// Directory structure:
//    /sys/class/kfd/kfd/proc/<pid>/queues/<queue id>/gpuid
//
// Returns 0 on success, or an errno-style error code:
//    EINVAL  gpu_set is null
//    ESRCH   the queues directory of pid could not be opened
//    ENXIO   a gpuid file did not contain a valid number
//    errno   as set by closedir() if closing the directory failed
int GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_set) {
  assert(gpu_set != nullptr);
  if (gpu_set == nullptr) {
    // This function reports errno-style codes (the caller translates them
    // with errno_to_rsmi_status()), so an rsmi_status_t must not be returned.
    return EINVAL;
  }

  std::string queues_dir = kKFDProcPathRoot;
  queues_dir += "/";
  queues_dir += std::to_string(pid);
  queues_dir += "/queues";

  errno = 0;
  auto queues_dir_hd = opendir(queues_dir.c_str());
  if (queues_dir_hd == nullptr) {
    std::string err_str = "Unable to open queues directory for process ";
    err_str += std::to_string(pid);
    perror(err_str.c_str());
    return ESRCH;
  }

  std::string q_gpu_id_str;
  std::string tmp;

  for (auto q_dentry = readdir(queues_dir_hd); q_dentry != nullptr;
                                         q_dentry = readdir(queues_dir_hd)) {
    // Only directories named after a (numeric) queue id are of interest
    if (q_dentry->d_name[0] == '.' || !is_number(q_dentry->d_name)) {
      continue;
    }

    q_gpu_id_str = queues_dir + '/' + q_dentry->d_name + "/gpuid";

    // Best effort: skip queues whose gpuid file cannot be read
    if (ReadSysfsStr(q_gpu_id_str, &tmp)) {
      continue;
    }

    uint64_t val;
    try {
      // Parse as unsigned 64 bit to match the type of the set's elements;
      // std::stoi() would reject any id larger than INT_MAX.
      val = std::stoull(tmp);
    } catch (...) {
      std::cerr << "Error; read invalid data: " << tmp << " from " <<
                                                   q_gpu_id_str << std::endl;
      closedir(queues_dir_hd);
      return ENXIO;  // Return "no such device" if we read an invalid gpu id
    }
    gpu_set->insert(val);
  }

  errno = 0;
  if (closedir(queues_dir_hd)) {
    return errno;
  }
  return 0;
}
int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc) {
assert(proc != nullptr);
int err;
@@ -208,5 +415,138 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc) {
return 0;
}
int DiscoverKFDNodes(std::map<uint64_t, std::shared_ptr<KFDNode>> *nodes) {
assert(nodes != nullptr);
if (nodes == nullptr) {
return EINVAL;
}
assert(nodes->size() == 0);
nodes->clear();
std::shared_ptr<KFDNode> node;
uint32_t node_indx;
auto kfd_node_dir = opendir(kKFDNodesPathRoot);
assert(kfd_node_dir != nullptr);
auto dentry = readdir(kfd_node_dir);
while (dentry != nullptr) {
if (dentry->d_name[0] == '.') {
dentry = readdir(kfd_node_dir);
continue;
}
if (!is_number(dentry->d_name)) {
dentry = readdir(kfd_node_dir);
continue;
}
node_indx = std::stoi(dentry->d_name);
node = std::shared_ptr<KFDNode>(new KFDNode(node_indx));
node->Initialize();
if (node->gpu_id() == 0) {
// Don't add; this is a cpu node.
dentry = readdir(kfd_node_dir);
continue;
}
uint64_t kfd_gpu_node_bus_fn;
uint64_t kfd_gpu_node_domain;
int ret;
ret =
node->get_property_value(kKFDNodePropLOCATION_IDStr,
&kfd_gpu_node_bus_fn);
if (ret != 0) {
closedir(kfd_node_dir);
return ret;
}
ret =
node->get_property_value(kKFDNodePropDOMAINStr, &kfd_gpu_node_domain);
if (ret != 0) {
closedir(kfd_node_dir);
return ret;
}
uint64_t kfd_bdfid =
(kfd_gpu_node_domain << 32) | (kfd_gpu_node_bus_fn);
(*nodes)[kfd_bdfid] = node;
dentry = readdir(kfd_node_dir);
}
if (closedir(kfd_node_dir)) {
return 1;
}
return 0;
}
// Nothing to release explicitly; the members clean up after themselves.
KFDNode::~KFDNode() = default;
// Reads the properties of this node through ReadKFDDeviceProperties() and
// caches them as name/value pairs in properties_. Does nothing if properties
// have already been cached. Returns 0 on success or the error code reported
// by ReadKFDDeviceProperties().
int KFDNode::ReadProperties(void) {
  assert(properties_.size() == 0);
  if (!properties_.empty()) {
    return 0;
  }

  std::vector<std::string> prop_lines;
  const int status = ReadKFDDeviceProperties(node_indx_, &prop_lines);
  if (status != 0) {
    return status;
  }

  // Each line is parsed as "<name> <value>"
  std::string prop_name;
  uint64_t prop_value;  // Assume all properties are unsigned integers for now
  std::istringstream parser;

  for (const std::string &prop_line : prop_lines) {
    parser.str(prop_line);
    parser >> prop_name >> prop_value;
    properties_[prop_name] = prop_value;

    // Reset the content and state flags before parsing the next line
    parser.str("");
    parser.clear();
  }

  return 0;
}
// Loads the node's properties, gpu id and name, in that order, stopping at
// the first step that fails. Returns 0 on success or the error code of the
// failing step.
int
KFDNode::Initialize(void) {
  int status = ReadProperties();
  if (status != 0) {
    return status;
  }

  status = ReadKFDGpuId(node_indx_, &gpu_id_);
  if (status != 0) {
    return status;
  }

  return ReadKFDGpuName(node_indx_, &name_);
}
// Copies the cached value of the named property into *value.
// Returns 0 on success, or EINVAL if value is null or the property has not
// been cached for this node.
int
KFDNode::get_property_value(std::string property, uint64_t *value) {
  assert(value != nullptr);
  if (value == nullptr) {
    return EINVAL;
  }

  const auto entry = properties_.find(property);
  if (entry == properties_.end()) {
    return EINVAL;
  }

  *value = entry->second;
  return 0;
}
} // namespace smi
} // namespace amd
+38 -8
Voir le fichier
@@ -55,12 +55,14 @@
#include <utility>
#include <functional>
#include <cerrno>
#include <unordered_map>
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_kfd.h"
static const char *kPathDRMRoot = "/sys/class/drm";
static const char *kPathHWMonRoot = "/sys/class/hwmon";
@@ -253,9 +255,9 @@ RocmSMI::Initialize(uint64_t flags) {
++i;
}
// DiscoverDevices() will seach for devices and monitors and update internal
// data structures.
DiscoverDevices();
// DiscoverAmdgpuDevices() will search for devices and monitors and update
// internal data structures.
DiscoverAmdgpuDevices();
// IterateSMIDevices will iterate through all the known devices and apply
// the provided call-back to each device found.
@@ -264,7 +266,34 @@ RocmSMI::Initialize(uint64_t flags) {
if (ret != 0) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"Failed to initialize rocm_smi library.");
"Failed to initialize rocm_smi library (amdgpu node discovery.");
}
std::map<uint64_t, std::shared_ptr<KFDNode>> tmp_map;
ret = DiscoverKFDNodes(&tmp_map);
if (ret != 0) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"Failed to initialize rocm_smi library (KFD node discovery).");
}
std::shared_ptr<amd::smi::Device> dev;
// 1. construct kfd_node_map_ with gpu_id as key and KFDNode as value
// 2. for each kfd node, write the corresponding dv_ind
// 3. for each amdgpu device, write the corresponding gpu_id
for (uint32_t dv_ind = 0; dv_ind < s_monitor_devices.size(); ++dv_ind) {
dev = s_monitor_devices[dv_ind];
uint64_t bdfid = dev->bdfid();
assert(tmp_map.find(bdfid) != tmp_map.end());
if (tmp_map.find(bdfid) == tmp_map.end()) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"amdgpu device bdfid has no KFD matching node");
}
tmp_map[bdfid]->set_amdgpu_dev_index(dv_ind);
uint64_t gpu_id = tmp_map[bdfid]->gpu_id();
dev->set_kfd_gpu_id(gpu_id);
kfd_node_map_[gpu_id] = tmp_map[bdfid];
}
}
@@ -345,10 +374,11 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
}
std::string d_name = dev_name;
uint32_t d_index = GetDeviceIndex(d_name);
uint32_t card_indx = GetDeviceIndex(d_name);
dev->set_drm_render_minor(GetDrmRenderMinor(dev_path));
dev->set_index(d_index);
GetSupportedEventGroups(d_index, dev->supported_event_groups());
dev->set_card_index(card_indx);
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
devices_.push_back(dev);
return;
@@ -381,7 +411,7 @@ static bool isAMDGPU(std::string dev_path) {
return false;
}
uint32_t RocmSMI::DiscoverDevices(void) {
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
auto ret = 0;
// If this gets called more than once, clear previous findings.
+5
Voir le fichier
@@ -303,6 +303,7 @@ static int get_supported_sensors(std::string dir_path, std::string fn_reg_ex,
std::string::size_type pos = fn_reg_ex.find('#');
if (pos == std::string::npos) {
closedir(hwmon_dir);
return -1;
}
fn_reg_ex.erase(pos, 1);
@@ -326,12 +327,16 @@ static int get_supported_sensors(std::string dir_path, std::string fn_reg_ex,
assert(errno == 0);
assert(*endptr == '\0');
if (errno) {
closedir(hwmon_dir);
return -2;
}
sensors->push_back(mon_val);
}
dentry = readdir(hwmon_dir);
}
if (closedir(hwmon_dir)) {
return errno;
}
return 0;
}
+2 -1
Voir le fichier
@@ -96,7 +96,8 @@ void TestIdInfoRead::Run(void) {
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
IF_VERB(STANDARD) {
std::cout << "\t**Device index: " << id << std::endl;
std::cout << "\t*************************" << std::endl;
std::cout << "\t**Device index: " << i << std::endl;
}
// Get the device ID, name, vendor ID and vendor name for the device
+30 -1
Voir le fichier
@@ -98,6 +98,10 @@ void TestProcInfoRead::Run(void) {
TestBase::Run();
uint32_t num_devices;
err = rsmi_num_monitor_devices(&num_devices);
CHK_ERR_ASRT(err)
err = rsmi_compute_process_info_get(nullptr, &num_proc_found);
if (err != RSMI_STATUS_SUCCESS) {
if (err == RSMI_STATUS_NOT_SUPPORTED) {
@@ -119,6 +123,7 @@ void TestProcInfoRead::Run(void) {
if (num_proc_found == 0) {
return;
}
procs = new rsmi_process_info_t[num_proc_found];
val_ui32 = num_proc_found;
@@ -150,8 +155,32 @@ void TestProcInfoRead::Run(void) {
// Reset to the number we actually read
num_proc_found = val_ui32;
if (num_proc_found) {
rsmi_process_info_t proc_info;
// Allocate the max we expect to get
uint32_t *dev_inds = new uint32_t[num_devices];
uint32_t amt_allocd = num_devices;
for (uint32_t j = 0; j < num_proc_found; j++) {
err = rsmi_compute_process_gpus_get(procs[j].process_id, dev_inds,
&amt_allocd);
CHK_ERR_ASRT(err)
ASSERT_LE(amt_allocd, num_devices);
std::cout << "\t**Process " << procs[j].process_id <<
" is using devices with indices: ";
uint32_t i;
if (amt_allocd > 0) {
for (i = 0; i < amt_allocd - 1; ++i) {
std::cout << dev_inds[i] << ", ";
}
std::cout << dev_inds[i] << std::endl;
}
// Reset amt_allocd back to the amount actually allocated
amt_allocd = num_devices;
}
delete []dev_inds;
rsmi_process_info_t proc_info;
err = rsmi_compute_process_info_by_pid_get(procs[0].process_id,
&proc_info);
if (err == RSMI_STATUS_NOT_FOUND) {