Add GPU partition nodes

* Updates:
    - Fixed infinite loop on systems
      which did not have VRAM files
    - Fixed concise info from throwing exception
      with no amdgpu driver loaded
    - Fix for ability to see all nodes
      after switching partitions (mirrors
      original card display/settings)
    - Added to logs build type, lib path,
      and set env. variables

Change-Id: Ic0333df355144ce2242cecea93fe4ce51caf311c
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/rocm_smi_lib commit: ed6777a8e7]
This commit is contained in:
Charis Poag
2023-08-25 22:25:25 -05:00
parent 9da052436a
commit d975792f47
11 changed files with 390 additions and 67 deletions
@@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_count);
int
ReadKFDDeviceProperties(uint32_t dev_id, std::vector<std::string> *retVec);
int read_node_properties(uint32_t node, std::string property_name,
uint64_t *val);
int get_gpu_id(uint32_t node, uint64_t *gpu_id);
} // namespace smi
} // namespace amd
@@ -113,7 +113,8 @@ class RocmSMI {
uint64_t *weight);
int get_node_index(uint32_t dv_ind, uint32_t *node_ind);
const RocmSMI_env_vars& getEnv(void);
void printEnvVarInfo(void);
std::string getRSMIEnvVarInfo(void);
void debugRSMIEnvVarInfo();
bool isLoggingOn(void);
uint32_t getLogSetting(void);
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
@@ -99,13 +99,17 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type,
rsmi_status_t ErrnoToRsmiStatus(int err);
std::string getRSMIStatusString(rsmi_status_t ret);
std::tuple<bool, std::string, std::string, std::string, std::string,
std::string, std::string, std::string, std::string>
std::string, std::string, std::string, std::string,
std::string, std::string, std::string>
getSystemDetails(void);
void logSystemDetails(void);
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
void logHexDump(const char *desc, const void *addr, const size_t len,
size_t perLine);
bool isSystemBigEndian();
std::string getBuildType();
std::string getMyLibPath();
int subDirectoryCountInPath(const std::string path);
template <typename T>
std::string print_int_as_hex(T i, bool showHexNotation=true) {
std::stringstream ss;
+1 -1
View File
@@ -72,7 +72,7 @@ target_include_directories(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_INC_LIST})
target_link_libraries(${OAM_EXAMPLE_EXE} ${OAM_TARGET})
add_library(${OAM_TARGET} ${CMN_SRC_LIST} ${OAM_SRC_LIST}
${CMN_INC_LIST} ${OAM_INC_LIST})
target_link_libraries(${OAM_TARGET} pthread rt)
target_link_libraries(${OAM_TARGET} pthread rt dl)
target_include_directories(${OAM_TARGET} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/include ${COMMON_PROJ_ROOT}/common/shared_mutex)
@@ -1594,7 +1594,9 @@ def showAllConcise(deviceList):
printLogSpacer(' Concise Info ')
deviceList.sort()
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
temp_type = '(' + temp_type_lst[0] + ')'
if len(deviceList) >= 1:
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
available_temp_type = temp_type.lower()
available_temp_type = available_temp_type.replace('(', '')
available_temp_type = available_temp_type.replace(')', '')
@@ -1843,7 +1845,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr))
else:
logging.debug('PCIe clock is unsupported on device[{}]'.format(device))
printLogSpacer()
if not concise:
printLogSpacer()
def showCurrentFans(deviceList):
@@ -2786,7 +2789,9 @@ def getGraphColor(percentage):
def showTempGraph(deviceList):
deviceList.sort()
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
temp_type = '(' + temp_type_lst[0] + ')'
if len(deviceList) >= 1:
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
temp_type = temp_type.lower()
temp_type = temp_type.replace('(', '')
@@ -80,7 +80,7 @@ add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc")
target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET})
add_library(${ROCM_SMI_TARGET} ${CMN_SRC_LIST} ${SMI_SRC_LIST}
${CMN_INC_LIST} ${SMI_INC_LIST})
target_link_libraries(${ROCM_SMI_TARGET} pthread rt)
target_link_libraries(${ROCM_SMI_TARGET} pthread rt dl)
target_include_directories(${ROCM_SMI_TARGET} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR} ${COMMON_PROJ_ROOT}/common/shared_mutex)
+40 -1
View File
@@ -2991,10 +2991,24 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) {
GET_DEV_AND_KFDNODE_FROM_INDX
if (kfd_node->get_total_memory(total) == 0 && *total > 0) {
ss << __PRETTY_FUNCTION__
<< " | inside success fallback... "
<< " | Device #: " << std::to_string(dv_ind)
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
<< " | Data: total = " << std::to_string(*total)
<< " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS);
LOG_DEBUG(ss);
return RSMI_STATUS_SUCCESS;
}
}
ss << __PRETTY_FUNCTION__
<< " | after fallback... "
<< " | Device #: " << std::to_string(dv_ind)
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
<< " | Data: total = " << std::to_string(*total)
<< " | ret = " << getRSMIStatusString(ret);
LOG_DEBUG(ss);
return ret;
CATCH
}
@@ -3036,11 +3050,36 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
GET_DEV_AND_KFDNODE_FROM_INDX
uint64_t total = 0;
ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total);
if (total != 0) return ret; // do not need to fallback
if (total != 0) {
ss << __PRETTY_FUNCTION__
<< " no fallback needed! - "
<< " | Device #: " << std::to_string(dv_ind)
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
<< " | Data: Used = " << std::to_string(*used)
<< " | Data: total = " << std::to_string(total)
<< " | ret = " << getRSMIStatusString(ret);
LOG_DEBUG(ss);
return ret; // do not need to fallback
}
if ( kfd_node->get_used_memory(used) == 0 ) {
ss << __PRETTY_FUNCTION__
<< " | in fallback == success ..."
<< " | Device #: " << std::to_string(dv_ind)
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
<< " | Data: Used = " << std::to_string(*used)
<< " | Data: total = " << std::to_string(total)
<< " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS);
LOG_DEBUG(ss);
return RSMI_STATUS_SUCCESS;
}
}
ss << __PRETTY_FUNCTION__
<< " | at end!!!! after fallback ..."
<< " | Device #: " << std::to_string(dv_ind)
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
<< " | Data: Used = " << std::to_string(*used)
<< " | ret = " << getRSMIStatusString(ret);
LOG_DEBUG(ss);
return ret;
CATCH
+89 -2
View File
@@ -63,6 +63,7 @@
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_logger.h"
namespace amd {
namespace smi {
@@ -775,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth,
// /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties
// size_in_bytes 68702699520
int KFDNode::get_total_memory(uint64_t* total) {
if (total == nullptr) return EINVAL;
std::ostringstream ss;
if (total == nullptr) {
return EINVAL;
}
*total = 0;
std::string f_path = kKFDNodesPathRoot;
f_path += "/";
f_path += std::to_string(node_indx_);
f_path += "/mem_banks";
int subDirCount = subDirectoryCountInPath(f_path);
ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path
<< " has subdirectory count = " << std::to_string(subDirCount);
LOG_DEBUG(ss);
auto kfd_node_dir = opendir(f_path.c_str());
if (kfd_node_dir == nullptr) {
return errno;
}
auto dentry = readdir(kfd_node_dir);
while (dentry != nullptr) {
while (dentry != nullptr && subDirCount > 0) {
ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path
<< " has subdirectory count = " << std::to_string(subDirCount);
LOG_DEBUG(ss);
if (dentry->d_name[0] == '.') {
dentry = readdir(kfd_node_dir);
continue;
@@ -822,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) {
}
}
} // end loop for lines in property file
subDirCount--;
} // end loop for mem_bank directory
if (closedir(kfd_node_dir)) {
@@ -862,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) {
return 1;
}
// /sys/class/kfd/kfd/topology/nodes/*/properties
int read_node_properties(uint32_t node, std::string property_name,
uint64_t *val) {
std::ostringstream ss;
int retVal = EINVAL;
if (property_name.empty() || val == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", property_name is empty or *val is nullptr "
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
return retVal;
}
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
myNode->Initialize();
if (KFDNodeSupported(node)) {
retVal = myNode->get_property_value(property_name, val);
ss << __PRETTY_FUNCTION__
<< " | Successfully read node #" << std::to_string(node)
<< " for property_name = " << property_name
<< " | Data (" << property_name << ") * val = "
<< std::to_string(*val)
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
} else {
retVal = 1;
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", KFD node was an unsupported node."
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_ERROR(ss);
}
return retVal;
}
// /sys/class/kfd/kfd/topology/nodes/*/gpu_id
int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
std::ostringstream ss;
int retVal = EINVAL;
if (gpu_id == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", gpu_id is a nullptr "
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
return retVal;
}
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
myNode->Initialize();
if (KFDNodeSupported(node)) {
retVal = ReadKFDGpuId(node, gpu_id);
ss << __PRETTY_FUNCTION__
<< " | Successfully read node #" << std::to_string(node)
<< " for gpu_id"
<< " | Data (gpu_id) *gpu_id = "
<< std::to_string(*gpu_id)
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_DEBUG(ss);
} else {
retVal = 1;
ss << __PRETTY_FUNCTION__
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", KFD node was an unsupported node."
<< " | return = " << std::to_string(retVal)
<< " | ";
LOG_ERROR(ss);
}
return retVal;
}
} // namespace smi
} // namespace amd
+153 -41
View File
@@ -333,7 +333,7 @@ RocmSMI::Initialize(uint64_t flags) {
GetEnvVariables();
// To help debug env variable issues
// printEnvVarInfo();
// debugRSMIEnvVarInfo();
while (!std::string(kAMDMonitorTypes[i]).empty()) {
amd_monitor_types_.insert(kAMDMonitorTypes[i]);
@@ -390,7 +390,7 @@ RocmSMI::Initialize(uint64_t flags) {
uint64_t bdfid = (*dev_iter)->bdfid();
if (tmp_map.find(bdfid) == tmp_map.end()) {
ss << __PRETTY_FUNCTION__ << " | removing device = "
<< (*dev_iter)->path();
<< (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid);
dev_iter = devices_.erase(dev_iter);
LOG_DEBUG(ss);
continue;
@@ -549,48 +549,54 @@ uint32_t RocmSMI::getLogSetting() {
return this->env_vars_.logging_on;
}
void RocmSMI::printEnvVarInfo(void) {
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = "
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_output_bitfield))
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = "
<< ((env_vars_.path_DRM_root_override == nullptr)
? "<undefined>" : env_vars_.path_DRM_root_override)
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = "
<< ((env_vars_.path_HWMon_root_override == nullptr)
? "<undefined>" : env_vars_.path_HWMon_root_override)
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = "
<< ((env_vars_.path_power_root_override == nullptr)
? "<undefined>" : env_vars_.path_power_root_override)
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = "
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_inf_loop))
<< std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
void RocmSMI::debugRSMIEnvVarInfo(void) {
std::cout << __PRETTY_FUNCTION__
<< RocmSMI::getInstance().getRSMIEnvVarInfo();
}
std::string RocmSMI::getRSMIEnvVarInfo(void) {
std::ostringstream ss;
ss << "\n\tRSMI_DEBUG_BITFIELD = "
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_output_bitfield))
<< std::endl;
ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = "
<< ((env_vars_.path_DRM_root_override == nullptr)
? "<undefined>" : env_vars_.path_DRM_root_override)
<< std::endl;
ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = "
<< ((env_vars_.path_HWMon_root_override == nullptr)
? "<undefined>" : env_vars_.path_HWMon_root_override)
<< std::endl;
ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = "
<< ((env_vars_.path_power_root_override == nullptr)
? "<undefined>" : env_vars_.path_power_root_override)
<< std::endl;
ss << "\tRSMI_DEBUG_INFINITE_LOOP = "
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
: std::to_string(env_vars_.debug_inf_loop))
<< std::endl;
ss << "\tRSMI_LOGGING = "
<< getLogSetting() << std::endl;
bool isLoggingOn = RocmSMI::isLoggingOn();
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
<< (isLoggingOn ? "true" : "false") << std::endl;
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false;
ss << "\tRSMI_LOGGING (are logs on) = "
<< (isLoggingOn ? "TRUE" : "FALSE") << std::endl;
ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {";
if (env_vars_.enum_overrides.empty()) {
std::cout << "}" << std::endl;
return;
ss << "}" << std::endl;
return ss.str();
}
for (auto it=env_vars_.enum_overrides.begin();
it != env_vars_.enum_overrides.end(); ++it) {
DevInfoTypes type = static_cast<DevInfoTypes>(*it);
std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type)
+ ")");
ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")");
auto temp_it = it;
if(++temp_it != env_vars_.enum_overrides.end()) {
std::cout << ", ";
ss << ", ";
}
}
std::cout << "}" << std::endl;
ss << "}" << std::endl;
return ss.str();
}
std::shared_ptr<Monitor>
@@ -692,8 +698,7 @@ static bool isAMDGPU(std::string dev_path) {
std::string vend_path = dev_path + "/device/vendor";
if (!FileExists(vend_path.c_str())) {
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
@@ -703,8 +708,7 @@ static bool isAMDGPU(std::string dev_path) {
if (!fs.is_open()) {
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
@@ -719,8 +723,7 @@ static bool isAMDGPU(std::string dev_path) {
isAmdGpu = true;
}
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
"is an amdgpu device - FALSE");
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
LOG_DEBUG(ss);
return isAmdGpu;
}
@@ -728,6 +731,7 @@ static bool isAMDGPU(std::string dev_path) {
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
std::string err_msg;
uint32_t count = 0;
std::ostringstream ss;
// If this gets called more than once, clear previous findings.
devices_.clear();
@@ -754,17 +758,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
}
dentry = readdir(drm_dir);
}
ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
<< std::to_string(count) << " cards" << " | ";
LOG_DEBUG(ss);
struct systemNode {
uint32_t s_node_id = 0;
uint64_t s_gpu_id = 0;
uint64_t s_unique_id = 0;
};
// allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id}
std::multimap<uint64_t, systemNode> allSystemNodes;
uint32_t node_id = 0;
while (true) {
uint64_t gpu_id = 0, unique_id = 0;
int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
if (ret_gpu_id == 0 || ret_unique_id == 0) {
systemNode myNode;
myNode.s_node_id = node_id;
myNode.s_gpu_id = gpu_id;
myNode.s_unique_id = unique_id;
if(gpu_id != 0) { // only add gpu nodes, 0 = CPU
allSystemNodes.emplace(unique_id, myNode);
}
} else {
break;
}
node_id++;
}
ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {";
for(auto i: allSystemNodes) {
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
<< "], "
;
}
ss << "}";
LOG_DEBUG(ss);
// Discover all root cards & gpu partitions associated with each
for (uint32_t node_id = 0; node_id < count; node_id++) {
std::string path = kPathDRMRoot;
path += "/card";
path += std::to_string(node_id);
uint64_t primary_unique_id = 0;
// each identified gpu card node is a primary node for
// potential matching unique ids
if (isAMDGPU(path) ||
(init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) {
std::string d_name = "card";
d_name += std::to_string(node_id);
AddToDeviceList(d_name);
}
ss << __PRETTY_FUNCTION__
<< " | Ordered system nodes seen in lookup = {";
for (auto i : allSystemNodes) {
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
<< "], ";
}
ss << "}";
LOG_DEBUG(ss);
uint64_t temp_primary_unique_id = 0;
if (allSystemNodes.empty()) {
continue;
}
// get lowest key 1st to keep order of nodes matching card
uint32_t lowest_NodeId = 0;
uint32_t curr_NodeId = 0;
for (auto it = allSystemNodes.begin(), end = allSystemNodes.end();
it != end; it = allSystemNodes.upper_bound(it->first)) {
curr_NodeId = it->second.s_node_id;
if (it == allSystemNodes.begin()) {
lowest_NodeId = it->second.s_node_id;
}
if (curr_NodeId <= lowest_NodeId) {
lowest_NodeId = curr_NodeId;
temp_primary_unique_id = it->second.s_unique_id;
}
}
ss << __PRETTY_FUNCTION__
<< " | lowest_NodeId = " << std::to_string(lowest_NodeId)
<< " | curr_NodeId = " << std::to_string(curr_NodeId)
<< " | temp_primary_unique_id = "
<< std::to_string(temp_primary_unique_id);
LOG_DEBUG(ss);
if (temp_primary_unique_id != 0) {
primary_unique_id = temp_primary_unique_id;
} else {
allSystemNodes.erase(primary_unique_id);
continue;
}
auto numb_nodes = allSystemNodes.count(primary_unique_id);
ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = "
<< std::to_string(primary_unique_id) << " has "
<< std::to_string(numb_nodes) << " known gpu nodes";
LOG_DEBUG(ss);
while (numb_nodes > 1) {
std::string secNode = "card";
secNode += std::to_string(node_id); // add the primary node id
AddToDeviceList(secNode);
numb_nodes--;
}
// remove already added nodes associated with current card
auto erasedNodes = allSystemNodes.erase(primary_unique_id);
ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = "
<< std::to_string(primary_unique_id) << " erased "
<< std::to_string(erasedNodes) << " nodes";
LOG_DEBUG(ss);
}
}
if (closedir(drm_dir)) {
+85 -15
View File
@@ -40,12 +40,17 @@
* DEALINGS WITH THE SOFTWARE.
*
*/
#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see
// the code guarded by #ifdef _GNU_SOURCE (e.g. dladdr() in my_fname)
#include <assert.h>
#include <errno.h>
#include <sys/stat.h>
#include <unistd.h>
#include <dirent.h>
#include <glob.h>
#include <sys/stat.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <dlfcn.h>
#include <algorithm>
#include <cassert>
@@ -612,7 +617,8 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
std::tuple<bool, std::string, std::string, std::string, std::string,
std::string, std::string, std::string, std::string>
std::string, std::string, std::string, std::string,
std::string, std::string, std::string>
getSystemDetails(void) {
struct utsname buf;
bool errorDetected = false;
@@ -625,6 +631,9 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
std::string domainName = "<undefined>";
std::string os_distribution = "<undefined>";
std::string endianness = "<undefined>";
std::string rocm_lib_path = "<undefined>";
std::string rocm_build_type = "<undefined>";
std::string rocm_env_variables = "<undefined>";
if (uname(&buf) < 0) {
errorDetected = true;
@@ -659,9 +668,13 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
endianness = "Little Endian, multi-bit symbols encoded as"
" little endian (LSB first)";
}
rocm_build_type = getBuildType();
rocm_lib_path = getMyLibPath();
rocm_env_variables = RocmSMI::getInstance().getRSMIEnvVarInfo();
return std::make_tuple(errorDetected, sysname, nodename, release,
version, machine, domainName, os_distribution,
endianness);
endianness, rocm_build_type, rocm_lib_path,
rocm_env_variables);
}
// If logging is enabled through RSMI_LOGGING environment variable.
@@ -669,17 +682,12 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
void logSystemDetails(void) {
std::ostringstream ss;
bool errorDetected;
std::string sysname;
std::string node;
std::string release;
std::string version;
std::string machine;
std::string domain;
std::string distName;
std::string endianness;
std::string sysname, node, release, version, machine, domain, distName,
endianness, rocm_build_type, lib_path, rocm_env_vars;
std::tie(errorDetected, sysname, node, release, version, machine, domain,
distName, endianness) = getSystemDetails();
if (!errorDetected) {
distName, endianness, rocm_build_type, lib_path,
rocm_env_vars) = getSystemDetails();
if (errorDetected == false) {
ss << "====== Gathered system details ============\n"
<< "SYSTEM NAME: " << sysname << "\n"
<< "OS DISTRIBUTION: " << distName << "\n"
@@ -688,7 +696,10 @@ void logSystemDetails(void) {
<< "VERSION: " << version << "\n"
<< "MACHINE TYPE: " << machine << "\n"
<< "DOMAIN: " << domain << "\n"
<< "ENDIANNESS: " << endianness << "\n";
<< "ENDIANNESS: " << endianness << "\n"
<< "ROCM BUILD TYPE: " << rocm_build_type << "\n"
<< "ROCM-SMI-LIB PATH: " << lib_path << "\n"
<< "ROCM ENV VARIABLES: " << rocm_env_vars << "\n";
LOG_INFO(ss);
} else {
ss << "====== Gathered system details ============\n"
@@ -786,6 +797,36 @@ bool isSystemBigEndian() {
return isBigEndian;
}
// Reports how this translation unit was compiled: "debug" when the DEBUG
// macro is defined at compile time, "release" otherwise.
std::string getBuildType() {
#ifdef DEBUG
  std::string build_kind = "debug";
#else
  std::string build_kind = "release";
#endif
  return build_kind;
}
// Returns the path name of the loaded object (shared library or executable)
// that contains this function, as reported by dladdr().
//
// Returns an empty string - never a null or dangling pointer - when
// _GNU_SOURCE is not defined, when dladdr() cannot resolve the address, or
// when it reports no file name. A non-empty result points at storage owned by
// the dynamic loader, not by this function.
const char *my_fname(void) {
#ifdef _GNU_SOURCE
  Dl_info dl_info;
  // dladdr() returns 0 on failure, in which case dl_info must not be read.
  if (dladdr(reinterpret_cast<void *>(&my_fname), &dl_info) == 0 ||
      dl_info.dli_fname == nullptr) {
    return "";
  }
  return dl_info.dli_fname;
#else
  // A string literal has static storage, so the pointer stays valid after
  // return (unlike c_str() of a function-local std::string).
  return "";
#endif
}
std::string getMyLibPath(void) {
std::string libName = "rocm-smi-lib";
std::string path = std::string(my_fname());
if (path.empty()) {
path = "Could not find library path for " + libName;
}
return path;
}
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
{
auto result = rsmi_status_t::RSMI_STATUS_SUCCESS;
@@ -807,6 +848,35 @@ rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
return result;
}
// Counts the immediate sub-directories of `path`.
//
// The "." and ".." entries are never counted. An entry that cannot be
// stat'ed is reported through perror() and skipped.
//
// Returns the number of directory entries found, or -1 (after reporting via
// perror()) when `path` cannot be opened as a directory.
int subDirectoryCountInPath(const std::string path) {
  DIR *dir_handle = opendir(path.c_str());
  if (dir_handle == NULL) {
    perror("opendir");
    return -1;
  }

  int found = 0;
  for (struct dirent *entry = readdir(dir_handle); entry != NULL;
       entry = readdir(dir_handle)) {
    const char *name = entry->d_name;
    const bool is_self_or_parent =
        (strcmp(name, ".") == 0) || (strcmp(name, "..") == 0);
    if (is_self_or_parent) {
      continue;
    }

    // Resolve the entry relative to the already-open directory.
    struct stat info;
    if (fstatat(dirfd(dir_handle), name, &info, 0) < 0) {
      perror(name);
      continue;
    }

    found += S_ISDIR(info.st_mode) ? 1 : 0;
  }

  closedir(dir_handle);
  return found;
}
} // namespace smi
} // namespace amd
@@ -67,7 +67,8 @@ target_link_libraries(
PUBLIC GTest::gtest_main
PUBLIC c
PUBLIC stdc++
PUBLIC pthread)
PUBLIC pthread
PUBLIC dl)
install(TARGETS ${RSMITST} gtest gtest_main
DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests