Add GPU partition nodes
* Updates:
- Fixed infinite loop on systems
which did not have VRAM files
- Fixed concise info from throwing exception
with no amdgpu driver loaded
- Fixed ability to see all nodes
after switching partitions (mirrors
original card display/settings)
- Added to logs build type, lib path,
and set env. variables
Change-Id: Ic0333df355144ce2242cecea93fe4ce51caf311c
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/rocm_smi_lib commit: ed6777a8e7]
This commit is contained in:
@@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set<uint64_t> *gpu_count);
|
||||
int
|
||||
ReadKFDDeviceProperties(uint32_t dev_id, std::vector<std::string> *retVec);
|
||||
|
||||
int read_node_properties(uint32_t node, std::string property_name,
|
||||
uint64_t *val);
|
||||
int get_gpu_id(uint32_t node, uint64_t *gpu_id);
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -113,7 +113,8 @@ class RocmSMI {
|
||||
uint64_t *weight);
|
||||
int get_node_index(uint32_t dv_ind, uint32_t *node_ind);
|
||||
const RocmSMI_env_vars& getEnv(void);
|
||||
void printEnvVarInfo(void);
|
||||
std::string getRSMIEnvVarInfo(void);
|
||||
void debugRSMIEnvVarInfo();
|
||||
bool isLoggingOn(void);
|
||||
uint32_t getLogSetting(void);
|
||||
static const std::map<amd::smi::DevInfoTypes, std::string> devInfoTypesStrings;
|
||||
|
||||
@@ -99,13 +99,17 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type,
|
||||
rsmi_status_t ErrnoToRsmiStatus(int err);
|
||||
std::string getRSMIStatusString(rsmi_status_t ret);
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string, std::string>
|
||||
std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string>
|
||||
getSystemDetails(void);
|
||||
void logSystemDetails(void);
|
||||
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
|
||||
void logHexDump(const char *desc, const void *addr, const size_t len,
|
||||
size_t perLine);
|
||||
bool isSystemBigEndian();
|
||||
std::string getBuildType();
|
||||
std::string getMyLibPath();
|
||||
int subDirectoryCountInPath(const std::string path);
|
||||
template <typename T>
|
||||
std::string print_int_as_hex(T i, bool showHexNotation=true) {
|
||||
std::stringstream ss;
|
||||
|
||||
@@ -72,7 +72,7 @@ target_include_directories(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_INC_LIST})
|
||||
target_link_libraries(${OAM_EXAMPLE_EXE} ${OAM_TARGET})
|
||||
add_library(${OAM_TARGET} ${CMN_SRC_LIST} ${OAM_SRC_LIST}
|
||||
${CMN_INC_LIST} ${OAM_INC_LIST})
|
||||
target_link_libraries(${OAM_TARGET} pthread rt)
|
||||
target_link_libraries(${OAM_TARGET} pthread rt dl)
|
||||
target_include_directories(${OAM_TARGET} PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/include ${COMMON_PROJ_ROOT}/common/shared_mutex)
|
||||
|
||||
|
||||
@@ -1594,7 +1594,9 @@ def showAllConcise(deviceList):
|
||||
|
||||
printLogSpacer(' Concise Info ')
|
||||
deviceList.sort()
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
temp_type = '(' + temp_type_lst[0] + ')'
|
||||
if len(deviceList) >= 1:
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
available_temp_type = temp_type.lower()
|
||||
available_temp_type = available_temp_type.replace('(', '')
|
||||
available_temp_type = available_temp_type.replace(')', '')
|
||||
@@ -1843,7 +1845,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False):
|
||||
printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr))
|
||||
else:
|
||||
logging.debug('PCIe clock is unsupported on device[{}]'.format(device))
|
||||
printLogSpacer()
|
||||
if not concise:
|
||||
printLogSpacer()
|
||||
|
||||
|
||||
def showCurrentFans(deviceList):
|
||||
@@ -2786,7 +2789,9 @@ def getGraphColor(percentage):
|
||||
|
||||
def showTempGraph(deviceList):
|
||||
deviceList.sort()
|
||||
(temp_type, temp_value) = findFirstAvailableTemp(deviceList[0])
|
||||
temp_type = '(' + temp_type_lst[0] + ')'
|
||||
if len(deviceList) >= 1:
|
||||
(temp_type, _) = findFirstAvailableTemp(deviceList[0])
|
||||
printLogSpacer(' Temperature Graph ' + temp_type + ' ')
|
||||
temp_type = temp_type.lower()
|
||||
temp_type = temp_type.replace('(', '')
|
||||
|
||||
@@ -80,7 +80,7 @@ add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc")
|
||||
target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET})
|
||||
add_library(${ROCM_SMI_TARGET} ${CMN_SRC_LIST} ${SMI_SRC_LIST}
|
||||
${CMN_INC_LIST} ${SMI_INC_LIST})
|
||||
target_link_libraries(${ROCM_SMI_TARGET} pthread rt)
|
||||
target_link_libraries(${ROCM_SMI_TARGET} pthread rt dl)
|
||||
target_include_directories(${ROCM_SMI_TARGET} PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR} ${COMMON_PROJ_ROOT}/common/shared_mutex)
|
||||
|
||||
|
||||
@@ -2991,10 +2991,24 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
|
||||
if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) {
|
||||
GET_DEV_AND_KFDNODE_FROM_INDX
|
||||
if (kfd_node->get_total_memory(total) == 0 && *total > 0) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | inside success fallback... "
|
||||
<< " | Device #: " << std::to_string(dv_ind)
|
||||
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
|
||||
<< " | Data: total = " << std::to_string(*total)
|
||||
<< " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS);
|
||||
LOG_DEBUG(ss);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | after fallback... "
|
||||
<< " | Device #: " << std::to_string(dv_ind)
|
||||
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
|
||||
<< " | Data: total = " << std::to_string(*total)
|
||||
<< " | ret = " << getRSMIStatusString(ret);
|
||||
LOG_DEBUG(ss);
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
@@ -3036,11 +3050,36 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
|
||||
GET_DEV_AND_KFDNODE_FROM_INDX
|
||||
uint64_t total = 0;
|
||||
ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total);
|
||||
if (total != 0) return ret; // do not need to fallback
|
||||
if (total != 0) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " no fallback needed! - "
|
||||
<< " | Device #: " << std::to_string(dv_ind)
|
||||
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
|
||||
<< " | Data: Used = " << std::to_string(*used)
|
||||
<< " | Data: total = " << std::to_string(total)
|
||||
<< " | ret = " << getRSMIStatusString(ret);
|
||||
LOG_DEBUG(ss);
|
||||
return ret; // do not need to fallback
|
||||
}
|
||||
if ( kfd_node->get_used_memory(used) == 0 ) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | in fallback == success ..."
|
||||
<< " | Device #: " << std::to_string(dv_ind)
|
||||
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
|
||||
<< " | Data: Used = " << std::to_string(*used)
|
||||
<< " | Data: total = " << std::to_string(total)
|
||||
<< " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS);
|
||||
LOG_DEBUG(ss);
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | at end!!!! after fallback ..."
|
||||
<< " | Device #: " << std::to_string(dv_ind)
|
||||
<< " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file)
|
||||
<< " | Data: Used = " << std::to_string(*used)
|
||||
<< " | ret = " << getRSMIStatusString(ret);
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return ret;
|
||||
CATCH
|
||||
|
||||
@@ -63,6 +63,7 @@
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_logger.h"
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
@@ -775,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth,
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties
|
||||
// size_in_bytes 68702699520
|
||||
int KFDNode::get_total_memory(uint64_t* total) {
|
||||
if (total == nullptr) return EINVAL;
|
||||
std::ostringstream ss;
|
||||
if (total == nullptr) {
|
||||
return EINVAL;
|
||||
}
|
||||
*total = 0;
|
||||
|
||||
std::string f_path = kKFDNodesPathRoot;
|
||||
f_path += "/";
|
||||
f_path += std::to_string(node_indx_);
|
||||
f_path += "/mem_banks";
|
||||
int subDirCount = subDirectoryCountInPath(f_path);
|
||||
ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path
|
||||
<< " has subdirectory count = " << std::to_string(subDirCount);
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
auto kfd_node_dir = opendir(f_path.c_str());
|
||||
if (kfd_node_dir == nullptr) {
|
||||
return errno;
|
||||
}
|
||||
auto dentry = readdir(kfd_node_dir);
|
||||
while (dentry != nullptr) {
|
||||
while (dentry != nullptr && subDirCount > 0) {
|
||||
ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path
|
||||
<< " has subdirectory count = " << std::to_string(subDirCount);
|
||||
LOG_DEBUG(ss);
|
||||
if (dentry->d_name[0] == '.') {
|
||||
dentry = readdir(kfd_node_dir);
|
||||
continue;
|
||||
@@ -822,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) {
|
||||
}
|
||||
}
|
||||
} // end loop for lines in property file
|
||||
subDirCount--;
|
||||
} // end loop for mem_bank directory
|
||||
|
||||
if (closedir(kfd_node_dir)) {
|
||||
@@ -862,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/properties
|
||||
int read_node_properties(uint32_t node, std::string property_name,
|
||||
uint64_t *val) {
|
||||
std::ostringstream ss;
|
||||
int retVal = EINVAL;
|
||||
if (property_name.empty() || val == nullptr) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", property_name is empty or *val is nullptr "
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return retVal;
|
||||
}
|
||||
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
|
||||
myNode->Initialize();
|
||||
if (KFDNodeSupported(node)) {
|
||||
retVal = myNode->get_property_value(property_name, val);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Successfully read node #" << std::to_string(node)
|
||||
<< " for property_name = " << property_name
|
||||
<< " | Data (" << property_name << ") * val = "
|
||||
<< std::to_string(*val)
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
} else {
|
||||
retVal = 1;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", KFD node was an unsupported node."
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_ERROR(ss);
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/gpu_id
|
||||
int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
|
||||
std::ostringstream ss;
|
||||
int retVal = EINVAL;
|
||||
if (gpu_id == nullptr) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", gpu_id is a nullptr "
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return retVal;
|
||||
}
|
||||
std::shared_ptr<KFDNode> myNode = std::shared_ptr<KFDNode>(new KFDNode(node));
|
||||
myNode->Initialize();
|
||||
if (KFDNodeSupported(node)) {
|
||||
retVal = ReadKFDGpuId(node, gpu_id);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Successfully read node #" << std::to_string(node)
|
||||
<< " for gpu_id"
|
||||
<< " | Data (gpu_id) *gpu_id = "
|
||||
<< std::to_string(*gpu_id)
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
} else {
|
||||
retVal = 1;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not read node #" << std::to_string(node)
|
||||
<< ", KFD node was an unsupported node."
|
||||
<< " | return = " << std::to_string(retVal)
|
||||
<< " | ";
|
||||
LOG_ERROR(ss);
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -333,7 +333,7 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
|
||||
GetEnvVariables();
|
||||
// To help debug env variable issues
|
||||
// printEnvVarInfo();
|
||||
// debugRSMIEnvVarInfo();
|
||||
|
||||
while (!std::string(kAMDMonitorTypes[i]).empty()) {
|
||||
amd_monitor_types_.insert(kAMDMonitorTypes[i]);
|
||||
@@ -390,7 +390,7 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
uint64_t bdfid = (*dev_iter)->bdfid();
|
||||
if (tmp_map.find(bdfid) == tmp_map.end()) {
|
||||
ss << __PRETTY_FUNCTION__ << " | removing device = "
|
||||
<< (*dev_iter)->path();
|
||||
<< (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid);
|
||||
dev_iter = devices_.erase(dev_iter);
|
||||
LOG_DEBUG(ss);
|
||||
continue;
|
||||
@@ -549,48 +549,54 @@ uint32_t RocmSMI::getLogSetting() {
|
||||
return this->env_vars_.logging_on;
|
||||
}
|
||||
|
||||
void RocmSMI::printEnvVarInfo(void) {
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = "
|
||||
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_output_bitfield))
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = "
|
||||
<< ((env_vars_.path_DRM_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_DRM_root_override)
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = "
|
||||
<< ((env_vars_.path_HWMon_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_HWMon_root_override)
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = "
|
||||
<< ((env_vars_.path_power_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_power_root_override)
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = "
|
||||
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_inf_loop))
|
||||
<< std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
// Debug helper: writes this function's signature followed by the formatted
// RSMI environment-variable report to stdout.
void RocmSMI::debugRSMIEnvVarInfo(void) {
  std::ostream &out = std::cout;
  // The signature is emitted before the report is generated, keeping the
  // same output ordering as a single chained insertion.
  out << __PRETTY_FUNCTION__;
  out << RocmSMI::getInstance().getRSMIEnvVarInfo();
}
|
||||
|
||||
std::string RocmSMI::getRSMIEnvVarInfo(void) {
|
||||
std::ostringstream ss;
|
||||
ss << "\n\tRSMI_DEBUG_BITFIELD = "
|
||||
<< ((env_vars_.debug_output_bitfield == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_output_bitfield))
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = "
|
||||
<< ((env_vars_.path_DRM_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_DRM_root_override)
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = "
|
||||
<< ((env_vars_.path_HWMon_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_HWMon_root_override)
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = "
|
||||
<< ((env_vars_.path_power_root_override == nullptr)
|
||||
? "<undefined>" : env_vars_.path_power_root_override)
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_DEBUG_INFINITE_LOOP = "
|
||||
<< ((env_vars_.debug_inf_loop == 0) ? "<undefined>"
|
||||
: std::to_string(env_vars_.debug_inf_loop))
|
||||
<< std::endl;
|
||||
ss << "\tRSMI_LOGGING = "
|
||||
<< getLogSetting() << std::endl;
|
||||
bool isLoggingOn = RocmSMI::isLoggingOn();
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = "
|
||||
<< (isLoggingOn ? "true" : "false") << std::endl;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {";
|
||||
bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false;
|
||||
ss << "\tRSMI_LOGGING (are logs on) = "
|
||||
<< (isLoggingOn ? "TRUE" : "FALSE") << std::endl;
|
||||
ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {";
|
||||
if (env_vars_.enum_overrides.empty()) {
|
||||
std::cout << "}" << std::endl;
|
||||
return;
|
||||
ss << "}" << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
for (auto it=env_vars_.enum_overrides.begin();
|
||||
it != env_vars_.enum_overrides.end(); ++it) {
|
||||
DevInfoTypes type = static_cast<DevInfoTypes>(*it);
|
||||
std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type)
|
||||
+ ")");
|
||||
ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")");
|
||||
auto temp_it = it;
|
||||
if(++temp_it != env_vars_.enum_overrides.end()) {
|
||||
std::cout << ", ";
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
std::cout << "}" << std::endl;
|
||||
ss << "}" << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::shared_ptr<Monitor>
|
||||
@@ -692,8 +698,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
std::string vend_path = dev_path + "/device/vendor";
|
||||
if (!FileExists(vend_path.c_str())) {
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
@@ -703,8 +708,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
|
||||
if (!fs.is_open()) {
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
@@ -719,8 +723,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
isAmdGpu = true;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path
|
||||
<< " is " << (isAmdGpu ? "is an amdgpu device - TRUE":
|
||||
"is an amdgpu device - FALSE");
|
||||
<< " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE");
|
||||
LOG_DEBUG(ss);
|
||||
return isAmdGpu;
|
||||
}
|
||||
@@ -728,6 +731,7 @@ static bool isAMDGPU(std::string dev_path) {
|
||||
uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
std::string err_msg;
|
||||
uint32_t count = 0;
|
||||
std::ostringstream ss;
|
||||
|
||||
// If this gets called more than once, clear previous findings.
|
||||
devices_.clear();
|
||||
@@ -754,17 +758,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
|
||||
}
|
||||
dentry = readdir(drm_dir);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
|
||||
<< std::to_string(count) << " cards" << " | ";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
struct systemNode {
|
||||
uint32_t s_node_id = 0;
|
||||
uint64_t s_gpu_id = 0;
|
||||
uint64_t s_unique_id = 0;
|
||||
};
|
||||
// allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id}
|
||||
std::multimap<uint64_t, systemNode> allSystemNodes;
|
||||
uint32_t node_id = 0;
|
||||
while (true) {
|
||||
uint64_t gpu_id = 0, unique_id = 0;
|
||||
int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
|
||||
int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
|
||||
if (ret_gpu_id == 0 || ret_unique_id == 0) {
|
||||
systemNode myNode;
|
||||
myNode.s_node_id = node_id;
|
||||
myNode.s_gpu_id = gpu_id;
|
||||
myNode.s_unique_id = unique_id;
|
||||
if(gpu_id != 0) { // only add gpu nodes, 0 = CPU
|
||||
allSystemNodes.emplace(unique_id, myNode);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
node_id++;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {";
|
||||
for(auto i: allSystemNodes) {
|
||||
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
|
||||
<< "], "
|
||||
;
|
||||
}
|
||||
ss << "}";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
// Discover all root cards & gpu partitions associated with each
|
||||
for (uint32_t node_id = 0; node_id < count; node_id++) {
|
||||
std::string path = kPathDRMRoot;
|
||||
path += "/card";
|
||||
path += std::to_string(node_id);
|
||||
uint64_t primary_unique_id = 0;
|
||||
|
||||
// each identified gpu card node is a primary node for
|
||||
// potential matching unique ids
|
||||
if (isAMDGPU(path) ||
|
||||
(init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) {
|
||||
std::string d_name = "card";
|
||||
d_name += std::to_string(node_id);
|
||||
AddToDeviceList(d_name);
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Ordered system nodes seen in lookup = {";
|
||||
for (auto i : allSystemNodes) {
|
||||
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
|
||||
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
|
||||
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
|
||||
<< "], ";
|
||||
}
|
||||
ss << "}";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
uint64_t temp_primary_unique_id = 0;
|
||||
if (allSystemNodes.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get lowest key 1st to keep order of nodes matching card
|
||||
uint32_t lowest_NodeId = 0;
|
||||
uint32_t curr_NodeId = 0;
|
||||
|
||||
for (auto it = allSystemNodes.begin(), end = allSystemNodes.end();
|
||||
it != end; it = allSystemNodes.upper_bound(it->first)) {
|
||||
curr_NodeId = it->second.s_node_id;
|
||||
if (it == allSystemNodes.begin()) {
|
||||
lowest_NodeId = it->second.s_node_id;
|
||||
}
|
||||
if (curr_NodeId <= lowest_NodeId) {
|
||||
lowest_NodeId = curr_NodeId;
|
||||
temp_primary_unique_id = it->second.s_unique_id;
|
||||
}
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | lowest_NodeId = " << std::to_string(lowest_NodeId)
|
||||
<< " | curr_NodeId = " << std::to_string(curr_NodeId)
|
||||
<< " | temp_primary_unique_id = "
|
||||
<< std::to_string(temp_primary_unique_id);
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
if (temp_primary_unique_id != 0) {
|
||||
primary_unique_id = temp_primary_unique_id;
|
||||
} else {
|
||||
allSystemNodes.erase(primary_unique_id);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto numb_nodes = allSystemNodes.count(primary_unique_id);
|
||||
ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = "
|
||||
<< std::to_string(primary_unique_id) << " has "
|
||||
<< std::to_string(numb_nodes) << " known gpu nodes";
|
||||
LOG_DEBUG(ss);
|
||||
while (numb_nodes > 1) {
|
||||
std::string secNode = "card";
|
||||
secNode += std::to_string(node_id); // add the primary node id
|
||||
AddToDeviceList(secNode);
|
||||
numb_nodes--;
|
||||
}
|
||||
// remove already added nodes associated with current card
|
||||
auto erasedNodes = allSystemNodes.erase(primary_unique_id);
|
||||
ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = "
|
||||
<< std::to_string(primary_unique_id) << " erased "
|
||||
<< std::to_string(erasedNodes) << " nodes";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
}
|
||||
|
||||
if (closedir(drm_dir)) {
|
||||
|
||||
@@ -40,12 +40,17 @@
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see
|
||||
// _GNU_SOURCE functions which check
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <dirent.h>
|
||||
#include <glob.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/utsname.h>
|
||||
#include <unistd.h>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
@@ -612,7 +617,8 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
|
||||
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
|
||||
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string, std::string>
|
||||
std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string>
|
||||
getSystemDetails(void) {
|
||||
struct utsname buf;
|
||||
bool errorDetected = false;
|
||||
@@ -625,6 +631,9 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string domainName = "<undefined>";
|
||||
std::string os_distribution = "<undefined>";
|
||||
std::string endianness = "<undefined>";
|
||||
std::string rocm_lib_path = "<undefined>";
|
||||
std::string rocm_build_type = "<undefined>";
|
||||
std::string rocm_env_variables = "<undefined>";
|
||||
|
||||
if (uname(&buf) < 0) {
|
||||
errorDetected = true;
|
||||
@@ -659,9 +668,13 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
endianness = "Little Endian, multi-bit symbols encoded as"
|
||||
" little endian (LSB first)";
|
||||
}
|
||||
rocm_build_type = getBuildType();
|
||||
rocm_lib_path = getMyLibPath();
|
||||
rocm_env_variables = RocmSMI::getInstance().getRSMIEnvVarInfo();
|
||||
return std::make_tuple(errorDetected, sysname, nodename, release,
|
||||
version, machine, domainName, os_distribution,
|
||||
endianness);
|
||||
endianness, rocm_build_type, rocm_lib_path,
|
||||
rocm_env_variables);
|
||||
}
|
||||
|
||||
// If logging is enabled through RSMI_LOGGING environment variable.
|
||||
@@ -669,17 +682,12 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
void logSystemDetails(void) {
|
||||
std::ostringstream ss;
|
||||
bool errorDetected;
|
||||
std::string sysname;
|
||||
std::string node;
|
||||
std::string release;
|
||||
std::string version;
|
||||
std::string machine;
|
||||
std::string domain;
|
||||
std::string distName;
|
||||
std::string endianness;
|
||||
std::string sysname, node, release, version, machine, domain, distName,
|
||||
endianness, rocm_build_type, lib_path, rocm_env_vars;
|
||||
std::tie(errorDetected, sysname, node, release, version, machine, domain,
|
||||
distName, endianness) = getSystemDetails();
|
||||
if (!errorDetected) {
|
||||
distName, endianness, rocm_build_type, lib_path,
|
||||
rocm_env_vars) = getSystemDetails();
|
||||
if (errorDetected == false) {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
<< "SYSTEM NAME: " << sysname << "\n"
|
||||
<< "OS DISTRIBUTION: " << distName << "\n"
|
||||
@@ -688,7 +696,10 @@ void logSystemDetails(void) {
|
||||
<< "VERSION: " << version << "\n"
|
||||
<< "MACHINE TYPE: " << machine << "\n"
|
||||
<< "DOMAIN: " << domain << "\n"
|
||||
<< "ENDIANNESS: " << endianness << "\n";
|
||||
<< "ENDIANNESS: " << endianness << "\n"
|
||||
<< "ROCM BUILD TYPE: " << rocm_build_type << "\n"
|
||||
<< "ROCM-SMI-LIB PATH: " << lib_path << "\n"
|
||||
<< "ROCM ENV VARIABLES: " << rocm_env_vars << "\n";
|
||||
LOG_INFO(ss);
|
||||
} else {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
@@ -786,6 +797,36 @@ bool isSystemBigEndian() {
|
||||
return isBigEndian;
|
||||
}
|
||||
|
||||
// Reports which configuration this translation unit was compiled in:
// "debug" when the DEBUG macro is defined, "release" otherwise.
std::string getBuildType() {
#ifdef DEBUG
  return std::string("debug");
#else
  return std::string("release");
#endif
}
|
||||
|
||||
// Returns the path of the object file (shared library or executable) that
// contains this function, as reported by dladdr().
//
// Returns an empty string when the path cannot be determined; this function
// never returns nullptr and never returns a pointer to a local object.
const char *my_fname(void) {
#ifdef _GNU_SOURCE
  Dl_info dl_info;
  // dladdr() returns 0 on failure, in which case dl_info must not be read.
  if (dladdr(reinterpret_cast<void *>(&my_fname), &dl_info) == 0 ||
      dl_info.dli_fname == nullptr) {
    return "";
  }
  return dl_info.dli_fname;
#else
  // A string literal has static storage duration, so the returned pointer
  // stays valid after this function returns.
  return "";
#endif
}
|
||||
|
||||
std::string getMyLibPath(void) {
|
||||
std::string libName = "rocm-smi-lib";
|
||||
std::string path = std::string(my_fname());
|
||||
if (path.empty()) {
|
||||
path = "Could not find library path for " + libName;
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
|
||||
{
|
||||
auto result = rsmi_status_t::RSMI_STATUS_SUCCESS;
|
||||
@@ -807,6 +848,35 @@ rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
|
||||
return result;
|
||||
}
|
||||
|
||||
// Counts the immediate subdirectories of `path`, skipping "." and "..".
//
// Returns the number of entries for which fstatat() reports a directory, or
// -1 (after reporting via perror) when `path` cannot be opened. Entries that
// cannot be stat'ed are reported via perror and skipped.
int subDirectoryCountInPath(const std::string path) {
  DIR *dir_handle = opendir(path.c_str());
  if (dir_handle == nullptr) {
    perror("opendir");
    return -1;
  }

  int num_dirs = 0;
  for (struct dirent *entry = readdir(dir_handle); entry != nullptr;
       entry = readdir(dir_handle)) {
    const std::string entry_name(entry->d_name);
    if (entry_name == "." || entry_name == "..") {
      continue;
    }

    struct stat info;
    if (fstatat(dirfd(dir_handle), entry->d_name, &info, 0) < 0) {
      perror(entry->d_name);
      continue;
    }

    num_dirs += S_ISDIR(info.st_mode) ? 1 : 0;
  }

  closedir(dir_handle);
  return num_dirs;
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -67,7 +67,8 @@ target_link_libraries(
|
||||
PUBLIC GTest::gtest_main
|
||||
PUBLIC c
|
||||
PUBLIC stdc++
|
||||
PUBLIC pthread)
|
||||
PUBLIC pthread
|
||||
PUBLIC dl)
|
||||
|
||||
install(TARGETS ${RSMITST} gtest gtest_main
|
||||
DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests
|
||||
|
||||
Reference in New Issue
Block a user