diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h index 9cf8fd8e40..90c7f6ff3b 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h @@ -118,6 +118,10 @@ GetProcessGPUs(uint32_t pid, std::unordered_set *gpu_count); int ReadKFDDeviceProperties(uint32_t dev_id, std::vector *retVec); +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val); +int get_gpu_id(uint32_t node, uint64_t *gpu_id); + } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index f276bd85bb..8b60324988 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -113,7 +113,8 @@ class RocmSMI { uint64_t *weight); int get_node_index(uint32_t dv_ind, uint32_t *node_ind); const RocmSMI_env_vars& getEnv(void); - void printEnvVarInfo(void); + std::string getRSMIEnvVarInfo(void); + void debugRSMIEnvVarInfo(); bool isLoggingOn(void); uint32_t getLogSetting(void); static const std::map devInfoTypesStrings; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h index 5ba813a273..49a3521dc1 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h @@ -99,13 +99,17 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type, rsmi_status_t ErrnoToRsmiStatus(int err); std::string getRSMIStatusString(rsmi_status_t ret); std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void); void logSystemDetails(void); rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); void logHexDump(const char *desc, const void *addr, const size_t len, size_t perLine); bool isSystemBigEndian(); +std::string getBuildType(); +std::string getMyLibPath(); +int subDirectoryCountInPath(const std::string path); template std::string print_int_as_hex(T i, bool showHexNotation=true) { std::stringstream ss; diff --git a/projects/rocm-smi-lib/oam/CMakeLists.txt b/projects/rocm-smi-lib/oam/CMakeLists.txt index dc674b0bb0..6927d245e6 100644 --- a/projects/rocm-smi-lib/oam/CMakeLists.txt +++ b/projects/rocm-smi-lib/oam/CMakeLists.txt @@ -72,7 +72,7 @@ target_include_directories(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_INC_LIST}) target_link_libraries(${OAM_EXAMPLE_EXE} ${OAM_TARGET}) add_library(${OAM_TARGET} ${CMN_SRC_LIST} ${OAM_SRC_LIST} ${CMN_INC_LIST} ${OAM_INC_LIST}) -target_link_libraries(${OAM_TARGET} pthread rt) +target_link_libraries(${OAM_TARGET} pthread rt dl) target_include_directories(${OAM_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include ${COMMON_PROJ_ROOT}/common/shared_mutex) diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index d0bb5ab365..1d4c7e69a1 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -1594,7 +1594,9 @@ def showAllConcise(deviceList): printLogSpacer(' Concise Info ') deviceList.sort() - (temp_type, _) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) available_temp_type = temp_type.lower() available_temp_type = available_temp_type.replace('(', '') available_temp_type = available_temp_type.replace(')', '') @@ -1843,7 +1845,8 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): printLog(device, 'pcie clock level', '{} ({})'.format(current_f, fr)) else: logging.debug('PCIe clock is unsupported on device[{}]'.format(device)) - printLogSpacer() + if not concise: + printLogSpacer() def showCurrentFans(deviceList): @@ -2786,7 +2789,9 @@ def getGraphColor(percentage): def showTempGraph(deviceList): deviceList.sort() - (temp_type, temp_value) = findFirstAvailableTemp(deviceList[0]) + temp_type = '(' + temp_type_lst[0] + ')' + if len(deviceList) >= 1: + (temp_type, _) = findFirstAvailableTemp(deviceList[0]) printLogSpacer(' Temperature Graph ' + temp_type + ' ') temp_type = temp_type.lower() temp_type = temp_type.replace('(', '') diff --git a/projects/rocm-smi-lib/rocm_smi/CMakeLists.txt b/projects/rocm-smi-lib/rocm_smi/CMakeLists.txt index ae8c017933..08b2599542 100755 --- a/projects/rocm-smi-lib/rocm_smi/CMakeLists.txt +++ b/projects/rocm-smi-lib/rocm_smi/CMakeLists.txt @@ -80,7 +80,7 @@ add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc") target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) add_library(${ROCM_SMI_TARGET} ${CMN_SRC_LIST} ${SMI_SRC_LIST} ${CMN_INC_LIST} ${SMI_INC_LIST}) -target_link_libraries(${ROCM_SMI_TARGET} pthread rt) +target_link_libraries(${ROCM_SMI_TARGET} pthread rt dl) target_include_directories(${ROCM_SMI_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${COMMON_PROJ_ROOT}/common/shared_mutex) diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 506e784206..d8bd892ac8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -2991,10 +2991,24 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) { GET_DEV_AND_KFDNODE_FROM_INDX if (kfd_node->get_total_memory(total) == 0 && *total > 0) { + ss << __PRETTY_FUNCTION__ + << " | inside success fallback... " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | after fallback... " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: total = " << std::to_string(*total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH } @@ -3036,11 +3050,36 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, GET_DEV_AND_KFDNODE_FROM_INDX uint64_t total = 0; ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total); - if (total != 0) return ret; // do not need to fallback + if (total != 0) { + ss << __PRETTY_FUNCTION__ + << " no fallback needed! - " + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); + return ret; // do not need to fallback + } if ( kfd_node->get_used_memory(used) == 0 ) { + ss << __PRETTY_FUNCTION__ + << " | in fallback == success ..." + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | Data: total = " << std::to_string(total) + << " | ret = " << getRSMIStatusString(RSMI_STATUS_SUCCESS); + LOG_DEBUG(ss); return RSMI_STATUS_SUCCESS; } } + ss << __PRETTY_FUNCTION__ + << " | at end!!!! after fallback ..." + << " | Device #: " << std::to_string(dv_ind) + << " | Type = " << RocmSMI::devInfoTypesStrings.at(mem_type_file) + << " | Data: Used = " << std::to_string(*used) + << " | ret = " << getRSMIStatusString(ret); + LOG_DEBUG(ss); return ret; CATCH diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index afe567d80d..7fe9004cc3 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -63,6 +63,7 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_logger.h" namespace amd { namespace smi { @@ -775,20 +776,30 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, // /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties // size_in_bytes 68702699520 int KFDNode::get_total_memory(uint64_t* total) { - if (total == nullptr) return EINVAL; + std::ostringstream ss; + if (total == nullptr) { + return EINVAL; + } *total = 0; std::string f_path = kKFDNodesPathRoot; f_path += "/"; f_path += std::to_string(node_indx_); f_path += "/mem_banks"; + int subDirCount = subDirectoryCountInPath(f_path); + ss << __PRETTY_FUNCTION__ << " | [before loop] Within " << f_path + << " has subdirectory count = " << std::to_string(subDirCount); + LOG_DEBUG(ss); auto kfd_node_dir = opendir(f_path.c_str()); if (kfd_node_dir == nullptr) { return errno; } auto dentry = readdir(kfd_node_dir); - while (dentry != nullptr) { + while (dentry != nullptr && subDirCount > 0) { + ss << __PRETTY_FUNCTION__ << " | [inside loop] Within " << f_path + << " has subdirectory count = " << std::to_string(subDirCount); + LOG_DEBUG(ss); if (dentry->d_name[0] == '.') { dentry = readdir(kfd_node_dir); continue; @@ -822,6 +833,7 @@ int KFDNode::get_total_memory(uint64_t* total) { } } } // end loop for lines in property file + subDirCount--; } // end loop for mem_bank directory if (closedir(kfd_node_dir)) { @@ -862,5 +874,80 @@ int KFDNode::get_used_memory(uint64_t* used) { return 1; } +// /sys/class/kfd/kfd/topology/nodes/*/properties +int read_node_properties(uint32_t node, std::string property_name, + uint64_t *val) { + std::ostringstream ss; + int retVal = EINVAL; + if (property_name.empty() || val == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", property_name is empty or *val is nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if (KFDNodeSupported(node)) { + retVal = myNode->get_property_value(property_name, val); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for property_name = " << property_name + << " | Data (" << property_name << ") * val = " + << std::to_string(*val) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." + << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + +// /sys/class/kfd/kfd/topology/nodes/*/gpu_id +int get_gpu_id(uint32_t node, uint64_t *gpu_id) { + std::ostringstream ss; + int retVal = EINVAL; + if (gpu_id == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", gpu_id is a nullptr " + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + return retVal; + } + std::shared_ptr myNode = std::shared_ptr(new KFDNode(node)); + myNode->Initialize(); + if (KFDNodeSupported(node)) { + retVal = ReadKFDGpuId(node, gpu_id); + ss << __PRETTY_FUNCTION__ + << " | Successfully read node #" << std::to_string(node) + << " for gpu_id" + << " | Data (gpu_id) *gpu_id = " + << std::to_string(*gpu_id) + << " | return = " << std::to_string(retVal) + << " | "; + LOG_DEBUG(ss); + } else { + retVal = 1; + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not read node #" << std::to_string(node) + << ", KFD node was an unsupported node." + << " | return = " << std::to_string(retVal) + << " | "; + LOG_ERROR(ss); + } + return retVal; +} + } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 1eb973c17e..831e382b93 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -333,7 +333,7 @@ RocmSMI::Initialize(uint64_t flags) { GetEnvVariables(); // To help debug env variable issues - // printEnvVarInfo(); + // debugRSMIEnvVarInfo(); while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); @@ -390,7 +390,7 @@ RocmSMI::Initialize(uint64_t flags) { uint64_t bdfid = (*dev_iter)->bdfid(); if (tmp_map.find(bdfid) == tmp_map.end()) { ss << __PRETTY_FUNCTION__ << " | removing device = " - << (*dev_iter)->path(); + << (*dev_iter)->path() << "; bdfid = " << std::to_string(bdfid); dev_iter = devices_.erase(dev_iter); LOG_DEBUG(ss); continue; @@ -549,48 +549,54 @@ uint32_t RocmSMI::getLogSetting() { return this->env_vars_.logging_on; } -void RocmSMI::printEnvVarInfo(void) { - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_output_bitfield = " - << ((env_vars_.debug_output_bitfield == 0) ? "" - : std::to_string(env_vars_.debug_output_bitfield)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_DRM_root_override = " - << ((env_vars_.path_DRM_root_override == nullptr) - ? "" : env_vars_.path_DRM_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_HWMon_root_override = " - << ((env_vars_.path_HWMon_root_override == nullptr) - ? "" : env_vars_.path_HWMon_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.path_power_root_override = " - << ((env_vars_.path_power_root_override == nullptr) - ? "" : env_vars_.path_power_root_override) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.debug_inf_loop = " - << ((env_vars_.debug_inf_loop == 0) ? "" - : std::to_string(env_vars_.debug_inf_loop)) - << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " +void RocmSMI::debugRSMIEnvVarInfo(void) { + std::cout << __PRETTY_FUNCTION__ + << RocmSMI::getInstance().getRSMIEnvVarInfo(); +} + +std::string RocmSMI::getRSMIEnvVarInfo(void) { + std::ostringstream ss; + ss << "\n\tRSMI_DEBUG_BITFIELD = " + << ((env_vars_.debug_output_bitfield == 0) ? "" + : std::to_string(env_vars_.debug_output_bitfield)) + << std::endl; + ss << "\tRSMI_DEBUG_DRM_ROOT_OVERRIDE = " + << ((env_vars_.path_DRM_root_override == nullptr) + ? "" : env_vars_.path_DRM_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_HWMON_ROOT_OVERRIDE = " + << ((env_vars_.path_HWMon_root_override == nullptr) + ? "" : env_vars_.path_HWMon_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_PP_ROOT_OVERRIDE = " + << ((env_vars_.path_power_root_override == nullptr) + ? "" : env_vars_.path_power_root_override) + << std::endl; + ss << "\tRSMI_DEBUG_INFINITE_LOOP = " + << ((env_vars_.debug_inf_loop == 0) ? "" + : std::to_string(env_vars_.debug_inf_loop)) + << std::endl; + ss << "\tRSMI_LOGGING = " << getLogSetting() << std::endl; - bool isLoggingOn = RocmSMI::isLoggingOn(); - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " - << (isLoggingOn ? "true" : "false") << std::endl; - std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; + bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; + ss << "\tRSMI_LOGGING (are logs on) = " + << (isLoggingOn ? "TRUE" : "FALSE") << std::endl; + ss << "\tRSMI_DEBUG_ENUM_OVERRIDE = {"; if (env_vars_.enum_overrides.empty()) { - std::cout << "}" << std::endl; - return; + ss << "}" << std::endl; + return ss.str(); } for (auto it=env_vars_.enum_overrides.begin(); it != env_vars_.enum_overrides.end(); ++it) { DevInfoTypes type = static_cast(*it); - std::cout << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) - + ")"); + ss << (std::to_string(*it) + " (" + devInfoTypesStrings.at(type) + ")"); auto temp_it = it; if(++temp_it != env_vars_.enum_overrides.end()) { - std::cout << ", "; + ss << ", "; } } - std::cout << "}" << std::endl; + ss << "}" << std::endl; + return ss.str(); } std::shared_ptr @@ -692,8 +698,7 @@ static bool isAMDGPU(std::string dev_path) { std::string vend_path = dev_path + "/device/vendor"; if (!FileExists(vend_path.c_str())) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -703,8 +708,7 @@ static bool isAMDGPU(std::string dev_path) { if (!fs.is_open()) { ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -719,8 +723,7 @@ static bool isAMDGPU(std::string dev_path) { isAmdGpu = true; } ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path - << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": - "is an amdgpu device - FALSE"); + << " is an amdgpu device - " << (isAmdGpu ? "TRUE": " FALSE"); LOG_DEBUG(ss); return isAmdGpu; } @@ -728,6 +731,7 @@ static bool isAMDGPU(std::string dev_path) { uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { std::string err_msg; uint32_t count = 0; + std::ostringstream ss; // If this gets called more than once, clear previous findings. devices_.clear(); @@ -754,17 +758,125 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { } dentry = readdir(drm_dir); } + ss << __PRETTY_FUNCTION__ << " | Discovered a potential of " + << std::to_string(count) << " cards" << " | "; + LOG_DEBUG(ss); + struct systemNode { + uint32_t s_node_id = 0; + uint64_t s_gpu_id = 0; + uint64_t s_unique_id = 0; + }; + // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id} + std::multimap allSystemNodes; + uint32_t node_id = 0; + while (true) { + uint64_t gpu_id = 0, unique_id = 0; + int ret_gpu_id = get_gpu_id(node_id, &gpu_id); + int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); + if (ret_gpu_id == 0 || ret_unique_id == 0) { + systemNode myNode; + myNode.s_node_id = node_id; + myNode.s_gpu_id = gpu_id; + myNode.s_unique_id = unique_id; + if(gpu_id != 0) { // only add gpu nodes, 0 = CPU + allSystemNodes.emplace(unique_id, myNode); + } + } else { + break; + } + node_id++; + } + + ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {"; + for(auto i: allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], " + ; + } + ss << "}"; + LOG_DEBUG(ss); + + // Discover all root cards & gpu partitions associated with each for (uint32_t node_id = 0; node_id < count; node_id++) { std::string path = kPathDRMRoot; path += "/card"; path += std::to_string(node_id); + uint64_t primary_unique_id = 0; + + // each identified gpu card node is a primary node for + // potential matching unique ids if (isAMDGPU(path) || (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { std::string d_name = "card"; d_name += std::to_string(node_id); AddToDeviceList(d_name); - } + + ss << __PRETTY_FUNCTION__ + << " | Ordered system nodes seen in lookup = {"; + for (auto i : allSystemNodes) { + ss << "\n[node_id = " << std::to_string(i.second.s_node_id) + << "; gpu_id = " << std::to_string(i.second.s_gpu_id) + << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "], "; + } + ss << "}"; + LOG_DEBUG(ss); + + uint64_t temp_primary_unique_id = 0; + if (allSystemNodes.empty()) { + continue; + } + + // get lowest key 1st to keep order of nodes matching card + uint32_t lowest_NodeId = 0; + uint32_t curr_NodeId = 0; + + for (auto it = allSystemNodes.begin(), end = allSystemNodes.end(); + it != end; it = allSystemNodes.upper_bound(it->first)) { + curr_NodeId = it->second.s_node_id; + if (it == allSystemNodes.begin()) { + lowest_NodeId = it->second.s_node_id; + } + if (curr_NodeId <= lowest_NodeId) { + lowest_NodeId = curr_NodeId; + temp_primary_unique_id = it->second.s_unique_id; + } + } + ss << __PRETTY_FUNCTION__ + << " | lowest_NodeId = " << std::to_string(lowest_NodeId) + << " | curr_NodeId = " << std::to_string(curr_NodeId) + << " | temp_primary_unique_id = " + << std::to_string(temp_primary_unique_id); + LOG_DEBUG(ss); + + if (temp_primary_unique_id != 0) { + primary_unique_id = temp_primary_unique_id; + } else { + allSystemNodes.erase(primary_unique_id); + continue; + } + + auto numb_nodes = allSystemNodes.count(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = " + << std::to_string(primary_unique_id) << " has " + << std::to_string(numb_nodes) << " known gpu nodes"; + LOG_DEBUG(ss); + while (numb_nodes > 1) { + std::string secNode = "card"; + secNode += std::to_string(node_id); // add the primary node id + AddToDeviceList(secNode); + numb_nodes--; + } + // remove already added nodes associated with current card + auto erasedNodes = allSystemNodes.erase(primary_unique_id); + ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = " + << std::to_string(primary_unique_id) << " erased " + << std::to_string(erasedNodes) << " nodes"; + LOG_DEBUG(ss); + } } if (closedir(drm_dir)) { diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 973d555d26..1e9a444320 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -40,12 +40,17 @@ * DEALINGS WITH THE SOFTWARE. * */ - +#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see + // _GNU_SOURCE functions which check +#include +#include +#include +#include #include #include #include #include -#include +#include #include #include @@ -612,7 +617,8 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) // Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) std::tuple + std::string, std::string, std::string, std::string, + std::string, std::string, std::string> getSystemDetails(void) { struct utsname buf; bool errorDetected = false; @@ -625,6 +631,9 @@ std::tupled_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) { + continue; + } + + if (fstatat(dirfd(srcdir), dent->d_name, &st, 0) < 0) { + perror(dent->d_name); + continue; + } + + if (S_ISDIR(st.st_mode)) { + dir_count++; + } + } + closedir(srcdir); + return dir_count; +} } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt index 9c8ed197dc..b2347e0bff 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt @@ -67,7 +67,8 @@ target_link_libraries( PUBLIC GTest::gtest_main PUBLIC c PUBLIC stdc++ - PUBLIC pthread) + PUBLIC pthread + PUBLIC dl) install(TARGETS ${RSMITST} gtest gtest_main DESTINATION ${SHARE_INSTALL_PREFIX}/rsmitst_tests