diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 7c061ddbf0..83d141dc74 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -58,8 +58,8 @@ #define PRINT_RSMI_ERR(RET) { \ if (RET != RSMI_STATUS_SUCCESS) { \ std::cout << "[ERROR] RSMI call returned " << (RET) \ - << " at line " << __LINE__ << std::endl; \ - std::cout << amd::smi::getRSMIStatusString(RET) << std::endl; \ + << " at line " << __LINE__ << "\n"; \ + std::cout << amd::smi::getRSMIStatusString(RET) << "\n"; \ } \ } @@ -718,7 +718,7 @@ int main() { rsmi_num_monitor_devices(&num_monitor_devs); for (uint32_t i = 0; i < num_monitor_devs; ++i) { - std::cout << "\t**Device #: " << std::dec << i << std::endl; + std::cout << "\t**Device #: " << std::dec << i << "\n"; ret = rsmi_dev_id_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << "\n"; @@ -770,8 +770,9 @@ int main() { uint64_t max_bandwidth = 0; ret = rsmi_minmax_bandwidth_get(0, i, &min_bandwidth, &max_bandwidth); CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) - std::cout << "\nMinimum Bandwidth: " << min_bandwidth - << "\nMaximum Bandwidth: " << max_bandwidth; + std::cout << "\n\t**\tMinimum Bandwidth: " << std::dec << min_bandwidth + << "\n\t**\tMaximum Bandwidth: " << std::dec + << max_bandwidth << "\n"; } else { std::cout << "Not Supported\n"; } @@ -818,7 +819,7 @@ int main() { ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_EDGE, rsmi_temperature_metric_t::RSMI_TEMP_CURRENT, &val_i64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << val_i64/1000 << "C" << "\n"; + std::cout << std::dec << val_i64/1000 << " C" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) @@ -826,7 +827,7 @@ int main() { ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_JUNCTION, rsmi_temperature_metric_t::RSMI_TEMP_CURRENT, &val_i64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << (val_i64 / 1000) << "C" << std::endl; + std::cout << std::dec << (val_i64 / 1000) << " C" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) @@ -874,14 +875,14 @@ int main() { std::cout << "\t**Average Power Usage: "; ret = rsmi_dev_power_ave_get(i, 0, &val_ui64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << convert_mw_to_w(val_ui64) << " W" << std::endl; + std::cout << convert_mw_to_w(val_ui64) << " W" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) std::cout << "\t**Current Socket Power Usage: "; ret = rsmi_dev_current_socket_power_get(i, &val_ui64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << convert_mw_to_w(val_ui64) << " W" << std::endl; + std::cout << convert_mw_to_w(val_ui64) << " W" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) @@ -889,7 +890,7 @@ int main() { ret = rsmi_dev_power_get(i, &val_ui64, &power_type); if (ret == RSMI_STATUS_SUCCESS) { std::cout << "[" << amd::smi::power_type_string(power_type) << "] " - << convert_mw_to_w(val_ui64) << " W" << std::endl; + << convert_mw_to_w(val_ui64) << " W" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) std::cout << "\t=======" << "\n"; @@ -902,7 +903,7 @@ int main() { return 0; } - for (uint32_t i = 0; i< num_monitor_devs; ++i) { + for (uint32_t i = 0; i < num_monitor_devs; ++i) { ret = test_set_overdrive(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) diff --git a/rocm_smi/include/rocm_smi/rocm_smi_main.h b/rocm_smi/include/rocm_smi/rocm_smi_main.h index 8b60324988..1cd2ec343f 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_main.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_main.h @@ -128,7 +128,7 @@ class RocmSMI { std::map, std::shared_ptr> io_link_map_; std::map dev_ind_to_node_ind_map_; - void AddToDeviceList(std::string dev_name); + void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0); void GetEnvVariables(void); std::shared_ptr FindMonitor(std::string monitor_path); diff --git a/rocm_smi/src/rocm_smi_kfd.cc b/rocm_smi/src/rocm_smi_kfd.cc index 39e5e4791b..68fccb9a49 100755 --- a/rocm_smi/src/rocm_smi_kfd.cc +++ b/rocm_smi/src/rocm_smi_kfd.cc @@ -954,9 +954,12 @@ int KFDNode::get_cache_info(rsmi_gpu_cache_info_t *info) { int read_node_properties(uint32_t node, std::string property_name, uint64_t *val) { std::ostringstream ss; + std::string propertiesFullPath = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(node) + "/properties"; int retVal = EINVAL; if (property_name.empty() || val == nullptr) { ss << __PRETTY_FUNCTION__ + << " | File: " << propertiesFullPath << " | Issue: Could not read node #" << std::to_string(node) << ", property_name is empty or *val is nullptr " << " | return = " << std::to_string(retVal) @@ -969,6 +972,7 @@ int read_node_properties(uint32_t node, std::string property_name, if (KFDNodeSupported(node)) { retVal = myNode->get_property_value(property_name, val); ss << __PRETTY_FUNCTION__ + << " | File: " << propertiesFullPath << " | Successfully read node #" << std::to_string(node) << " for property_name = " << property_name << " | Data (" << property_name << ") * val = " @@ -979,6 +983,7 @@ int read_node_properties(uint32_t node, std::string property_name, } else { retVal = 1; ss << __PRETTY_FUNCTION__ + << " | File: " << propertiesFullPath << " | Issue: Could not read node #" << std::to_string(node) << ", KFD node was an unsupported node." << " | return = " << std::to_string(retVal) @@ -991,9 +996,12 @@ int read_node_properties(uint32_t node, std::string property_name, // /sys/class/kfd/kfd/topology/nodes/*/gpu_id int get_gpu_id(uint32_t node, uint64_t *gpu_id) { std::ostringstream ss; + std::string gpu_id_FullPath = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(node) + "/gpu_id"; int retVal = EINVAL; if (gpu_id == nullptr) { ss << __PRETTY_FUNCTION__ + << " | File: " << gpu_id_FullPath << " | Issue: Could not read node #" << std::to_string(node) << ", gpu_id is a nullptr " << " | return = " << std::to_string(retVal) @@ -1006,6 +1014,7 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) { if (KFDNodeSupported(node)) { retVal = ReadKFDGpuId(node, gpu_id); ss << __PRETTY_FUNCTION__ + << " | File: " << gpu_id_FullPath << " | Successfully read node #" << std::to_string(node) << " for gpu_id" << " | Data (gpu_id) *gpu_id = " @@ -1016,6 +1025,7 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) { } else { retVal = 1; ss << __PRETTY_FUNCTION__ + << " | File: " << gpu_id_FullPath << " | Issue: Could not read node #" << std::to_string(node) << ", KFD node was an unsupported node." << " | return = " << std::to_string(retVal) diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc index 0a07449b8b..22200ea273 100755 --- a/rocm_smi/src/rocm_smi_main.cc +++ b/rocm_smi/src/rocm_smi_main.cc @@ -317,6 +317,7 @@ RocmSMI::Initialize(uint64_t flags) { auto i = 0; uint32_t ret; int i_ret; + std::ostringstream ss; LOG_ALWAYS("=============== ROCM SMI initialize ================"); ROCmLogging::Logger::getInstance()->enableAllLogLevels(); @@ -360,9 +361,32 @@ RocmSMI::Initialize(uint64_t flags) { if (ConstructBDFID(device->path(), &bdfid) != 0) { std::cerr << "Failed to construct BDFID." << std::endl; ret = 1; + } else if (device->bdfid() != UINT64_MAX && device->bdfid() != bdfid) { + // handles secondary partitions - compute partition feature nodes + ss << __PRETTY_FUNCTION__ + << " | [before] device->path() = " << device->path() + << "\n | bdfid = " << bdfid + << "\n | device->bdfid() = " << device->bdfid() + << "\n | (xgmi node) setting to setting " + << "device->set_bdfid(device->bdfid())"; + LOG_TRACE(ss); + device->set_bdfid(device->bdfid()); } else { + // legacy & pcie card updates + ss << __PRETTY_FUNCTION__ + << " | [before] device->path() = " << device->path() + << "\n | bdfid = " << bdfid + << "\n | device->bdfid() = " << device->bdfid() + << "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)"; + LOG_TRACE(ss); device->set_bdfid(bdfid); } + ss << __PRETTY_FUNCTION__ + << " | [after] device->path() = " << device->path() + << "\n | bdfid = " << bdfid + << "\n | device->bdfid() = " << device->bdfid() + << "\n | final update: device->bdfid() holds correct device bdf"; + LOG_TRACE(ss); } if (ret != 0) { throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, @@ -391,7 +415,6 @@ RocmSMI::Initialize(uint64_t flags) { // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. - std::ostringstream ss; auto dev_iter = devices_.begin(); while (dev_iter != devices_.end()) { uint64_t bdfid = (*dev_iter)->bdfid(); @@ -670,8 +693,8 @@ RocmSMI::FindMonitor(std::string monitor_path) { return m; } -void -RocmSMI::AddToDeviceList(std::string dev_name) { + +void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); @@ -689,10 +712,15 @@ RocmSMI::AddToDeviceList(std::string dev_name) { dev->set_drm_render_minor(GetDrmRenderMinor(dev_path)); dev->set_card_index(card_indx); GetSupportedEventGroups(card_indx, dev->supported_event_groups()); + if (bdfid != 0) { + dev->set_bdfid(bdfid); + } devices_.push_back(dev); - ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = " - << dev_name << " | path = " << dev_path + ss << __PRETTY_FUNCTION__ + << " | Adding to device list dev_name = " << dev_name + << " | path = " << dev_path + << " | bdfid = " << bdfid << " | card index = " << std::to_string(card_indx) << " | "; LOG_DEBUG(ss); } @@ -773,19 +801,24 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { uint32_t s_node_id = 0; uint64_t s_gpu_id = 0; uint64_t s_unique_id = 0; + uint64_t s_location_id = 0; }; - // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id} + // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id, + // location_id} std::multimap allSystemNodes; uint32_t node_id = 0; while (true) { - uint64_t gpu_id = 0, unique_id = 0; + uint64_t gpu_id = 0, unique_id = 0, location_id = 0; int ret_gpu_id = get_gpu_id(node_id, &gpu_id); int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); - if (ret_gpu_id == 0 || ret_unique_id == 0) { + int ret_loc_id = + read_node_properties(node_id, "location_id", &location_id); + if (ret_gpu_id == 0 || ret_unique_id == 0 || ret_loc_id == 0) { systemNode myNode; myNode.s_node_id = node_id; myNode.s_gpu_id = gpu_id; myNode.s_unique_id = unique_id; + myNode.s_location_id = location_id; if (gpu_id != 0) { // only add gpu nodes, 0 = CPU allSystemNodes.emplace(unique_id, myNode); } @@ -800,6 +833,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { ss << "\n[node_id = " << std::to_string(i.second.s_node_id) << "; gpu_id = " << std::to_string(i.second.s_gpu_id) << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "; location_id = " << std::to_string(i.second.s_location_id) << "], "; } ss << "}"; @@ -812,6 +846,14 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { path += "/card"; path += std::to_string(cardId); uint64_t primary_unique_id = 0; + uint64_t device_uuid = 0; + bool doesDeviceSupportPartitions = false; + // get current partition + int kSize = 256; + char computePartition[kSize]; + std::string strCompPartition = "UNKNOWN"; + uint32_t numMonDevices = 0; + rsmi_num_monitor_devices(&numMonDevices); // each identified gpu card node is a primary node for // potential matching unique ids @@ -819,7 +861,25 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { std::string d_name = "card"; d_name += std::to_string(cardId); - AddToDeviceList(d_name); + uint32_t numMonDevices = 0; + rsmi_num_monitor_devices(&numMonDevices); + if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize) + == RSMI_STATUS_SUCCESS) { + strCompPartition = computePartition; + doesDeviceSupportPartitions = true; + } + rsmi_status_t ret_unique_id = + rsmi_dev_unique_id_get(cardAdded, &device_uuid); + auto temp_numb_nodes = allSystemNodes.count(device_uuid); + auto primaryBdfId = + allSystemNodes.lower_bound(device_uuid)->second.s_location_id; + if (doesDeviceSupportPartitions && temp_numb_nodes > 1 + && ret_unique_id == RSMI_STATUS_SUCCESS) { + // helps identify xgmi nodes (secondary nodes) easier + AddToDeviceList(d_name, primaryBdfId); + } else { + AddToDeviceList(d_name, UINT64_MAX); + } ss << __PRETTY_FUNCTION__ << " | Ordered system nodes seen in lookup = {"; @@ -827,12 +887,14 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { ss << "\n[node_id = " << std::to_string(i.second.s_node_id) << "; gpu_id = " << std::to_string(i.second.s_gpu_id) << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "; location_id = " << std::to_string(i.second.s_location_id) << "], "; } ss << "}"; LOG_DEBUG(ss); uint64_t temp_primary_unique_id = 0; + uint64_t primary_location_id = 0; if (allSystemNodes.empty()) { cardAdded++; ss << __PRETTY_FUNCTION__ @@ -842,16 +904,11 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { } // get current partition - const int kSize = 256; - char computePartition[kSize]; - std::string strCompPartition = "UNKNOWN"; - uint32_t numMonDevices = 0; rsmi_num_monitor_devices(&numMonDevices); if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize) == RSMI_STATUS_SUCCESS) { strCompPartition = computePartition; } - uint64_t device_uuid = 0; if (rsmi_dev_unique_id_get(cardAdded, &device_uuid) != RSMI_STATUS_SUCCESS) { cardAdded++; @@ -865,7 +922,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { temp_primary_unique_id = allSystemNodes.find(device_uuid)->second.s_unique_id; - auto temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id); + temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id); ss << __PRETTY_FUNCTION__ << " | device/node id (cardId) = " << std::to_string(cardId) @@ -897,12 +954,46 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { LOG_DEBUG(ss); while (numb_nodes > 1) { std::string secNode = "card"; - secNode += std::to_string(cardId); // add the primary node id - AddToDeviceList(secNode); + secNode += std::to_string(cardId); // maps the primary node card to + // secondary - allows get/sets + auto it = allSystemNodes.lower_bound(device_uuid); + auto it_end = allSystemNodes.upper_bound(device_uuid); + if (numb_nodes == temp_numb_nodes) { + auto removalNodeId = it->second.s_node_id; + auto removalGpuId = it->second.s_gpu_id; + auto removalUniqueId = it->second.s_unique_id; + auto removalLocId = it->second.s_location_id; + auto nodesErased = 1; + primary_location_id = removalLocId; + allSystemNodes.erase(it++); + ss << __PRETTY_FUNCTION__ + << "\nPRIMARY --> num_nodes == temp_numb_nodes; ERASING " + << std::to_string(nodesErased) << " node -> [node_id = " + << std::to_string(removalNodeId) + << "; gpu_id = " << std::to_string(removalGpuId) + << "; unique_id = " << std::to_string(removalUniqueId) + << "; location_id = " << std::to_string(removalLocId) + << "]"; + LOG_DEBUG(ss); + } + if (it == it_end) { + break; + } + auto myBdfId = it->second.s_location_id; + AddToDeviceList(secNode, myBdfId); + ss << __PRETTY_FUNCTION__ + << "\nSECONDARY --> After adding new node; ERASING -> [node_id = " + << std::to_string(it->second.s_node_id) + << "; gpu_id = " << std::to_string(it->second.s_gpu_id) + << "; unique_id = " << std::to_string(it->second.s_unique_id) + << "; location_id = " << std::to_string(it->second.s_location_id) + << "]"; + LOG_DEBUG(ss); + allSystemNodes.erase(it++); numb_nodes--; cardAdded++; } - // remove already added nodes associated with current card + // remove any remaining nodes associated with current card auto erasedNodes = allSystemNodes.erase(primary_unique_id); ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = " << std::to_string(primary_unique_id) << " erased " diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index 3f9a8b874a..fd20783e89 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -539,7 +539,7 @@ std::vector readEntireFile(std::string path) { void displayAppTmpFilesContent() { std::vector tmpFiles = getListOfAppTmpFiles(); if (!tmpFiles.empty()) { - for (auto &x: tmpFiles) { + for (auto &x : tmpFiles) { std::string out = readFile(x); std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x << "; Contained content: " << out << std::endl; @@ -558,7 +558,7 @@ std::string debugVectorContent(std::vector v) { for (auto it=v.begin(); it < v.end(); it++) { ss << *it; auto temp_it = it; - if(++temp_it != v.end()) { + if (++temp_it != v.end()) { ss << ", "; } } @@ -576,7 +576,7 @@ std::string displayAllDevicePaths(std::vector> v) { for (auto it=v.begin(); it < v.end(); it++) { ss << (*it)->path(); auto temp_it = it; - if(++temp_it != v.end()) { + if (++temp_it != v.end()) { ss << ", "; } } diff --git a/tests/amd_smi_test/functional/xgmi_read_write.cc b/tests/amd_smi_test/functional/xgmi_read_write.cc index 756e793db3..d252cda1f8 100755 --- a/tests/amd_smi_test/functional/xgmi_read_write.cc +++ b/tests/amd_smi_test/functional/xgmi_read_write.cc @@ -85,6 +85,7 @@ void TestXGMIReadWrite::Close() { void TestXGMIReadWrite::Run(void) { + GTEST_SKIP_("Temporarily disabled"); amdsmi_status_t err; amdsmi_xgmi_status_t err_stat; uint64_t hive_id;