From 2a7589a0656c69a7cc03eb5d064caf42b475bfea Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 12 Oct 2023 21:27:55 -0500 Subject: [PATCH 1/2] TESTS - Skip XGMI test Change-Id: Idd9f505f36fac4a670e5129f835aa051b5c4c9fa Signed-off-by: Galantsev, Dmitrii --- tests/rocm_smi_test/functional/xgmi_read_write.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/rocm_smi_test/functional/xgmi_read_write.cc b/tests/rocm_smi_test/functional/xgmi_read_write.cc index c85de42cf4..d008947110 100755 --- a/tests/rocm_smi_test/functional/xgmi_read_write.cc +++ b/tests/rocm_smi_test/functional/xgmi_read_write.cc @@ -85,6 +85,7 @@ void TestXGMIReadWrite::Close() { void TestXGMIReadWrite::Run(void) { + GTEST_SKIP_("Temporarily disabled"); rsmi_status_t err; rsmi_xgmi_status_t err_stat; uint64_t hive_id; From 6f1afd2678139dd1b6421a81b225f0c32da574b0 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 12 Oct 2023 10:54:46 -0500 Subject: [PATCH 2/2] bdfid fix for partition & xgmi nodes * Updates: - [API] After discovering all amd gpus, we now properly map correct bdf (xgmi nodes). Especially important for partition changes - aka secondary nodes. - [API] While adding new secondary nodes we now have better grouping -> due to resorting based on kfd properties list & matching to primary uniqueid - [API] All secondary nodes are now AddToDeviceList with correct bdf (location id), provided by kfd - [API] Modified AddToDeviceList(..., uint64_t bdfid): providing an optional field - bdfid. 
This allows working around primary pcie cards with xgmi nodes - [API] Utils - cpplint minor fixes - [Example] Replaced all endl references w/ newline, fixed spacing, and some incorrect values displaying as hex (needed dec representation) - [API] kfd node functions - now print full path of file for trace logs - [Tests] power_read.cc: Added a generic power test to confirm that specific power type return values are guaranteed Change-Id: I143474e8d64c4915a966e789be6bcea4fa7f4472 Signed-off-by: Charis Poag --- include/rocm_smi/rocm_smi_main.h | 2 +- rocm_smi/example/rocm_smi_example.cc | 23 ++-- src/rocm_smi_kfd.cc | 10 ++ src/rocm_smi_main.cc | 127 ++++++++++++++++--- src/rocm_smi_utils.cc | 6 +- tests/rocm_smi_test/functional/power_read.cc | 2 + 6 files changed, 137 insertions(+), 33 deletions(-) diff --git a/include/rocm_smi/rocm_smi_main.h b/include/rocm_smi/rocm_smi_main.h index 8b60324988..1cd2ec343f 100755 --- a/include/rocm_smi/rocm_smi_main.h +++ b/include/rocm_smi/rocm_smi_main.h @@ -128,7 +128,7 @@ class RocmSMI { std::map, std::shared_ptr> io_link_map_; std::map dev_ind_to_node_ind_map_; - void AddToDeviceList(std::string dev_name); + void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0); void GetEnvVariables(void); std::shared_ptr FindMonitor(std::string monitor_path); diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index a2df8e66ee..fa01b42978 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -58,8 +58,8 @@ #define PRINT_RSMI_ERR(RET) { \ if (RET != RSMI_STATUS_SUCCESS) { \ std::cout << "[ERROR] RSMI call returned " << (RET) \ - << " at line " << __LINE__ << std::endl; \ - std::cout << amd::smi::getRSMIStatusString(RET) << std::endl; \ + << " at line " << __LINE__ << "\n"; \ + std::cout << amd::smi::getRSMIStatusString(RET) << "\n"; \ } \ } @@ -718,7 +718,7 @@ int main() { rsmi_num_monitor_devices(&num_monitor_devs); for (uint32_t i = 0; i < num_monitor_devs; ++i) { - std::cout
<< "\t**Device #: " << std::dec << i << std::endl; + std::cout << "\t**Device #: " << std::dec << i << "\n"; ret = rsmi_dev_id_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << "\n"; @@ -765,8 +765,9 @@ int main() { uint64_t max_bandwidth = 0; ret = rsmi_minmax_bandwidth_get(0, i, &min_bandwidth, &max_bandwidth); CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) - std::cout << "\nMinimum Bandwidth: " << min_bandwidth - << "\nMaximum Bandwidth: " << max_bandwidth; + std::cout << "\n\t**\tMinimum Bandwidth: " << std::dec << min_bandwidth + << "\n\t**\tMaximum Bandwidth: " << std::dec + << max_bandwidth << "\n"; } else { std::cout << "Not Supported\n"; } @@ -813,7 +814,7 @@ int main() { ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_EDGE, rsmi_temperature_metric_t::RSMI_TEMP_CURRENT, &val_i64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << val_i64/1000 << "C" << "\n"; + std::cout << std::dec << val_i64/1000 << " C" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) @@ -821,7 +822,7 @@ int main() { ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_JUNCTION, rsmi_temperature_metric_t::RSMI_TEMP_CURRENT, &val_i64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << (val_i64 / 1000) << "C" << std::endl; + std::cout << std::dec << (val_i64 / 1000) << " C" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) @@ -869,14 +870,14 @@ int main() { std::cout << "\t**Average Power Usage: "; ret = rsmi_dev_power_ave_get(i, 0, &val_ui64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << convert_mw_to_w(val_ui64) << " W" << std::endl; + std::cout << convert_mw_to_w(val_ui64) << " W" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) std::cout << "\t**Current Socket Power Usage: "; ret = rsmi_dev_current_socket_power_get(i, &val_ui64); if (ret == RSMI_STATUS_SUCCESS) { - std::cout << convert_mw_to_w(val_ui64) << " W" << std::endl; + std::cout << convert_mw_to_w(val_ui64) << " W" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) @@ -884,7 +885,7 @@ int main() { ret = 
rsmi_dev_power_get(i, &val_ui64, &power_type); if (ret == RSMI_STATUS_SUCCESS) { std::cout << "[" << amd::smi::power_type_string(power_type) << "] " - << convert_mw_to_w(val_ui64) << " W" << std::endl; + << convert_mw_to_w(val_ui64) << " W" << "\n"; } CHK_RSMI_NOT_SUPPORTED_RET(ret) std::cout << "\t=======" << "\n"; @@ -897,7 +898,7 @@ int main() { return 0; } - for (uint32_t i = 0; i< num_monitor_devs; ++i) { + for (uint32_t i = 0; i < num_monitor_devs; ++i) { ret = test_set_overdrive(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) diff --git a/src/rocm_smi_kfd.cc b/src/rocm_smi_kfd.cc index 9b7e0e5eaa..40984b430b 100755 --- a/src/rocm_smi_kfd.cc +++ b/src/rocm_smi_kfd.cc @@ -890,9 +890,12 @@ int KFDNode::get_used_memory(uint64_t* used) { int read_node_properties(uint32_t node, std::string property_name, uint64_t *val) { std::ostringstream ss; + std::string propertiesFullPath = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(node) + "/properties"; int retVal = EINVAL; if (property_name.empty() || val == nullptr) { ss << __PRETTY_FUNCTION__ + << " | File: " << propertiesFullPath << " | Issue: Could not read node #" << std::to_string(node) << ", property_name is empty or *val is nullptr " << " | return = " << std::to_string(retVal) @@ -905,6 +908,7 @@ int read_node_properties(uint32_t node, std::string property_name, if (KFDNodeSupported(node)) { retVal = myNode->get_property_value(property_name, val); ss << __PRETTY_FUNCTION__ + << " | File: " << propertiesFullPath << " | Successfully read node #" << std::to_string(node) << " for property_name = " << property_name << " | Data (" << property_name << ") * val = " @@ -915,6 +919,7 @@ int read_node_properties(uint32_t node, std::string property_name, } else { retVal = 1; ss << __PRETTY_FUNCTION__ + << " | File: " << propertiesFullPath << " | Issue: Could not read node #" << std::to_string(node) << ", KFD node was an unsupported node." 
<< " | return = " << std::to_string(retVal) @@ -927,9 +932,12 @@ int read_node_properties(uint32_t node, std::string property_name, // /sys/class/kfd/kfd/topology/nodes/*/gpu_id int get_gpu_id(uint32_t node, uint64_t *gpu_id) { std::ostringstream ss; + std::string gpu_id_FullPath = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(node) + "/gpu_id"; int retVal = EINVAL; if (gpu_id == nullptr) { ss << __PRETTY_FUNCTION__ + << " | File: " << gpu_id_FullPath << " | Issue: Could not read node #" << std::to_string(node) << ", gpu_id is a nullptr " << " | return = " << std::to_string(retVal) @@ -942,6 +950,7 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) { if (KFDNodeSupported(node)) { retVal = ReadKFDGpuId(node, gpu_id); ss << __PRETTY_FUNCTION__ + << " | File: " << gpu_id_FullPath << " | Successfully read node #" << std::to_string(node) << " for gpu_id" << " | Data (gpu_id) *gpu_id = " @@ -952,6 +961,7 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) { } else { retVal = 1; ss << __PRETTY_FUNCTION__ + << " | File: " << gpu_id_FullPath << " | Issue: Could not read node #" << std::to_string(node) << ", KFD node was an unsupported node." << " | return = " << std::to_string(retVal) diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 05851a88e6..3647f8e35d 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -312,6 +312,7 @@ RocmSMI::Initialize(uint64_t flags) { auto i = 0; uint32_t ret; int i_ret; + std::ostringstream ss; LOG_ALWAYS("=============== ROCM SMI initialize ================"); ROCmLogging::Logger::getInstance()->enableAllLogLevels(); @@ -355,9 +356,32 @@ RocmSMI::Initialize(uint64_t flags) { if (ConstructBDFID(device->path(), &bdfid) != 0) { std::cerr << "Failed to construct BDFID." 
<< std::endl; ret = 1; + } else if (device->bdfid() != UINT64_MAX && device->bdfid() != bdfid) { + // handles secondary partitions - compute partition feature nodes + ss << __PRETTY_FUNCTION__ + << " | [before] device->path() = " << device->path() + << "\n | bdfid = " << bdfid + << "\n | device->bdfid() = " << device->bdfid() + << "\n | (xgmi node) setting to setting " + << "device->set_bdfid(device->bdfid())"; + LOG_TRACE(ss); + device->set_bdfid(device->bdfid()); } else { + // legacy & pcie card updates + ss << __PRETTY_FUNCTION__ + << " | [before] device->path() = " << device->path() + << "\n | bdfid = " << bdfid + << "\n | device->bdfid() = " << device->bdfid() + << "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)"; + LOG_TRACE(ss); device->set_bdfid(bdfid); } + ss << __PRETTY_FUNCTION__ + << " | [after] device->path() = " << device->path() + << "\n | bdfid = " << bdfid + << "\n | device->bdfid() = " << device->bdfid() + << "\n | final update: device->bdfid() holds correct device bdf"; + LOG_TRACE(ss); } if (ret != 0) { throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, @@ -386,7 +410,6 @@ RocmSMI::Initialize(uint64_t flags) { // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. 
- std::ostringstream ss; auto dev_iter = devices_.begin(); while (dev_iter != devices_.end()) { uint64_t bdfid = (*dev_iter)->bdfid(); @@ -665,8 +688,8 @@ RocmSMI::FindMonitor(std::string monitor_path) { return m; } -void -RocmSMI::AddToDeviceList(std::string dev_name) { + +void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) { std::ostringstream ss; ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); @@ -684,10 +707,15 @@ RocmSMI::AddToDeviceList(std::string dev_name) { dev->set_drm_render_minor(GetDrmRenderMinor(dev_path)); dev->set_card_index(card_indx); GetSupportedEventGroups(card_indx, dev->supported_event_groups()); + if (bdfid != 0) { + dev->set_bdfid(bdfid); + } devices_.push_back(dev); - ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = " - << dev_name << " | path = " << dev_path + ss << __PRETTY_FUNCTION__ + << " | Adding to device list dev_name = " << dev_name + << " | path = " << dev_path + << " | bdfid = " << bdfid << " | card index = " << std::to_string(card_indx) << " | "; LOG_DEBUG(ss); } @@ -768,19 +796,24 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { uint32_t s_node_id = 0; uint64_t s_gpu_id = 0; uint64_t s_unique_id = 0; + uint64_t s_location_id = 0; }; - // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id} + // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id, + // location_id} std::multimap allSystemNodes; uint32_t node_id = 0; while (true) { - uint64_t gpu_id = 0, unique_id = 0; + uint64_t gpu_id = 0, unique_id = 0, location_id = 0; int ret_gpu_id = get_gpu_id(node_id, &gpu_id); int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id); - if (ret_gpu_id == 0 || ret_unique_id == 0) { + int ret_loc_id = + read_node_properties(node_id, "location_id", &location_id); + if (ret_gpu_id == 0 || ret_unique_id == 0 || ret_loc_id == 0) { systemNode myNode; myNode.s_node_id = node_id; myNode.s_gpu_id = gpu_id; myNode.s_unique_id = unique_id; + 
myNode.s_location_id = location_id; if (gpu_id != 0) { // only add gpu nodes, 0 = CPU allSystemNodes.emplace(unique_id, myNode); } @@ -795,6 +828,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { ss << "\n[node_id = " << std::to_string(i.second.s_node_id) << "; gpu_id = " << std::to_string(i.second.s_gpu_id) << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "; location_id = " << std::to_string(i.second.s_location_id) << "], "; } ss << "}"; @@ -807,6 +841,14 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { path += "/card"; path += std::to_string(cardId); uint64_t primary_unique_id = 0; + uint64_t device_uuid = 0; + bool doesDeviceSupportPartitions = false; + // get current partition + int kSize = 256; + char computePartition[kSize]; + std::string strCompPartition = "UNKNOWN"; + uint32_t numMonDevices = 0; + rsmi_num_monitor_devices(&numMonDevices); // each identified gpu card node is a primary node for // potential matching unique ids @@ -814,7 +856,25 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { std::string d_name = "card"; d_name += std::to_string(cardId); - AddToDeviceList(d_name); + uint32_t numMonDevices = 0; + rsmi_num_monitor_devices(&numMonDevices); + if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize) + == RSMI_STATUS_SUCCESS) { + strCompPartition = computePartition; + doesDeviceSupportPartitions = true; + } + rsmi_status_t ret_unique_id = + rsmi_dev_unique_id_get(cardAdded, &device_uuid); + auto temp_numb_nodes = allSystemNodes.count(device_uuid); + auto primaryBdfId = + allSystemNodes.lower_bound(device_uuid)->second.s_location_id; + if (doesDeviceSupportPartitions && temp_numb_nodes > 1 + && ret_unique_id == RSMI_STATUS_SUCCESS) { + // helps identify xgmi nodes (secondary nodes) easier + AddToDeviceList(d_name, primaryBdfId); + } else { + AddToDeviceList(d_name, UINT64_MAX); + } ss << __PRETTY_FUNCTION__ << " | Ordered system nodes seen in lookup = {"; @@ -822,12 
+882,14 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { ss << "\n[node_id = " << std::to_string(i.second.s_node_id) << "; gpu_id = " << std::to_string(i.second.s_gpu_id) << "; unique_id = " << std::to_string(i.second.s_unique_id) + << "; location_id = " << std::to_string(i.second.s_location_id) << "], "; } ss << "}"; LOG_DEBUG(ss); uint64_t temp_primary_unique_id = 0; + uint64_t primary_location_id = 0; if (allSystemNodes.empty()) { cardAdded++; ss << __PRETTY_FUNCTION__ @@ -837,16 +899,11 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { } // get current partition - const int kSize = 256; - char computePartition[kSize]; - std::string strCompPartition = "UNKNOWN"; - uint32_t numMonDevices = 0; rsmi_num_monitor_devices(&numMonDevices); if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize) == RSMI_STATUS_SUCCESS) { strCompPartition = computePartition; } - uint64_t device_uuid = 0; if (rsmi_dev_unique_id_get(cardAdded, &device_uuid) != RSMI_STATUS_SUCCESS) { cardAdded++; @@ -860,7 +917,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { temp_primary_unique_id = allSystemNodes.find(device_uuid)->second.s_unique_id; - auto temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id); + temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id); ss << __PRETTY_FUNCTION__ << " | device/node id (cardId) = " << std::to_string(cardId) @@ -892,12 +949,46 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { LOG_DEBUG(ss); while (numb_nodes > 1) { std::string secNode = "card"; - secNode += std::to_string(cardId); // add the primary node id - AddToDeviceList(secNode); + secNode += std::to_string(cardId); // maps the primary node card to + // secondary - allows get/sets + auto it = allSystemNodes.lower_bound(device_uuid); + auto it_end = allSystemNodes.upper_bound(device_uuid); + if (numb_nodes == temp_numb_nodes) { + auto removalNodeId = it->second.s_node_id; + auto removalGpuId = it->second.s_gpu_id; + auto removalUniqueId = it->second.s_unique_id; + 
auto removalLocId = it->second.s_location_id; + auto nodesErased = 1; + primary_location_id = removalLocId; + allSystemNodes.erase(it++); + ss << __PRETTY_FUNCTION__ + << "\nPRIMARY --> num_nodes == temp_numb_nodes; ERASING " + << std::to_string(nodesErased) << " node -> [node_id = " + << std::to_string(removalNodeId) + << "; gpu_id = " << std::to_string(removalGpuId) + << "; unique_id = " << std::to_string(removalUniqueId) + << "; location_id = " << std::to_string(removalLocId) + << "]"; + LOG_DEBUG(ss); + } + if (it == it_end) { + break; + } + auto myBdfId = it->second.s_location_id; + AddToDeviceList(secNode, myBdfId); + ss << __PRETTY_FUNCTION__ + << "\nSECONDARY --> After adding new node; ERASING -> [node_id = " + << std::to_string(it->second.s_node_id) + << "; gpu_id = " << std::to_string(it->second.s_gpu_id) + << "; unique_id = " << std::to_string(it->second.s_unique_id) + << "; location_id = " << std::to_string(it->second.s_location_id) + << "]"; + LOG_DEBUG(ss); + allSystemNodes.erase(it++); numb_nodes--; cardAdded++; } - // remove already added nodes associated with current card + // remove any remaining nodes associated with current card auto erasedNodes = allSystemNodes.erase(primary_unique_id); ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = " << std::to_string(primary_unique_id) << " erased " diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index db11f0645c..3d74c7e7f1 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -520,7 +520,7 @@ std::vector readEntireFile(std::string path) { void displayAppTmpFilesContent() { std::vector tmpFiles = getListOfAppTmpFiles(); if (!tmpFiles.empty()) { - for (auto &x: tmpFiles) { + for (auto &x : tmpFiles) { std::string out = readFile(x); std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x << "; Contained content: " << out << std::endl; @@ -539,7 +539,7 @@ std::string debugVectorContent(std::vector v) { for (auto it=v.begin(); it < v.end(); it++) { ss << *it; 
auto temp_it = it; - if(++temp_it != v.end()) { + if (++temp_it != v.end()) { ss << ", "; } } @@ -557,7 +557,7 @@ std::string displayAllDevicePaths(std::vector> v) { for (auto it=v.begin(); it < v.end(); it++) { ss << (*it)->path(); auto temp_it = it; - if(++temp_it != v.end()) { + if (++temp_it != v.end()) { ss << ", "; } } diff --git a/tests/rocm_smi_test/functional/power_read.cc b/tests/rocm_smi_test/functional/power_read.cc index f379fd48c8..e5c636b0b8 100755 --- a/tests/rocm_smi_test/functional/power_read.cc +++ b/tests/rocm_smi_test/functional/power_read.cc @@ -167,6 +167,8 @@ void TestPowerRead::Run(void) { err = rsmi_dev_power_get(i, &val_ui64, &type); ASSERT_TRUE(err == RSMI_STATUS_SUCCESS || err == RSMI_STATUS_NOT_SUPPORTED); + ASSERT_TRUE(type == RSMI_AVERAGE_POWER || type == RSMI_CURRENT_POWER + || type == RSMI_INVALID_POWER); if (err == RSMI_STATUS_NOT_SUPPORTED) { std::cout <<