From 8c444164103bec701ff24c231eddc0eb36fdbef6 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 8 May 2024 13:14:39 -0500 Subject: [PATCH 1/3] Discover the amdgpu when card numbers are not consecutive. When discovering amdgpu devices, if the assigned card numbers are not consecutive, not all GPUs can be discovered. The code is changed to discover the GPUs based on the maximum card number. Change-Id: I8b6a8b49594d6a54c7feb2645bedb83dc5c1b4cc --- src/rocm_smi_main.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 03c8b61375..7d6edea648 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -713,6 +713,8 @@ static bool isAMDGPU(std::string dev_path) { uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { std::string err_msg; uint32_t count = 0; + int32_t cardId = 0; + int32_t max_cardId = -1; std::ostringstream ss; // If this gets called more than once, clear previous findings. @@ -736,6 +738,9 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { if ((strcmp(dentry->d_name, ".") == 0) || (strcmp(dentry->d_name, "..") == 0)) continue; + sscanf(&dentry->d_name[strlen(kDeviceNamePrefix)], "%d", &cardId); + if (cardId > max_cardId) + max_cardId = cardId; count++; } dentry = readdir(drm_dir); @@ -818,7 +823,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { uint32_t cardAdded = 0; // Discover all root cards & gpu partitions associated with each - for (uint32_t cardId = 0; cardId < count; cardId++) { + for (uint32_t cardId = 0; cardId <= max_cardId; cardId++) { std::string path = kPathDRMRoot; path += "/card"; path += std::to_string(cardId); From 497ef4a7ef090d70d2324c57f607d379af6df6dd Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 14 May 2024 18:18:00 -0500 Subject: [PATCH 2/3] fix: [SWDEV-461904] [rocm/rocm_smi_lib] Checks returned error by rsmi_dev_od_volt_info_get() before assert Code changes related to the following: * Unit tests Change-Id: Icc0f329e35992aae19f07243024521181467bcd3 
Signed-off-by: Oliveira, Daniel --- .../functional/volt_freq_curv_read.cc | 112 ++++++++++-------- 1 file changed, 65 insertions(+), 47 deletions(-) diff --git a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc index e0e0bf2bab..60d068da19 100755 --- a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc +++ b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc @@ -106,13 +106,22 @@ void TestVoltCurvRead::Run(void) { << amd::smi::getRSMIStatusString(err, false) << "\n"; } - CHK_ERR_ASRT(err) - ret = rsmi_dev_perf_level_get(i, &pfl); - IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): " - << amd::smi::getRSMIStatusString(ret, false) << "\n"; + + if (err != rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_SUCCESS); + ret = rsmi_dev_perf_level_get(i, &pfl); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): " + << amd::smi::getRSMIStatusString(ret, false) << "\n"; + } + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_SUCCESS); + } + else { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_get: Not supported on this " + "machine" << std::endl; + } } - CHK_ERR_ASRT(ret) // Verify api support checking functionality is working err = rsmi_dev_od_volt_info_get(i, nullptr); @@ -120,53 +129,62 @@ void TestVoltCurvRead::Run(void) { std::cout << "\t**rsmi_dev_od_volt_info_get(i, nullptr): " << amd::smi::getRSMIStatusString(err, false) << "\n"; } - ASSERT_TRUE(err == RSMI_STATUS_INVALID_ARGS); - err = rsmi_dev_od_volt_info_get(i, &odv); - IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): " - << amd::smi::getRSMIStatusString(err, false) << "\n" - << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) - << "\t**odv.num_regions = " << std::dec - << odv.num_regions << "\n"; - } - if (err == RSMI_STATUS_SUCCESS) { - std::cout << "\t**Frequency-voltage curve data:" << "\n"; - std::cout << 
amd::smi::print_rsmi_od_volt_freq_data_t(&odv); - rsmi_freq_volt_region_t *regions{}; - uint32_t num_regions; - regions = new rsmi_freq_volt_region_t[odv.num_regions]; - ASSERT_TRUE(regions != nullptr); - - num_regions = odv.num_regions; - err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions); + if (err != rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_INVALID_ARGS); + err = rsmi_dev_od_volt_info_get(i, &odv); IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_curve_regions_get(" - << "i, &num_regions, regions): " - << amd::smi::getRSMIStatusString(err, false) << "\n" - << "\t**Number of regions: " << std::dec << num_regions - << "\n"; + std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): " + << amd::smi::getRSMIStatusString(err, false) << "\n" + << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) + << "\t**odv.num_regions = " << std::dec + << odv.num_regions << "\n"; } - ASSERT_TRUE(err == RSMI_STATUS_SUCCESS - || err == RSMI_STATUS_NOT_SUPPORTED - || err == RSMI_STATUS_UNEXPECTED_DATA - || err == RSMI_STATUS_UNEXPECTED_SIZE - || err == RSMI_STATUS_INVALID_ARGS); - if (err != RSMI_STATUS_SUCCESS) { + if (err == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::cout << "\t**Frequency-voltage curve data:" << "\n"; + std::cout << amd::smi::print_rsmi_od_volt_freq_data_t(&odv); + + rsmi_freq_volt_region_t *regions{}; + uint32_t num_regions; + regions = new rsmi_freq_volt_region_t[odv.num_regions]; + ASSERT_NE(regions, nullptr); + + num_regions = odv.num_regions; + err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions); IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: " - "Not supported on this machine" << std::endl; + std::cout << "\t**rsmi_dev_od_volt_curve_regions_get(" + << "i, &num_regions, regions): " + << amd::smi::getRSMIStatusString(err, false) << "\n" + << "\t**Number of regions: " << std::dec << num_regions + << "\n"; } - continue; + ASSERT_TRUE(err == 
RSMI_STATUS_SUCCESS + || err == RSMI_STATUS_NOT_SUPPORTED + || err == RSMI_STATUS_UNEXPECTED_DATA + || err == RSMI_STATUS_UNEXPECTED_SIZE + || err == RSMI_STATUS_INVALID_ARGS); + if (err != RSMI_STATUS_SUCCESS) { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: " + "Not supported on this machine" << std::endl; + } + continue; + } + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_SUCCESS); + ASSERT_EQ(num_regions, odv.num_regions); + + std::cout << "\t**Frequency-voltage curve regions:" << std::endl; + std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions, + regions); + + delete []regions; + } + } + else { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_info_get: Not supported on this " + "machine" << std::endl; } - CHK_ERR_ASRT(err) - ASSERT_TRUE(num_regions == odv.num_regions); - - std::cout << "\t**Frequency-voltage curve regions:" << std::endl; - std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions, - regions); - - delete []regions; } } } From e7d54946fb6bd578ee94752305f71aaf84555197 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 14 May 2024 19:36:52 -0500 Subject: [PATCH 3/3] fix: [MIT-License] [rocm/rocm_smi_lib] Updates the license to MIT Code changes related to the following: None Change-Id: I62d0a5f02a2d5e58c1952337dff54892793c16cf Signed-off-by: Oliveira, Daniel --- License.txt | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/License.txt b/License.txt index 7d64f3652c..31f950344e 100644 --- a/License.txt +++ b/License.txt @@ -1,38 +1,22 @@ -The University of Illinois/NCSA -Open Source License (NCSA) +MIT License -Copyright (c) 2014-2018, Advanced Micro Devices, Inc. All rights reserved. - -Developed by: - - AMD Research and AMD HSA Software Development - - Advanced Micro Devices, Inc. - - www.amd.com +Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal with the Software without restriction, including without limitation -the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following conditions: +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: - - Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimers in - the documentation and/or other materials provided with the distribution. - - Neither the names of Advanced Micro Devices, Inc, - nor the names of its contributors may be used to endorse or promote - products derived from this Software without specific prior written - permission. +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS WITH THE SOFTWARE. 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.