From 39c1c4334ea90c9364bdf26be6c3cc2d89eefd0c Mon Sep 17 00:00:00 2001
From: "Bill(Shuzhou) Liu"
Date: Thu, 18 Aug 2022 08:52:08 -0400
Subject: [PATCH] Support events in the amdsmi

Port the events handling from rocm-smi to amd-smi

Change-Id: I0b4cb30a585cb2188a24be0e21c1c156b461bb1d

[ROCm/amdsmi commit: 7b92c694a0e499ccf23f887528f8ea9a29c307c4]
---
 .../amd_smi/include/impl/amd_smi_system.h     |   3 +
 projects/amdsmi/amd_smi/src/amd_smi.cc        |  70 ++++++-
 projects/amdsmi/amd_smi/src/amd_smi_system.cc |  21 ++
 .../functional/evt_notif_read_write.cc        | 195 ++++++++++++++++++
 .../functional/evt_notif_read_write.h         |  73 +++++++
 projects/amdsmi/tests/amd_smi_test/main.cc    |  10 +-
 6 files changed, 362 insertions(+), 10 deletions(-)
 create mode 100755 projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.cc
 create mode 100755 projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.h

diff --git a/projects/amdsmi/amd_smi/include/impl/amd_smi_system.h b/projects/amdsmi/amd_smi/include/impl/amd_smi_system.h
index 4da41af5db..ee184235a8 100644
--- a/projects/amdsmi/amd_smi/include/impl/amd_smi_system.h
+++ b/projects/amdsmi/amd_smi/include/impl/amd_smi_system.h
@@ -72,6 +72,9 @@ class AMDSmiSystem {
   amdsmi_status_t handle_to_device(amdsmi_device_handle device_handle,
             AMDSmiDevice** device);
 
+  amdsmi_status_t gpu_index_to_handle(uint32_t gpu_index,
+            amdsmi_device_handle* device_handle);
+
  private:
   AMDSmiSystem() : init_flag_(AMD_SMI_INIT_ALL_DEVICES) {}
   uint64_t init_flag_;
diff --git a/projects/amdsmi/amd_smi/src/amd_smi.cc b/projects/amdsmi/amd_smi/src/amd_smi.cc
index 5909039371..9bd70e973b 100644
--- a/projects/amdsmi/amd_smi/src/amd_smi.cc
+++ b/projects/amdsmi/amd_smi/src/amd_smi.cc
@@ -62,6 +62,11 @@
 #include "rocm_smi/rocm_smi.h"
 #include "impl/amdgpu_drm.h"
 
+// TODO(bliu): One to one map to all status code
+static amdsmi_status_t rsmi_to_amdsmi_status(rsmi_status_t status) {
+  if (status == RSMI_STATUS_NO_DATA) return AMDSMI_STATUS_NO_DATA;
+  return static_cast<amdsmi_status_t>(status);
+}
 
 template
 amdsmi_status_t rsmi_wrapper(F && f,
@@ -79,7 +84,7 @@ amdsmi_status_t rsmi_wrapper(F && f,
     uint32_t gpu_index = gpu_device->get_gpu_id();
     auto r = std::forward(f)(gpu_index,
                     std::forward(args)...);
-    return static_cast(r);
+    return rsmi_to_amdsmi_status(r);
   }
 
   return AMDSMI_STATUS_NOT_SUPPORTED;
@@ -98,7 +103,7 @@ amdsmi_shut_down() {
 amdsmi_status_t amdsmi_status_string(amdsmi_status_t status,
                     const char **status_string) {
   if (status <= AMDSMI_LIB_START) {
-    return static_cast(
+    return rsmi_to_amdsmi_status(
       rsmi_status_string(static_cast(status), status_string));
   }
   switch (status) {
@@ -182,12 +187,14 @@ amdsmi_status_t amdsmi_get_device_type(amdsmi_device_handle device_handle ,
   return AMDSMI_STATUS_SUCCESS;
 }
 
-amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle, amdsmi_board_info_t *board_info) {
+amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle,
+            amdsmi_board_info_t *board_info) {
   if (board_info == NULL) {
     return AMDSMI_STATUS_INVAL;
   }
 
-  return rsmi_wrapper(rsmi_dev_name_get, device_handle, board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH);
+  return rsmi_wrapper(rsmi_dev_name_get, device_handle,
+            board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH);
 }
 
 amdsmi_status_t amdsmi_dev_temp_metric_get(amdsmi_device_handle device_handle,
@@ -201,7 +208,8 @@ amdsmi_status_t amdsmi_dev_temp_metric_get(amdsmi_device_handle device_handle,
             static_cast(metric), temperature);
 }
 
-amdsmi_status_t amdsmi_get_vram_usage(amdsmi_device_handle device_handle, amdsmi_vram_info_t *vram_info) {
+amdsmi_status_t amdsmi_get_vram_usage(amdsmi_device_handle device_handle,
+            amdsmi_vram_info_t *vram_info) {
   if (vram_info == NULL) {
     return AMDSMI_STATUS_INVAL;
   }
@@ -377,4 +385,56 @@ amdsmi_status_t amdsmi_dev_subsystem_vendor_id_get(
   return rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get,
                       device_handle, id);
 }
+amdsmi_status_t
+amdsmi_event_notification_init(amdsmi_device_handle device_handle) {
+  return rsmi_wrapper(rsmi_event_notification_init, device_handle);
+}
+amdsmi_status_t
+amdsmi_event_notification_mask_set(amdsmi_device_handle device_handle,
+          uint64_t mask) {
+  return rsmi_wrapper(rsmi_event_notification_mask_set, device_handle, mask);
+}
+
+// Reads pending events from rocm-smi and converts them to amdsmi records.
+// On entry *num_elem is the capacity of data; rocm-smi updates it to the
+// number of entries filled. AMDSMI_STATUS_INSUFFICIENT_SIZE still delivers
+// entries (the test in this patch reads them), so data is converted then too.
+amdsmi_status_t
+amdsmi_event_notification_get(int timeout_ms,
+          uint32_t *num_elem, amdsmi_evt_notification_data_t *data) {
+  if (num_elem == nullptr || data == nullptr || *num_elem == 0) {
+    return AMDSMI_STATUS_INVAL;
+  }
+
+  // Get the rsmi data
+  std::vector<rsmi_evt_notification_data_t> r_data(*num_elem);
+  const rsmi_status_t rsmi_ret = rsmi_event_notification_get(
+          timeout_ms, num_elem, r_data.data());
+  // A full buffer is still a successful read; only bail out on real errors.
+  if (rsmi_ret != RSMI_STATUS_SUCCESS &&
+      rsmi_ret != RSMI_STATUS_INSUFFICIENT_SIZE)
+    return rsmi_to_amdsmi_status(rsmi_ret);
+
+  // convert output
+  for (uint32_t i = 0; i < *num_elem; i++) {
+    const rsmi_evt_notification_data_t &rsmi_data = r_data[i];
+    data[i].event = static_cast<amdsmi_evt_notification_type_t>(
+          rsmi_data.event);
+    strncpy(data[i].message, rsmi_data.message,
+          MAX_EVENT_NOTIFICATION_MSG_SIZE);
+    // Force termination; assumes message holds the strncpy bound in bytes.
+    data[i].message[MAX_EVENT_NOTIFICATION_MSG_SIZE - 1] = '\0';
+    const amdsmi_status_t handle_ret = amd::smi::AMDSmiSystem::getInstance()
+        .gpu_index_to_handle(rsmi_data.dv_ind, &(data[i].device_handle));
+    if (handle_ret != AMDSMI_STATUS_SUCCESS)
+      return handle_ret;
+  }
+
+  return rsmi_to_amdsmi_status(rsmi_ret);
+}
+
+amdsmi_status_t amdsmi_event_notification_stop(
+          amdsmi_device_handle device_handle) {
+  return rsmi_wrapper(rsmi_event_notification_stop, device_handle);
+}
 
diff --git a/projects/amdsmi/amd_smi/src/amd_smi_system.cc b/projects/amdsmi/amd_smi/src/amd_smi_system.cc
index 3b39d88dc6..4f8e931ee1 100644
--- a/projects/amdsmi/amd_smi/src/amd_smi_system.cc
+++ b/projects/amdsmi/amd_smi/src/amd_smi_system.cc
@@ -156,6 +156,27 @@ amdsmi_status_t AMDSmiSystem::handle_to_device(
     return AMDSMI_STATUS_INVAL;
 }
 
+// Looks up the handle of the GPU device whose rocm-smi index is gpu_index.
+// Returns AMDSMI_STATUS_INVAL when device_handle is null or no GPU matches.
+amdsmi_status_t AMDSmiSystem::gpu_index_to_handle(uint32_t gpu_index,
+                    amdsmi_device_handle* device_handle) {
+  if (device_handle == nullptr)
+    return AMDSMI_STATUS_INVAL;
+
+  for (auto cur_device : devices_) {
+    // Only GPU devices are compared by gpu id; skip everything else.
+    if (cur_device->get_device_type() != AMD_GPU)
+      continue;
+    amd::smi::AMDSmiGPUDevice* gpu_device =
+        static_cast<amd::smi::AMDSmiGPUDevice*>(cur_device);
+    if (gpu_device->get_gpu_id() == gpu_index) {
+      *device_handle = cur_device;
+      return AMDSMI_STATUS_SUCCESS;
+    }
+  }
+  return AMDSMI_STATUS_INVAL;
+}
+
 }  // namespace smi
 }  // namespace amd
 
diff --git a/projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.cc b/projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.cc
new file mode 100755
index 0000000000..ae086cfe4f
--- /dev/null
+++ b/projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.cc
@@ -0,0 +1,195 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of ,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "amd_smi.h"
+#include "amd_smi_test/functional/evt_notif_read_write.h"
+#include "amd_smi_test/test_common.h"
+#include "amd_smi_test/test_utils.h"
+
+TestEvtNotifReadWrite::TestEvtNotifReadWrite() : TestBase() {
+  set_title("AMDSMI Event Notification Read/Write Test");
+  set_description("The Event Notification Read/Write tests verifies that "
+      "we can configure to collect various event types and then read them");
+}
+
+TestEvtNotifReadWrite::~TestEvtNotifReadWrite(void) {}
+
+void TestEvtNotifReadWrite::SetUp(void) {
+  TestBase::SetUp();
+}
+
+void TestEvtNotifReadWrite::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void TestEvtNotifReadWrite::DisplayResults(void) const {
+  TestBase::DisplayResults();
+}
+
+void TestEvtNotifReadWrite::Close() {
+  // This will close handles opened within amdsmitst utility calls and call
+  // amdsmi_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+// Subscribes every device to all event types, then reads pending events
+// (a second time after a GPU pre-reset event) and stops notifications.
+void TestEvtNotifReadWrite::Run(void) {
+  amdsmi_status_t ret;
+
+  TestBase::Run();
+  if (num_monitor_devs() == 0) {
+    return;
+  }
+
+  if (setup_failed_) {
+    IF_VERB(STANDARD) {
+      std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
+    }
+    return;
+  }
+
+  amdsmi_evt_notification_type_t evt_type = AMDSMI_EVT_NOTIF_FIRST;
+  uint64_t mask = 0;
+  while (evt_type <= AMDSMI_EVT_NOTIF_LAST) {
+    mask |= AMDSMI_EVENT_MASK_FROM_INDEX(evt_type);
+    evt_type = static_cast<amdsmi_evt_notification_type_t>(
+        static_cast<uint32_t>(evt_type)+1);
+  }
+
+  for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
+    ret = amdsmi_event_notification_init(device_handles_[dv_ind]);
+    if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
+      IF_VERB(STANDARD) {
+        std::cout <<
+          "Event notification is not supported for this driver version." << std::endl;
+      }
+      return;
+    }
+    ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
+    ret = amdsmi_event_notification_mask_set(device_handles_[dv_ind], mask);
+    ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
+  }
+
+  const uint32_t kMaxEvents = 10;
+  amdsmi_evt_notification_data_t data[kMaxEvents];
+  uint32_t num_elem = kMaxEvents;
+  bool read_again = false;
+
+  ret = amdsmi_event_notification_get(10000, &num_elem, data);
+  if (ret == AMDSMI_STATUS_SUCCESS || ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
+    EXPECT_LE(num_elem, kMaxEvents) <<
+        "Expected the number of elements found to be <= buffer size (10)";
+    for (uint32_t i = 0; i < num_elem; ++i) {
+      IF_VERB(STANDARD) {
+        std::cout << "\tdv_handle=" << data[i].device_handle <<
+            " Type: " << NameFromEvtNotifType(data[i].event) <<
+            " Mesg: " << data[i].message << std::endl;
+      }
+      if (data[i].event == AMDSMI_EVT_NOTIF_GPU_PRE_RESET) {
+        read_again = true;  // must not depend on the verbosity level
+      }
+    }
+    IF_VERB(STANDARD) {
+      if (ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
+        std::cout <<
+            "\t\tBuffer size is 10, but more than 10 events are available." <<
+            std::endl;
+      }
+    }
+  } else if (ret == AMDSMI_STATUS_NO_DATA) {
+    IF_VERB(STANDARD) {
+      std::cout << "\tNo events were collected." << std::endl;
+    }
+  } else {
+    // This should always fail. We want to print out the return code.
+    EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS) <<
+        "Unexpected return code for amdsmi_event_notification_get()";
+  }
+
+  // In case GPU Pre reset event was collected in the previous read,
+  // read again to get the GPU Post reset event.
+  if (read_again) {
+    num_elem = kMaxEvents;  // first read replaced it with the event count
+    ret = amdsmi_event_notification_get(10000, &num_elem, data);
+    if (ret == AMDSMI_STATUS_SUCCESS || ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
+      EXPECT_LE(num_elem, kMaxEvents) <<
+          "Expected the number of elements found to be <= buffer size (10)";
+      IF_VERB(STANDARD) {
+        for (uint32_t i = 0; i < num_elem; ++i) {
+          std::cout << "\tdv_handle=" << data[i].device_handle <<
+              " Type: " << NameFromEvtNotifType(data[i].event) <<
+              " Mesg: " << data[i].message << std::endl;
+        }
+      }
+      IF_VERB(STANDARD) {
+        if (ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
+          std::cout <<
+              "\t\tBuffer size is 10, but more than 10 events are available." <<
+              std::endl;
+        }
+      }
+    } else if (ret == AMDSMI_STATUS_NO_DATA) {
+      IF_VERB(STANDARD) {
+        std::cout << "\tNo further events were collected." << std::endl;
+      }
+    } else {
+      // This should always fail. We want to print out the return code.
+      EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS) <<
+          "Unexpected return code for amdsmi_event_notification_get()";
+    }
+  }
+
+  for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
+    ret = amdsmi_event_notification_stop(device_handles_[dv_ind]);
+    ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
+  }
+}
diff --git a/projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.h b/projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.h
new file mode 100755
index 0000000000..1238e7c777
--- /dev/null
+++ b/projects/amdsmi/tests/amd_smi_test/functional/evt_notif_read_write.h
@@ -0,0 +1,73 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of ,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+#ifndef TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
+#define TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
+
+#include "amd_smi_test/test_base.h"
+
+class TestEvtNotifReadWrite : public TestBase {
+ public:
+  TestEvtNotifReadWrite();
+
+  // @Brief: Destructor for test case of TestEvtNotifReadWrite
+  virtual ~TestEvtNotifReadWrite();
+
+  // @Brief: Set up the environment for the test
+  virtual void SetUp();
+
+  // @Brief: Core test execution
+  virtual void Run();
+
+  // @Brief: Clean up and retrieve the resource
+  virtual void Close();
+
+  // @Brief: Display results
+  virtual void DisplayResults() const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+};
+
+#endif  // TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
diff --git a/projects/amdsmi/tests/amd_smi_test/main.cc b/projects/amdsmi/tests/amd_smi_test/main.cc
index dcaa2bfd86..0be9e57466 100644
--- a/projects/amdsmi/tests/amd_smi_test/main.cc
+++ b/projects/amdsmi/tests/amd_smi_test/main.cc
@@ -55,6 +55,7 @@
 
 #include "functional/fan_read.h"
 #include "functional/fan_read_write.h"
+#include "functional/evt_notif_read_write.h"
 /*
 #include "functional/temp_read.h"
 #include "functional/volt_read.h"
@@ -81,7 +82,6 @@
 #include "functional/mem_page_info_read.h"
 #include "functional/api_support_read.h"
 #include "functional/mutual_exclusion.h"
-#include "functional/evt_notif_read_write.h"
 #include "functional/init_shutdown_refcount.h"
 #include "amd_smi_test/functional/hw_topology_read.h"
 #include "amd_smi_test/functional/gpu_metrics_read.h"
@@ -153,6 +153,10 @@ TEST(amdsmitstReadWrite, FanReadWrite) {
   TestFanReadWrite tst;
   RunGenericTest(&tst);
 }
+TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) {
+  TestEvtNotifReadWrite tst;
+  RunGenericTest(&tst);
+}
 /*
 TEST(amdsmitstReadOnly, TempRead) {
   TestTempRead tst;
@@ -270,10 +274,6 @@ TEST(amdsmitstReadOnly, TestMutualExclusion) {
   tst.Run();
   RunCustomTestEpilog(&tst);
 }
-TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) {
-  TestEvtNotifReadWrite tst;
-  RunGenericTest(&tst);
-}
 TEST(amdsmitstReadOnly, TestConcurrentInit) {
   TestConcurrentInit tst;
   SetFlags(&tst);