Support events in the amdsmi

Port the events handling from rocm-smi to amd-smi

Change-Id: I0b4cb30a585cb2188a24be0e21c1c156b461bb1d


[ROCm/amdsmi commit: 7b92c694a0]
Tá an tiomantas seo le fáil i:
Bill(Shuzhou) Liu
2022-08-18 08:52:08 -04:00
tuismitheoir 06a481c563
tiomantas 39c1c4334e
D'athraigh 6 comhad le 354 breiseanna agus 10 scriosta
@@ -72,6 +72,9 @@ class AMDSmiSystem {
amdsmi_status_t handle_to_device(amdsmi_device_handle device_handle,
AMDSmiDevice** device);
amdsmi_status_t gpu_index_to_handle(uint32_t gpu_index,
amdsmi_device_handle* device_handle);
private:
AMDSmiSystem() : init_flag_(AMD_SMI_INIT_ALL_DEVICES) {}
uint64_t init_flag_;
+57 -5
Féach ar an gComhad
@@ -62,6 +62,11 @@
#include "rocm_smi/rocm_smi.h"
#include "impl/amdgpu_drm.h"
// TODO(bliu): One to one map to all status code
// Translates a rocm-smi status code into an amd-smi status code.
// RSMI_STATUS_NO_DATA is mapped explicitly to AMDSMI_STATUS_NO_DATA; every
// other code is carried over unchanged by numeric value.
static amdsmi_status_t rsmi_to_amdsmi_status(rsmi_status_t status) {
  const bool is_no_data = (status == RSMI_STATUS_NO_DATA);
  return is_no_data ? AMDSMI_STATUS_NO_DATA
                    : static_cast<amdsmi_status_t>(status);
}
template <typename F, typename ...Args>
amdsmi_status_t rsmi_wrapper(F && f,
@@ -79,7 +84,7 @@ amdsmi_status_t rsmi_wrapper(F && f,
uint32_t gpu_index = gpu_device->get_gpu_id();
auto r = std::forward<F>(f)(gpu_index,
std::forward<Args>(args)...);
return static_cast<amdsmi_status_t>(r);
return rsmi_to_amdsmi_status(r);
}
return AMDSMI_STATUS_NOT_SUPPORTED;
@@ -98,7 +103,7 @@ amdsmi_shut_down() {
amdsmi_status_t
amdsmi_status_string(amdsmi_status_t status, const char **status_string) {
if (status <= AMDSMI_LIB_START) {
return static_cast<amdsmi_status_t>(
return rsmi_to_amdsmi_status(
rsmi_status_string(static_cast<rsmi_status_t>(status), status_string));
}
switch (status) {
@@ -182,12 +187,14 @@ amdsmi_status_t amdsmi_get_device_type(amdsmi_device_handle device_handle ,
return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle, amdsmi_board_info_t *board_info) {
amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle,
amdsmi_board_info_t *board_info) {
if (board_info == NULL) {
return AMDSMI_STATUS_INVAL;
}
return rsmi_wrapper(rsmi_dev_name_get, device_handle, board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH);
return rsmi_wrapper(rsmi_dev_name_get, device_handle,
board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH);
}
amdsmi_status_t amdsmi_dev_temp_metric_get(amdsmi_device_handle device_handle,
@@ -201,7 +208,8 @@ amdsmi_status_t amdsmi_dev_temp_metric_get(amdsmi_device_handle device_handle,
static_cast<rsmi_temperature_metric_t>(metric), temperature);
}
amdsmi_status_t amdsmi_get_vram_usage(amdsmi_device_handle device_handle, amdsmi_vram_info_t *vram_info) {
amdsmi_status_t amdsmi_get_vram_usage(amdsmi_device_handle device_handle,
amdsmi_vram_info_t *vram_info) {
if (vram_info == NULL) {
return AMDSMI_STATUS_INVAL;
}
@@ -377,4 +385,48 @@ amdsmi_status_t amdsmi_dev_subsystem_vendor_id_get(
return rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle, id);
}
// Forwards to rsmi_event_notification_init() for the device identified by
// device_handle, using rsmi_wrapper() to make the call, and returns the
// resulting status.
amdsmi_status_t
amdsmi_event_notification_init(amdsmi_device_handle device_handle) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_event_notification_init, device_handle);
  return status;
}
// Forwards to rsmi_event_notification_mask_set() for the device identified
// by device_handle, passing the event mask through unchanged, and returns
// the resulting status.
amdsmi_status_t
amdsmi_event_notification_mask_set(amdsmi_device_handle device_handle,
                                   uint64_t mask) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_event_notification_mask_set, device_handle, mask);
  return status;
}
// Collects pending event notifications through rsmi_event_notification_get()
// and converts them into amd-smi records.
//
// timeout_ms: passed through unchanged to rsmi_event_notification_get().
// num_elem:   in: capacity of `data` in elements; out: whatever
//             rsmi_event_notification_get() stored in it.
// data:       caller-provided array of at least *num_elem records.
//
// Returns AMDSMI_STATUS_INVAL when num_elem or data is null or when the
// capacity is zero, the translated rocm-smi status when the underlying call
// does not succeed, the status of gpu_index_to_handle() when an event's GPU
// index cannot be resolved to a handle, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t
amdsmi_event_notification_get(int timeout_ms,
                uint32_t *num_elem, amdsmi_evt_notification_data_t *data) {
  // A zero-capacity buffer cannot hold any event; rejecting it here also
  // avoids taking the address of element 0 of an empty vector below.
  if (num_elem == nullptr || data == nullptr || *num_elem == 0) {
    return AMDSMI_STATUS_INVAL;
  }

  // Get the rsmi data
  std::vector<rsmi_evt_notification_data_t> r_data(*num_elem);
  const rsmi_status_t rsmi_ret = rsmi_event_notification_get(
      timeout_ms, num_elem, r_data.data());
  if (rsmi_ret != RSMI_STATUS_SUCCESS) {
    return rsmi_to_amdsmi_status(rsmi_ret);
  }

  // convert output
  for (uint32_t i = 0; i < *num_elem; i++) {
    const rsmi_evt_notification_data_t &rsmi_data = r_data[i];
    data[i].event = static_cast<amdsmi_evt_notification_type_t>(
        rsmi_data.event);

    // strncpy() does not terminate the destination when the source fills the
    // whole buffer, so terminate explicitly.
    strncpy(data[i].message, rsmi_data.message,
            MAX_EVENT_NOTIFICATION_MSG_SIZE);
    data[i].message[MAX_EVENT_NOTIFICATION_MSG_SIZE - 1] = '\0';

    const amdsmi_status_t handle_ret = amd::smi::AMDSmiSystem::getInstance()
        .gpu_index_to_handle(rsmi_data.dv_ind, &(data[i].device_handle));
    if (handle_ret != AMDSMI_STATUS_SUCCESS) {
      return handle_ret;
    }
  }

  return AMDSMI_STATUS_SUCCESS;
}
// Forwards to rsmi_event_notification_stop() for the device identified by
// device_handle, using rsmi_wrapper() to make the call, and returns the
// resulting status.
amdsmi_status_t amdsmi_event_notification_stop(
                amdsmi_device_handle device_handle) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_event_notification_stop, device_handle);
  return status;
}
+21
Féach ar an gComhad
@@ -156,6 +156,27 @@ amdsmi_status_t AMDSmiSystem::handle_to_device(
return AMDSMI_STATUS_INVAL;
}
// Finds the device whose GPU id equals gpu_index and stores it in
// *device_handle.
//
// Only entries of devices_ whose device type is AMD_GPU are considered.
// Returns AMDSMI_STATUS_SUCCESS when a match is found, and
// AMDSMI_STATUS_INVAL when device_handle is null or no GPU device has the
// requested index.
amdsmi_status_t AMDSmiSystem::gpu_index_to_handle(uint32_t gpu_index,
                    amdsmi_device_handle* device_handle) {
  if (device_handle == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  for (auto candidate : devices_) {
    // Non-GPU devices cannot match a GPU index.
    if (candidate->get_device_type() == AMD_GPU) {
      amd::smi::AMDSmiGPUDevice* as_gpu =
          static_cast<amd::smi::AMDSmiGPUDevice*>(candidate);
      if (as_gpu->get_gpu_id() == gpu_index) {
        *device_handle = candidate;
        return AMDSMI_STATUS_SUCCESS;
      }
    }
  }

  // No GPU device carries the requested index.
  return AMDSMI_STATUS_INVAL;
}
} // namespace smi
} // namespace amd
@@ -0,0 +1,195 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2022, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <stdint.h>
#include <stddef.h>
#include <iostream>
#include "gtest/gtest.h"
#include "amd_smi.h"
#include "amd_smi_test/functional/evt_notif_read_write.h"
#include "amd_smi_test/test_common.h"
#include "amd_smi_test/test_utils.h"
// Constructs the test case and sets the title and description stored via
// set_title() / set_description().
TestEvtNotifReadWrite::TestEvtNotifReadWrite()
    : TestBase() {
  set_title("AMDSMI Event Notification Read/Write Test");
  set_description(
      "The Event Notification Read/Write tests verifies that "
      "we can configure to collect various event types and then read them");
}
// Nothing to release here: the destructor body is intentionally empty.
TestEvtNotifReadWrite::~TestEvtNotifReadWrite() {}
// Delegates entirely to TestBase::SetUp(); this test adds no setup of its
// own.
void TestEvtNotifReadWrite::SetUp() {
  TestBase::SetUp();
}
// Delegates entirely to TestBase::DisplayTestInfo().
void TestEvtNotifReadWrite::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
// Delegates entirely to TestBase::DisplayResults().
void TestEvtNotifReadWrite::DisplayResults() const {
  TestBase::DisplayResults();
}
// Delegates to TestBase::Close().
void TestEvtNotifReadWrite::Close() {
  // TestBase::Close() closes the handles opened by amdsmitst utility calls
  // and calls amdsmi_shut_down(), so it has to run after any other hsa
  // cleanup.
  TestBase::Close();
}
void TestEvtNotifReadWrite::Run(void) {
amdsmi_status_t ret;
uint32_t dv_ind;
TestBase::Run();
if (num_monitor_devs() == 0) {
return;
}
if (setup_failed_) {
IF_VERB(STANDARD) {
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
}
return;
}
amdsmi_evt_notification_type_t evt_type = AMDSMI_EVT_NOTIF_FIRST;
uint64_t mask = AMDSMI_EVENT_MASK_FROM_INDEX(evt_type);
while (evt_type <= AMDSMI_EVT_NOTIF_LAST) {
mask |= AMDSMI_EVENT_MASK_FROM_INDEX(evt_type);
evt_type = static_cast<amdsmi_evt_notification_type_t>(
static_cast<uint32_t>(evt_type)+1);
}
for (dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
ret = amdsmi_event_notification_init(device_handles_[dv_ind]);
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
IF_VERB(STANDARD) {
std::cout <<
"Event notification is not supported for this driver version." << std::endl;
}
return;
}
ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
ret = amdsmi_event_notification_mask_set(device_handles_[dv_ind], mask);
ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
}
uint32_t num_elem = 10;
amdsmi_evt_notification_data_t data[num_elem];
bool read_again = false;
ret = amdsmi_event_notification_get(10000, &num_elem, data);
if (ret == AMDSMI_STATUS_SUCCESS || ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
EXPECT_LE(num_elem, 10) <<
"Expected the number of elements found to be <= buffer size (10)";
IF_VERB(STANDARD) {
for (uint32_t i = 0; i < num_elem; ++i) {
std::cout << "\tdv_handle=" << data[i].device_handle <<
" Type: " << NameFromEvtNotifType(data[i].event) <<
" Mesg: " << data[i].message << std::endl;
if (data[i].event == AMDSMI_EVT_NOTIF_GPU_PRE_RESET) {
read_again = true;
}
}
}
IF_VERB(STANDARD) {
if (ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
std::cout <<
"\t\tBuffer size is 10, but more than 10 events are available." <<
std::endl;
}
}
} else if (ret == AMDSMI_STATUS_NO_DATA) {
IF_VERB(STANDARD) {
std::cout << "\tNo events were collected." << std::endl;
}
} else {
// This should always fail. We want to print out the return code.
EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS) <<
"Unexpected return code for amdsmi_event_notification_get()";
}
// In case GPU Pre reset event was collected in the previous read,
// read again to get the GPU Post reset event.
if (read_again) {
ret = amdsmi_event_notification_get(10000, &num_elem, data);
if (ret == AMDSMI_STATUS_SUCCESS || ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
EXPECT_LE(num_elem, 10) <<
"Expected the number of elements found to be <= buffer size (10)";
IF_VERB(STANDARD) {
for (uint32_t i = 0; i < num_elem; ++i) {
std::cout << "\tdv_handle=" << data[i].device_handle <<
" Type: " << NameFromEvtNotifType(data[i].event) <<
" Mesg: " << data[i].message << std::endl;
}
}
IF_VERB(STANDARD) {
if (ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
std::cout <<
"\t\tBuffer size is 10, but more than 10 events are available." <<
std::endl;
}
}
} else if (ret == AMDSMI_STATUS_NO_DATA) {
IF_VERB(STANDARD) {
std::cout << "\tNo further events were collected." << std::endl;
}
} else {
// This should always fail. We want to print out the return code.
EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS) <<
"Unexpected return code for amdsmi_event_notification_get()";
}
}
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
ret = amdsmi_event_notification_stop(device_handles_[dv_ind]);
ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
}
}
@@ -0,0 +1,73 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2022, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
#define TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
#include "amd_smi_test/test_base.h"
// Functional test case for the event notification API, built on TestBase.
class TestEvtNotifReadWrite : public TestBase {
 public:
  // @Brief: Constructor for the TestEvtNotifReadWrite test case
  TestEvtNotifReadWrite();

  // @Brief: Destructor for the TestEvtNotifReadWrite test case
  virtual ~TestEvtNotifReadWrite();

  // @Brief: Prepare the environment before the test runs
  virtual void SetUp();

  // @Brief: Execute the core of the test
  virtual void Run();

  // @Brief: Clean up and release the resources
  virtual void Close();

  // @Brief: Display the results
  virtual void DisplayResults() const;

  // @Brief: Display information about what this test does
  virtual void DisplayTestInfo();
};
#endif // TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
+5 -5
Féach ar an gComhad
@@ -55,6 +55,7 @@
#include "functional/fan_read.h"
#include "functional/fan_read_write.h"
#include "functional/evt_notif_read_write.h"
/*
#include "functional/temp_read.h"
#include "functional/volt_read.h"
@@ -81,7 +82,6 @@
#include "functional/mem_page_info_read.h"
#include "functional/api_support_read.h"
#include "functional/mutual_exclusion.h"
#include "functional/evt_notif_read_write.h"
#include "functional/init_shutdown_refcount.h"
#include "amd_smi_test/functional/hw_topology_read.h"
#include "amd_smi_test/functional/gpu_metrics_read.h"
@@ -153,6 +153,10 @@ TEST(amdsmitstReadWrite, FanReadWrite) {
TestFanReadWrite tst;
RunGenericTest(&tst);
}
// gtest entry for the event notification read/write test: constructs a
// TestEvtNotifReadWrite instance and hands it to RunGenericTest().
TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) {
TestEvtNotifReadWrite tst;
RunGenericTest(&tst);
}
/*
TEST(amdsmitstReadOnly, TempRead) {
TestTempRead tst;
@@ -270,10 +274,6 @@ TEST(amdsmitstReadOnly, TestMutualExclusion) {
tst.Run();
RunCustomTestEpilog(&tst);
}
// gtest entry for the event notification read/write test: constructs a
// TestEvtNotifReadWrite instance and hands it to RunGenericTest().
TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) {
TestEvtNotifReadWrite tst;
RunGenericTest(&tst);
}
TEST(amdsmitstReadOnly, TestConcurrentInit) {
TestConcurrentInit tst;
SetFlags(&tst);