Support events in the amdsmi
Port the events handling from rocm-smi to amd-smi
Change-Id: I0b4cb30a585cb2188a24be0e21c1c156b461bb1d
[ROCm/amdsmi commit: 7b92c694a0]
Tá an tiomantas seo le fáil i:
@@ -72,6 +72,9 @@ class AMDSmiSystem {
|
||||
amdsmi_status_t handle_to_device(amdsmi_device_handle device_handle,
|
||||
AMDSmiDevice** device);
|
||||
|
||||
amdsmi_status_t gpu_index_to_handle(uint32_t gpu_index,
|
||||
amdsmi_device_handle* device_handle);
|
||||
|
||||
private:
|
||||
AMDSmiSystem() : init_flag_(AMD_SMI_INIT_ALL_DEVICES) {}
|
||||
uint64_t init_flag_;
|
||||
|
||||
@@ -62,6 +62,11 @@
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "impl/amdgpu_drm.h"
|
||||
|
||||
// TODO(bliu): One to one map to all status code
|
||||
static amdsmi_status_t rsmi_to_amdsmi_status(rsmi_status_t status) {
|
||||
if (status == RSMI_STATUS_NO_DATA) return AMDSMI_STATUS_NO_DATA;
|
||||
return static_cast<amdsmi_status_t>(status);
|
||||
}
|
||||
|
||||
template <typename F, typename ...Args>
|
||||
amdsmi_status_t rsmi_wrapper(F && f,
|
||||
@@ -79,7 +84,7 @@ amdsmi_status_t rsmi_wrapper(F && f,
|
||||
uint32_t gpu_index = gpu_device->get_gpu_id();
|
||||
auto r = std::forward<F>(f)(gpu_index,
|
||||
std::forward<Args>(args)...);
|
||||
return static_cast<amdsmi_status_t>(r);
|
||||
return rsmi_to_amdsmi_status(r);
|
||||
}
|
||||
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
@@ -98,7 +103,7 @@ amdsmi_shut_down() {
|
||||
amdsmi_status_t
|
||||
amdsmi_status_string(amdsmi_status_t status, const char **status_string) {
|
||||
if (status <= AMDSMI_LIB_START) {
|
||||
return static_cast<amdsmi_status_t>(
|
||||
return rsmi_to_amdsmi_status(
|
||||
rsmi_status_string(static_cast<rsmi_status_t>(status), status_string));
|
||||
}
|
||||
switch (status) {
|
||||
@@ -182,12 +187,14 @@ amdsmi_status_t amdsmi_get_device_type(amdsmi_device_handle device_handle ,
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle, amdsmi_board_info_t *board_info) {
|
||||
amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle,
|
||||
amdsmi_board_info_t *board_info) {
|
||||
if (board_info == NULL) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
return rsmi_wrapper(rsmi_dev_name_get, device_handle, board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH);
|
||||
return rsmi_wrapper(rsmi_dev_name_get, device_handle,
|
||||
board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH);
|
||||
}
|
||||
|
||||
amdsmi_status_t amdsmi_dev_temp_metric_get(amdsmi_device_handle device_handle,
|
||||
@@ -201,7 +208,8 @@ amdsmi_status_t amdsmi_dev_temp_metric_get(amdsmi_device_handle device_handle,
|
||||
static_cast<rsmi_temperature_metric_t>(metric), temperature);
|
||||
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_vram_usage(amdsmi_device_handle device_handle, amdsmi_vram_info_t *vram_info) {
|
||||
amdsmi_status_t amdsmi_get_vram_usage(amdsmi_device_handle device_handle,
|
||||
amdsmi_vram_info_t *vram_info) {
|
||||
if (vram_info == NULL) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
@@ -377,4 +385,48 @@ amdsmi_status_t amdsmi_dev_subsystem_vendor_id_get(
|
||||
return rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle, id);
|
||||
}
|
||||
|
||||
// Forwards to rsmi_event_notification_init for the GPU behind device_handle.
// The handle-to-index translation and the status conversion are done by
// rsmi_wrapper.
amdsmi_status_t
amdsmi_event_notification_init(amdsmi_device_handle device_handle) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_event_notification_init, device_handle);
  return status;
}
|
||||
|
||||
// Forwards `mask` to rsmi_event_notification_mask_set for the GPU behind
// device_handle. The handle-to-index translation and the status conversion
// are done by rsmi_wrapper.
amdsmi_status_t
amdsmi_event_notification_mask_set(amdsmi_device_handle device_handle,
                                   uint64_t mask) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_event_notification_mask_set, device_handle, mask);
  return status;
}
|
||||
|
||||
// Reads pending event notifications through rocm-smi and converts them into
// amd-smi records.
//
// timeout_ms  Forwarded unchanged to rsmi_event_notification_get.
// num_elem    In: number of elements `data` can hold. Out: the element count
//             written back by rsmi_event_notification_get.
// data        Caller-provided output buffer with room for *num_elem entries.
//
// Returns AMDSMI_STATUS_INVAL when num_elem or data is null. Returns the
// status of gpu_index_to_handle when an event's GPU index cannot be mapped to
// a device handle. Otherwise returns the rocm-smi status translated by
// rsmi_to_amdsmi_status.
amdsmi_status_t
amdsmi_event_notification_get(int timeout_ms,
                  uint32_t *num_elem, amdsmi_evt_notification_data_t *data) {
  if (num_elem == nullptr || data == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // Get the rsmi data. data() is used instead of &r_data[0] because indexing
  // an empty vector (when *num_elem is 0) is undefined behaviour.
  std::vector<rsmi_evt_notification_data_t> r_data(*num_elem);
  const rsmi_status_t rsmi_ret = rsmi_event_notification_get(
                                    timeout_ms, num_elem, r_data.data());

  // RSMI_STATUS_INSUFFICIENT_SIZE is treated as "buffer filled, more events
  // pending" (callers in this change read `data` on that status), so the
  // events that were read must still be converted and handed back.
  // NOTE(review): confirm against the rocm-smi documentation.
  const bool has_events = (rsmi_ret == RSMI_STATUS_SUCCESS ||
                           rsmi_ret == RSMI_STATUS_INSUFFICIENT_SIZE);
  if (!has_events) {
    return rsmi_to_amdsmi_status(rsmi_ret);
  }

  // Convert output.
  for (uint32_t i = 0; i < *num_elem; ++i) {
    const rsmi_evt_notification_data_t &src = r_data[i];
    amdsmi_evt_notification_data_t &dst = data[i];

    dst.event = static_cast<amdsmi_evt_notification_type_t>(src.event);

    // strncpy does not terminate the destination when the source fills the
    // whole buffer, so terminate explicitly.
    strncpy(dst.message, src.message, MAX_EVENT_NOTIFICATION_MSG_SIZE);
    dst.message[MAX_EVENT_NOTIFICATION_MSG_SIZE - 1] = '\0';

    const amdsmi_status_t map_ret = amd::smi::AMDSmiSystem::getInstance()
        .gpu_index_to_handle(src.dv_ind, &(dst.device_handle));
    if (map_ret != AMDSMI_STATUS_SUCCESS) {
      return map_ret;
    }
  }

  return (rsmi_ret == RSMI_STATUS_SUCCESS)
             ? AMDSMI_STATUS_SUCCESS
             : rsmi_to_amdsmi_status(rsmi_ret);
}
|
||||
|
||||
// Forwards to rsmi_event_notification_stop for the GPU behind device_handle.
// The handle-to-index translation and the status conversion are done by
// rsmi_wrapper.
amdsmi_status_t amdsmi_event_notification_stop(
                                amdsmi_device_handle device_handle) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_event_notification_stop, device_handle);
  return status;
}
|
||||
|
||||
@@ -156,6 +156,27 @@ amdsmi_status_t AMDSmiSystem::handle_to_device(
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
// Finds the device handle of the AMD GPU whose get_gpu_id() equals gpu_index.
//
// gpu_index      GPU index to search for.
// device_handle  Output; set to the matching entry of devices_ on success.
//
// Returns AMDSMI_STATUS_SUCCESS when a match is found, and
// AMDSMI_STATUS_INVAL when device_handle is null or no AMD GPU in devices_
// has that index.
amdsmi_status_t AMDSmiSystem::gpu_index_to_handle(uint32_t gpu_index,
                    amdsmi_device_handle* device_handle) {
  if (device_handle == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  for (auto candidate : devices_) {
    // Only GPU devices carry a GPU index; the downcast below is valid only
    // for them.
    if (candidate->get_device_type() == AMD_GPU) {
      auto* gpu = static_cast<amd::smi::AMDSmiGPUDevice*>(candidate);
      if (gpu->get_gpu_id() == gpu_index) {
        *device_handle = candidate;
        return AMDSMI_STATUS_SUCCESS;
      }
    }
  }

  return AMDSMI_STATUS_INVAL;
}
|
||||
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
+195
@@ -0,0 +1,195 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2022, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "amd_smi.h"
|
||||
#include "amd_smi_test/functional/evt_notif_read_write.h"
|
||||
#include "amd_smi_test/test_common.h"
|
||||
#include "amd_smi_test/test_utils.h"
|
||||
|
||||
// Sets the title and description that TestBase stores for this test case.
TestEvtNotifReadWrite::TestEvtNotifReadWrite() : TestBase() {
  set_title("AMDSMI Event Notification Read/Write Test");
  set_description(
      "The Event Notification Read/Write tests verifies that we can "
      "configure to collect various event types and then read them");
}
|
||||
|
||||
// Nothing to release here; the class adds no state of its own.
TestEvtNotifReadWrite::~TestEvtNotifReadWrite() = default;
|
||||
|
||||
// Delegates entirely to TestBase::SetUp(); this test needs no extra setup.
void TestEvtNotifReadWrite::SetUp() {
  TestBase::SetUp();
}
|
||||
|
||||
// Delegates entirely to TestBase::DisplayTestInfo().
void TestEvtNotifReadWrite::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
|
||||
|
||||
// Delegates entirely to TestBase::DisplayResults(); this test reports its
// findings while Run() executes.
void TestEvtNotifReadWrite::DisplayResults() const {
  TestBase::DisplayResults();
}
|
||||
|
||||
// Delegates to TestBase::Close().
void TestEvtNotifReadWrite::Close() {
  // TestBase::Close() closes the handles opened by the amdsmitst utility
  // calls and calls amdsmi_shut_down(), so it has to come after any other
  // cleanup.
  TestBase::Close();
}
|
||||
|
||||
void TestEvtNotifReadWrite::Run(void) {
|
||||
amdsmi_status_t ret;
|
||||
uint32_t dv_ind;
|
||||
|
||||
TestBase::Run();
|
||||
if (num_monitor_devs() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (setup_failed_) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
amdsmi_evt_notification_type_t evt_type = AMDSMI_EVT_NOTIF_FIRST;
|
||||
uint64_t mask = AMDSMI_EVENT_MASK_FROM_INDEX(evt_type);
|
||||
while (evt_type <= AMDSMI_EVT_NOTIF_LAST) {
|
||||
mask |= AMDSMI_EVENT_MASK_FROM_INDEX(evt_type);
|
||||
evt_type = static_cast<amdsmi_evt_notification_type_t>(
|
||||
static_cast<uint32_t>(evt_type)+1);
|
||||
}
|
||||
|
||||
for (dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
|
||||
ret = amdsmi_event_notification_init(device_handles_[dv_ind]);
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout <<
|
||||
"Event notification is not supported for this driver version." << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
|
||||
ret = amdsmi_event_notification_mask_set(device_handles_[dv_ind], mask);
|
||||
ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
|
||||
}
|
||||
|
||||
uint32_t num_elem = 10;
|
||||
amdsmi_evt_notification_data_t data[num_elem];
|
||||
bool read_again = false;
|
||||
|
||||
ret = amdsmi_event_notification_get(10000, &num_elem, data);
|
||||
if (ret == AMDSMI_STATUS_SUCCESS || ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
|
||||
EXPECT_LE(num_elem, 10) <<
|
||||
"Expected the number of elements found to be <= buffer size (10)";
|
||||
IF_VERB(STANDARD) {
|
||||
for (uint32_t i = 0; i < num_elem; ++i) {
|
||||
std::cout << "\tdv_handle=" << data[i].device_handle <<
|
||||
" Type: " << NameFromEvtNotifType(data[i].event) <<
|
||||
" Mesg: " << data[i].message << std::endl;
|
||||
if (data[i].event == AMDSMI_EVT_NOTIF_GPU_PRE_RESET) {
|
||||
read_again = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
IF_VERB(STANDARD) {
|
||||
if (ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
|
||||
std::cout <<
|
||||
"\t\tBuffer size is 10, but more than 10 events are available." <<
|
||||
std::endl;
|
||||
}
|
||||
}
|
||||
} else if (ret == AMDSMI_STATUS_NO_DATA) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\tNo events were collected." << std::endl;
|
||||
}
|
||||
} else {
|
||||
// This should always fail. We want to print out the return code.
|
||||
EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS) <<
|
||||
"Unexpected return code for amdsmi_event_notification_get()";
|
||||
}
|
||||
|
||||
// In case GPU Pre reset event was collected in the previous read,
|
||||
// read again to get the GPU Post reset event.
|
||||
if (read_again) {
|
||||
ret = amdsmi_event_notification_get(10000, &num_elem, data);
|
||||
if (ret == AMDSMI_STATUS_SUCCESS || ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
|
||||
EXPECT_LE(num_elem, 10) <<
|
||||
"Expected the number of elements found to be <= buffer size (10)";
|
||||
IF_VERB(STANDARD) {
|
||||
for (uint32_t i = 0; i < num_elem; ++i) {
|
||||
std::cout << "\tdv_handle=" << data[i].device_handle <<
|
||||
" Type: " << NameFromEvtNotifType(data[i].event) <<
|
||||
" Mesg: " << data[i].message << std::endl;
|
||||
}
|
||||
}
|
||||
IF_VERB(STANDARD) {
|
||||
if (ret == AMDSMI_STATUS_INSUFFICIENT_SIZE) {
|
||||
std::cout <<
|
||||
"\t\tBuffer size is 10, but more than 10 events are available." <<
|
||||
std::endl;
|
||||
}
|
||||
}
|
||||
} else if (ret == AMDSMI_STATUS_NO_DATA) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\tNo further events were collected." << std::endl;
|
||||
}
|
||||
} else {
|
||||
// This should always fail. We want to print out the return code.
|
||||
EXPECT_EQ(ret, AMDSMI_STATUS_SUCCESS) <<
|
||||
"Unexpected return code for amdsmi_event_notification_get()";
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
|
||||
ret = amdsmi_event_notification_stop(device_handles_[dv_ind]);
|
||||
ASSERT_EQ(ret, AMDSMI_STATUS_SUCCESS);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2022, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#ifndef TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
|
||||
#define TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
|
||||
|
||||
#include "amd_smi_test/test_base.h"
|
||||
|
||||
/// Functional test case for the amd-smi event-notification API. It configures
/// event collection on the monitored devices and then reads the events back.
class TestEvtNotifReadWrite : public TestBase {
 public:
  /// Constructs the test case and registers its title and description.
  TestEvtNotifReadWrite();

  /// Destroys the test case.
  virtual ~TestEvtNotifReadWrite();

  /// Sets up the environment for the test run.
  virtual void SetUp();

  /// Executes the core of the test.
  virtual void Run();

  /// Cleans up and releases the resources used by the test.
  virtual void Close();

  /// Displays the results.
  virtual void DisplayResults() const;

  /// Displays information about what this test does.
  virtual void DisplayTestInfo();
};
|
||||
|
||||
#endif // TESTS_AMD_SMI_TEST_FUNCTIONAL_EVT_NOTIF_READ_WRITE_H_
|
||||
@@ -55,6 +55,7 @@
|
||||
|
||||
#include "functional/fan_read.h"
|
||||
#include "functional/fan_read_write.h"
|
||||
#include "functional/evt_notif_read_write.h"
|
||||
/*
|
||||
#include "functional/temp_read.h"
|
||||
#include "functional/volt_read.h"
|
||||
@@ -81,7 +82,6 @@
|
||||
#include "functional/mem_page_info_read.h"
|
||||
#include "functional/api_support_read.h"
|
||||
#include "functional/mutual_exclusion.h"
|
||||
#include "functional/evt_notif_read_write.h"
|
||||
#include "functional/init_shutdown_refcount.h"
|
||||
#include "amd_smi_test/functional/hw_topology_read.h"
|
||||
#include "amd_smi_test/functional/gpu_metrics_read.h"
|
||||
@@ -153,6 +153,10 @@ TEST(amdsmitstReadWrite, FanReadWrite) {
|
||||
TestFanReadWrite tst;
|
||||
RunGenericTest(&tst);
|
||||
}
|
||||
// Runs the event-notification read/write test through the generic test
// driver.
TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) {
  TestEvtNotifReadWrite evt_notif_test;
  RunGenericTest(&evt_notif_test);
}
|
||||
/*
|
||||
TEST(amdsmitstReadOnly, TempRead) {
|
||||
TestTempRead tst;
|
||||
@@ -270,10 +274,6 @@ TEST(amdsmitstReadOnly, TestMutualExclusion) {
|
||||
tst.Run();
|
||||
RunCustomTestEpilog(&tst);
|
||||
}
|
||||
TEST(amdsmitstReadWrite, TestEvtNotifReadWrite) {
|
||||
TestEvtNotifReadWrite tst;
|
||||
RunGenericTest(&tst);
|
||||
}
|
||||
TEST(amdsmitstReadOnly, TestConcurrentInit) {
|
||||
TestConcurrentInit tst;
|
||||
SetFlags(&tst);
|
||||
|
||||
Tagairt in Eagrán Nua
Cuir bac ar úsáideoir