Add xgmi error_status and error_reset functions
Also, comment corrections and added check for invalid arguments
Change-Id: I891cbf9b37bfda629914a008811b840323872c02
[ROCm/amdsmi commit: 557e1f5704]
This commit is contained in:
Plik binarny nie jest wyświetlany.
@@ -402,7 +402,7 @@ typedef enum {
|
||||
} rsmi_memory_type_t;
|
||||
|
||||
/**
|
||||
* @brief This values of this enum are used as frequency identifiers.
|
||||
* @brief The values of this enum are used as frequency identifiers.
|
||||
*/
|
||||
typedef enum {
|
||||
RSMI_FREQ_IND_MIN = 0, //!< Index used for the minimum frequency value
|
||||
@@ -413,6 +413,15 @@ typedef enum {
|
||||
typedef rsmi_freq_ind_t rsmi_freq_ind;
|
||||
/// \endcond
|
||||
|
||||
/**
 * @brief XGMI error status values written by rsmi_dev_xgmi_error_status().
 */
typedef enum {
  RSMI_XGMI_STATUS_NO_ERRORS = 0,        //!< Reported for a raw status of 0
  RSMI_XGMI_STATUS_ERROR = 1,            //!< Reported for a raw status of 1
  RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 2,  //!< Reported for a raw status of 2
} rsmi_xgmi_status_t;
|
||||
|
||||
/**
|
||||
* @brief Bitfield used in various RSMI calls
|
||||
*/
|
||||
@@ -1915,6 +1924,48 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_process_info_t *proc);
|
||||
|
||||
/** @} */ // end of SysInfo
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup XGMIInfo XGMI Functions
|
||||
* These functions are used to configure, query and control XGMI.
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Retrieve the XGMI error status for a device
|
||||
*
|
||||
* @details Given a device index @p dv_ind, and a pointer to an
|
||||
* ::rsmi_xgmi_status_t @p status, this function will write the current XGMI
|
||||
* error state ::rsmi_xgmi_status_t for the device @p dv_ind to the memory
|
||||
* pointed to by @p status.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[inout] status A pointer to an ::rsmi_xgmi_status_t to which the
|
||||
* XGMI error state should be written
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
* @retval ::RSMI_STATUS_INVALID_ARGS is returned if @p status is a null pointer.
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status);
|
||||
|
||||
/**
|
||||
* @brief Reset the XGMI error status for a device
|
||||
*
|
||||
* @details Given a device index @p dv_ind, this function will reset the
|
||||
* current XGMI error state ::rsmi_xgmi_status_t for the device @p dv_ind to
|
||||
* rsmi_xgmi_status_t::RSMI_XGMI_STATUS_NO_ERRORS
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
rsmi_dev_xgmi_error_reset(uint32_t dv_ind);
|
||||
|
||||
/** @} */  // end of XGMIInfo
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
@@ -96,6 +96,7 @@ enum DevInfoTypes {
|
||||
kDevUniqueId,
|
||||
kDevDFCountersAvailable,
|
||||
kDevMemBusyPercent,
|
||||
kDevXGMIError,
|
||||
};
|
||||
|
||||
class Device {
|
||||
|
||||
@@ -2146,6 +2146,10 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) {
|
||||
DEVICE_MUTEX
|
||||
rsmi_status_t ret;
|
||||
|
||||
if (unique_id == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
|
||||
ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id);
|
||||
return ret;
|
||||
|
||||
@@ -2349,3 +2353,62 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid,
|
||||
|
||||
CATCH
|
||||
}
|
||||
|
||||
// Reads the XGMI error value for device dv_ind and translates it into an
// rsmi_xgmi_status_t written to *status.
//
// Returns RSMI_STATUS_INVALID_ARGS when status is null, the failing status
// when the device value cannot be read, RSMI_STATUS_UNKNOWN_ERROR when the
// value read is not 0, 1 or 2, and RSMI_STATUS_SUCCESS otherwise.
rsmi_status_t
rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) {
  TRY
  DEVICE_MUTEX

  if (status == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  uint64_t raw_state;
  rsmi_status_t read_result =
      get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &raw_state);

  if (read_result != RSMI_STATUS_SUCCESS) {
    return read_result;
  }

  // Each recognised raw value maps onto exactly one enumerator; anything
  // else falls out of the switch and is reported as an unknown error.
  switch (raw_state) {
    case 0:
      *status = RSMI_XGMI_STATUS_NO_ERRORS;
      return RSMI_STATUS_SUCCESS;

    case 1:
      *status = RSMI_XGMI_STATUS_ERROR;
      return RSMI_STATUS_SUCCESS;

    case 2:
      *status = RSMI_XGMI_STATUS_MULTIPLE_ERRORS;
      return RSMI_STATUS_SUCCESS;

    default:
      break;
  }

  assert(!"Unexpected XGMI error status read");
  return RSMI_STATUS_UNKNOWN_ERROR;

  CATCH
}
|
||||
|
||||
// Resets the XGMI error status for device dv_ind.
//
// The only operation performed is a read of the device's XGMI error value;
// the value read is discarded and the status of that read is returned.
// NOTE(review): nothing is written here, so presumably reading the
// underlying "xgmi_error" file is what clears the error state -- confirm
// against the driver documentation.
rsmi_status_t
rsmi_dev_xgmi_error_reset(uint32_t dv_ind) {
  TRY
  DEVICE_MUTEX

  uint64_t discarded_state;

  return get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &discarded_state);

  CATCH
}
|
||||
|
||||
@@ -101,6 +101,7 @@ static const char *kDevPCIEReplayCountFName = "pcie_replay_count";
|
||||
static const char *kDevUniqueIdFName = "unique_id";
|
||||
static const char *kDevDFCountersAvailableFName = "df_cntr_avail";
|
||||
static const char *kDevMemBusyPercentFName = "mem_busy_percent";
|
||||
static const char *kDevXGMIErrorFName = "xgmi_error";
|
||||
|
||||
// Strings that are found within sysfs files
|
||||
static const char *kDevPerfLevelAutoStr = "auto";
|
||||
@@ -145,6 +146,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
|
||||
{kDevPCIEReplayCount, kDevPCIEReplayCountFName},
|
||||
{kDevUniqueId, kDevUniqueIdFName},
|
||||
{kDevDFCountersAvailable, kDevDFCountersAvailableFName},
|
||||
{kDevXGMIError, kDevXGMIErrorFName},
|
||||
};
|
||||
|
||||
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
|
||||
@@ -380,6 +382,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
case kDevPCIEReplayCount:
|
||||
case kDevDFCountersAvailable:
|
||||
case kDevMemBusyPercent:
|
||||
case kDevXGMIError:
|
||||
ret = readDevInfoStr(type, &tempStr);
|
||||
RET_IF_NONZERO(ret);
|
||||
*val = std::stoul(tempStr, 0);
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2019, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi_test/functional/xgmi_read_write.h"
|
||||
#include "rocm_smi_test/test_common.h"
|
||||
|
||||
// Constructs the test and registers its title and description.
TestXGMIReadWrite::TestXGMIReadWrite() : TestBase() {
  set_title("RSMI XGMI Read/Write Test");
  set_description(
      "This test verifies that XGMI error counts can be read"
      " properly, and that the count can be reset.");
}
|
||||
|
||||
// Nothing to release here.
TestXGMIReadWrite::~TestXGMIReadWrite() {}
|
||||
|
||||
// Defers entirely to the common TestBase set-up.
void TestXGMIReadWrite::SetUp() {
  TestBase::SetUp();
}
|
||||
|
||||
// Defers entirely to TestBase to display information about this test.
void TestXGMIReadWrite::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
|
||||
|
||||
// Defers entirely to TestBase to display the results.
void TestXGMIReadWrite::DisplayResults() const {
  TestBase::DisplayResults();
}
|
||||
|
||||
// Tears the test down through TestBase.
void TestXGMIReadWrite::Close() {
  // TestBase::Close() closes the handles opened within rsmitst utility calls
  // and calls rsmi_shut_down(), so it must come after any other hsa cleanup.
  TestBase::Close();
}
|
||||
|
||||
|
||||
void TestXGMIReadWrite::Run(void) {
|
||||
rsmi_status_t err;
|
||||
rsmi_xgmi_status_t err_stat;
|
||||
|
||||
TestBase::Run();
|
||||
|
||||
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
|
||||
PrintDeviceHeader(dv_ind);
|
||||
|
||||
err = rsmi_dev_xgmi_error_status(dv_ind, &err_stat);
|
||||
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**XGMI Error Status: Not supported on this machine"
|
||||
<< std::endl;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
}
|
||||
} else {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**XGMI Error Status: " <<
|
||||
static_cast<uint32_t>(err_stat) << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(cfree) We need to find a way to generate xgmi errors so this
|
||||
// test won't be meaningless
|
||||
err = rsmi_dev_xgmi_error_reset(dv_ind);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Successfully reset XGMI Error Status: " << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2019, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_
|
||||
#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_
|
||||
|
||||
#include "rocm_smi_test/test_base.h"
|
||||
|
||||
// Functional test covering the XGMI error status read and reset calls.
class TestXGMIReadWrite : public TestBase {
 public:
  // @Brief: Constructor for test case of TestXGMIReadWrite
  TestXGMIReadWrite();

  // @Brief: Destructor for test case of TestXGMIReadWrite
  virtual ~TestXGMIReadWrite();

  // @Brief: Set up the environment before the test runs
  virtual void SetUp();

  // @Brief: Execute the core of the test
  virtual void Run();

  // @Brief: Clean up and release the resources used by the test
  virtual void Close();

  // @Brief: Display the results of the test
  virtual void DisplayResults() const;

  // @Brief: Display information about what this test does
  virtual void DisplayTestInfo();
};
|
||||
|
||||
#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_
|
||||
@@ -74,6 +74,7 @@
|
||||
#include "functional/id_info_read.h"
|
||||
#include "functional/perf_cntr_read_write.h"
|
||||
#include "functional/process_info_read.h"
|
||||
#include "functional/xgmi_read_write.h"
|
||||
|
||||
static RSMITstGlobals *sRSMIGlvalues = nullptr;
|
||||
|
||||
@@ -208,6 +209,10 @@ TEST(rsmitstReadOnly, TestProcInfoRead) {
|
||||
TestProcInfoRead tst;
|
||||
RunGenericTest(&tst);
|
||||
}
|
||||
// Runs the XGMI read/write functional test through the generic test driver.
TEST(rsmitstReadWrite, TestXGMIReadWrite) {
  TestXGMIReadWrite xgmi_test;
  RunGenericTest(&xgmi_test);
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
|
||||
Reference in New Issue
Block a user