Add xgmi error_status and error_reset functions

Also, comment corrections and added check for invalid arguments

Change-Id: I891cbf9b37bfda629914a008811b840323872c02


[ROCm/amdsmi commit: 557e1f5704]
This commit is contained in:
Chris Freehill
2019-07-09 08:46:08 -05:00
rodzic df93edbbc0
commit 64f22d2749
8 zmienionych plików z 320 dodań i 1 usunięć
Plik binarny nie jest wyświetlany.
@@ -402,7 +402,7 @@ typedef enum {
} rsmi_memory_type_t;
/**
* @brief This values of this enum are used as frequency identifiers.
* @brief The values of this enum are used as frequency identifiers.
*/
typedef enum {
RSMI_FREQ_IND_MIN = 0, //!< Index used for the minimum frequency value
@@ -413,6 +413,15 @@ typedef enum {
typedef rsmi_freq_ind_t rsmi_freq_ind;
/// \endcond
/**
 * @brief XGMI error status values reported for a device.
 *
 * The numeric values match the raw status codes 0, 1 and 2.
 */
typedef enum {
  RSMI_XGMI_STATUS_NO_ERRORS = 0,        //!< No XGMI errors reported
  RSMI_XGMI_STATUS_ERROR = 1,            //!< An XGMI error reported
  RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 2,  //!< Multiple XGMI errors reported
} rsmi_xgmi_status_t;
/**
* @brief Bitfield used in various RSMI calls
*/
@@ -1915,6 +1924,48 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_process_info_t *proc);
/** @} */ // end of SysInfo
/*****************************************************************************/
/** @defgroup XGMIInfo XGMI Functions
* These functions are used to configure, query and control XGMI.
* @{
*/
/**
 *  @brief Retrieve the XGMI error status for a device
 *
 *  @details Given a device index @p dv_ind, and a pointer to an
 *  ::rsmi_xgmi_status_t @p status, this function will write the current XGMI
 *  error state ::rsmi_xgmi_status_t for the device @p dv_ind to the memory
 *  pointed to by @p status.
 *
 *  @param[in] dv_ind a device index
 *
 *  @param[inout] status A pointer to an ::rsmi_xgmi_status_t to which the
 *  XGMI error state should be written. Must not be null.
 *
 *  @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
 *
 *  @retval ::RSMI_STATUS_INVALID_ARGS is returned if @p status is a null
 *  pointer.
 *
 *  @retval ::RSMI_STATUS_UNKNOWN_ERROR is returned if the value read for the
 *  device does not correspond to any ::rsmi_xgmi_status_t value.
 *
 */
rsmi_status_t
rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status);
/**
 *  @brief Reset the XGMI error status for a device
 *
 *  @details Given a device index @p dv_ind, this function will reset the
 *  current XGMI error state ::rsmi_xgmi_status_t for the device @p dv_ind to
 *  rsmi_xgmi_status_t::RSMI_XGMI_STATUS_NO_ERRORS
 *
 *  @param[in] dv_ind a device index
 *
 *  @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. Any other
 *  value indicates that accessing the device's XGMI error state failed.
 *
 */
rsmi_status_t
rsmi_dev_xgmi_error_reset(uint32_t dv_ind);
/** @} */  // end of XGMIInfo
#ifdef __cplusplus
}
#endif // __cplusplus
@@ -96,6 +96,7 @@ enum DevInfoTypes {
kDevUniqueId,
kDevDFCountersAvailable,
kDevMemBusyPercent,
kDevXGMIError,
};
class Device {
+63
Wyświetl plik
@@ -2146,6 +2146,10 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) {
DEVICE_MUTEX
rsmi_status_t ret;
if (unique_id == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id);
return ret;
@@ -2349,3 +2353,62 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid,
CATCH
}
// Reads the raw XGMI error value for device dv_ind and translates it into an
// rsmi_xgmi_status_t written to *status.
//
// Returns RSMI_STATUS_INVALID_ARGS when status is null, the failure code from
// the underlying read when that read fails, RSMI_STATUS_UNKNOWN_ERROR when the
// raw value is not 0, 1 or 2, and RSMI_STATUS_SUCCESS otherwise.
rsmi_status_t
rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) {
  TRY
  DEVICE_MUTEX

  if (status == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  uint64_t raw_state;
  const rsmi_status_t read_result =
      get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &raw_state);

  if (read_result != RSMI_STATUS_SUCCESS) {
    return read_result;
  }

  // Translate the raw value into the public enum; anything outside the
  // known range is reported as an error and *status is left untouched.
  if (raw_state == 0) {
    *status = RSMI_XGMI_STATUS_NO_ERRORS;
  } else if (raw_state == 1) {
    *status = RSMI_XGMI_STATUS_ERROR;
  } else if (raw_state == 2) {
    *status = RSMI_XGMI_STATUS_MULTIPLE_ERRORS;
  } else {
    assert(!"Unexpected XGMI error status read");
    return RSMI_STATUS_UNKNOWN_ERROR;
  }

  return RSMI_STATUS_SUCCESS;
  CATCH
}
// Resets the XGMI error state of device dv_ind by reading the device's XGMI
// error value and discarding it.
//
// NOTE(review): presumably the read itself clears the error state on the
// driver side -- confirm against the amdgpu "xgmi_error" sysfs semantics.
//
// Returns RSMI_STATUS_SUCCESS when the read succeeds, otherwise the failure
// code produced by the read.
rsmi_status_t
rsmi_dev_xgmi_error_reset(uint32_t dv_ind) {
  TRY
  DEVICE_MUTEX

  // The value itself is not used; only the outcome of the read matters.
  uint64_t discarded_state;

  return get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &discarded_state);
  CATCH
}
@@ -101,6 +101,7 @@ static const char *kDevPCIEReplayCountFName = "pcie_replay_count";
static const char *kDevUniqueIdFName = "unique_id";
static const char *kDevDFCountersAvailableFName = "df_cntr_avail";
static const char *kDevMemBusyPercentFName = "mem_busy_percent";
static const char *kDevXGMIErrorFName = "xgmi_error";
// Strings that are found within sysfs files
static const char *kDevPerfLevelAutoStr = "auto";
@@ -145,6 +146,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevPCIEReplayCount, kDevPCIEReplayCountFName},
{kDevUniqueId, kDevUniqueIdFName},
{kDevDFCountersAvailable, kDevDFCountersAvailableFName},
{kDevXGMIError, kDevXGMIErrorFName},
};
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
@@ -380,6 +382,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
case kDevPCIEReplayCount:
case kDevDFCountersAvailable:
case kDevMemBusyPercent:
case kDevXGMIError:
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
*val = std::stoul(tempStr, 0);
@@ -0,0 +1,123 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2019, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <stdint.h>
#include <stddef.h>
#include <iostream>
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/xgmi_read_write.h"
#include "rocm_smi_test/test_common.h"
// Constructs the test case and registers its title and description.
TestXGMIReadWrite::TestXGMIReadWrite() : TestBase() {
  static const char kTitle[] = "RSMI XGMI Read/Write Test";
  static const char kDescription[] =
      "This test verifies that XGMI error counts can be read"
      " properly, and that the count can be reset.";

  set_title(kTitle);
  set_description(kDescription);
}
// Nothing to release here; the body is intentionally empty.
TestXGMIReadWrite::~TestXGMIReadWrite() {}
// Prepares the test; all work is delegated to the base-class SetUp().
void TestXGMIReadWrite::SetUp() {
  TestBase::SetUp();
}
// Shows information about this test by forwarding to the base class.
void TestXGMIReadWrite::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
// Shows the test results by forwarding to the base class.
void TestXGMIReadWrite::DisplayResults() const {
  TestBase::DisplayResults();
}
// Tears the test down by forwarding to TestBase::Close().
void TestXGMIReadWrite::Close() {
  // Per the original note, the base-class Close() closes handles opened
  // within rsmitst utility calls and calls rsmi_shut_down(), so any other
  // cleanup should be done before this call.
  TestBase::Close();
}
// Exercises the XGMI error-status read and reset calls on every monitored
// device.
//
// For each device the current XGMI error status is read and, at standard
// verbosity, printed. If the read reports RSMI_STATUS_NOT_SUPPORTED the test
// stops without attempting a reset; any other read failure is asserted on.
// After a successful read, the error status is reset and the reset result is
// asserted on.
void TestXGMIReadWrite::Run(void) {
  rsmi_status_t err;
  rsmi_xgmi_status_t err_stat;

  TestBase::Run();

  for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
    PrintDeviceHeader(dv_ind);

    err = rsmi_dev_xgmi_error_status(dv_ind, &err_stat);

    if (err != RSMI_STATUS_SUCCESS) {
      if (err == RSMI_STATUS_NOT_SUPPORTED) {
        IF_VERB(STANDARD) {
          std::cout << "\t**XGMI Error Status: Not supported on this machine"
                                                               << std::endl;
        }
        // Leave the test whether or not the message was printed. The early
        // exit previously lived inside the verbosity-guarded block, so a
        // quiet run fell through and asserted on the reset of an
        // unsupported feature.
        return;
      } else {
        CHK_ERR_ASRT(err)
      }
    } else {
      IF_VERB(STANDARD) {
        std::cout << "\t**XGMI Error Status: " <<
                               static_cast<uint32_t>(err_stat) << std::endl;
      }
    }

    // TODO(cfree) We need to find a way to generate xgmi errors so this
    // test won't be meaningless
    err = rsmi_dev_xgmi_error_reset(dv_ind);
    CHK_ERR_ASRT(err)

    IF_VERB(STANDARD) {
      std::cout << "\t**Successfully reset XGMI Error Status: " << std::endl;
    }
  }
}
@@ -0,0 +1,73 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2019, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_
#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_
#include "rocm_smi_test/test_base.h"
// Functional test that exercises the RSMI XGMI error-status read and reset
// calls.
class TestXGMIReadWrite : public TestBase {
 public:
  // @Brief: Constructor; sets the test title and description
  TestXGMIReadWrite();

  // @Brief: Destructor for the TestXGMIReadWrite test case
  virtual ~TestXGMIReadWrite();

  // @Brief: Set up the environment before the test runs
  virtual void SetUp();

  // @Brief: Execute the core of the test
  virtual void Run();

  // @Brief: Clean up and retrieve the resources
  virtual void Close();

  // @Brief: Display the test results
  virtual void DisplayResults() const;

  // @Brief: Display information about what this test does
  virtual void DisplayTestInfo();
};
#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_
@@ -74,6 +74,7 @@
#include "functional/id_info_read.h"
#include "functional/perf_cntr_read_write.h"
#include "functional/process_info_read.h"
#include "functional/xgmi_read_write.h"
static RSMITstGlobals *sRSMIGlvalues = nullptr;
@@ -208,6 +209,10 @@ TEST(rsmitstReadOnly, TestProcInfoRead) {
TestProcInfoRead tst;
RunGenericTest(&tst);
}
// Registers the XGMI read/write functional test with the rsmitstReadWrite
// suite and runs it through the generic test driver.
TEST(rsmitstReadWrite, TestXGMIReadWrite) {
  TestXGMIReadWrite xgmi_test;

  RunGenericTest(&xgmi_test);
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);