Added rsmi_dev_error_count_get()

[ROCm/amdsmi commit: fb5f41fc10]
Этот коммит содержится в:
Chris Freehill
2019-03-01 16:33:11 -06:00
родитель 2c61b97684
Коммит 0f27774440
11 изменённых файлов: 334 добавлений и 18 удалений
+53 -8
Просмотреть файл
@@ -206,6 +206,28 @@ typedef enum {
RSMI_PWR_PROF_PRST_INVALID = 0xFFFFFFFFFFFFFFFF
} rsmi_power_profile_preset_masks;
/**
 * @brief Identifiers for the GPU blocks that can be passed to RSMI calls
 * taking an ::rsmi_gpu_block (e.g. rsmi_dev_error_count_get()).
 *
 * ::RSMI_GPU_BLOCK_FIRST and ::RSMI_GPU_BLOCK_LAST alias the lowest and the
 * highest valid block, so the whole range can be walked with a simple loop.
 */
typedef enum {
  RSMI_GPU_BLOCK_FIRST = 0,                   //!< Lowest valid block value

  RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST,  //!< UMC block
  RSMI_GPU_BLOCK_SDMA,                        //!< SDMA block
  RSMI_GPU_BLOCK_GFX,                         //!< GFX block

  RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX    //!< Highest valid block value
} rsmi_gpu_block;
/**
 * @brief The values of this enum are used as frequency identifiers.
 */
typedef enum {
  /** Index used for the minimum frequency value */
  RSMI_FREQ_IND_MIN = 0,
  /** Index used for the maximum frequency value */
  RSMI_FREQ_IND_MAX = 1,
  /** An invalid frequency index */
  RSMI_FREQ_IND_INVALID = 0xFFFFFFFF
} rsmi_freq_ind;
/**
* @brief Bitfield used in various RSMI calls
*/
@@ -342,13 +364,13 @@ typedef struct {
} rsmi_od_volt_freq_data;
/**
* @brief The values of this enum are used as frequency identifiers.
* @brief This structure holds error counts.
*/
typedef enum {
  /** Index used for the minimum frequency value */
  RSMI_FREQ_IND_MIN = 0,
  /** Index used for the maximum frequency value */
  RSMI_FREQ_IND_MAX = 1,
  /** An invalid frequency index */
  RSMI_FREQ_IND_INVALID = 0xFFFFFFFF
} rsmi_freq_ind;
typedef struct {
  /** Accumulated correctable errors */
  uint64_t correctable_err;
  /** Accumulated uncorrectable errors */
  uint64_t uncorrectable_err;
} rsmi_error_count_t;
/**
* @brief Initialize ROCm SMI.
@@ -487,9 +509,8 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid);
*
* @param[inout] max_pkt_sz a pointer to uint64_t to which the maximum packet
* size will be written. If pointer is NULL, it will be ignored.
*
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*/
rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
uint64_t *received, uint64_t *max_pkt_sz);
@@ -687,6 +708,9 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len);
* specified temperature sensor on the specified device.
*
* @details Given a device index @p dv_ind, a 0-based sensor index
* @p sensor_ind, a ::rsmi_temperature_metric @p metric and a pointer to an
* int64_t @p temperature, this function will write the value of the metric
* indicated by @p metric to the memory location @p temperature.
*
* @param[in] dv_ind a device index
*
@@ -704,6 +728,27 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len);
*/
rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind,
rsmi_temperature_metric metric, int64_t *temperature);
/**
* @brief Retrieve the error counts for a GPU block
*
* @details Given a device index @p dv_ind, an ::rsmi_gpu_block @p block and a
* pointer to an ::rsmi_error_count_t @p ec, this function will write the
* correctable and uncorrectable error count values for the GPU block indicated
* by @p block to the memory pointed to by @p ec.
*
* @param[in] dv_ind a device index
*
* @param[in] block The block for which error counts should be retrieved
*
* @param[inout] ec A pointer to an ::rsmi_error_count_t to which the error
* counts should be written. Must not be NULL.
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
* @retval ::RSMI_STATUS_INVALID_ARGS is returned if @p ec is NULL or @p block
* is greater than ::RSMI_GPU_BLOCK_LAST.
*
* @retval ::RSMI_STATUS_NOT_SUPPORTED is returned if the error count
* information for @p block could not be read on this device.
*
*/
rsmi_status_t rsmi_dev_error_count_get(uint32_t dv_ind,
rsmi_gpu_block block, rsmi_error_count_t *ec);
/**
* @brief Reset the fan to automatic driver control
*
+5
Просмотреть файл
@@ -70,6 +70,11 @@ enum DevInfoTypes {
kDevPowerODVoltage,
kDevVBiosVer,
kDevPCIEThruPut,
kDevErrCntSDMA,
kDevErrCntUMC,
kDevErrCntGFX,
// Reserve spots for future ErrCnt blocks.
// Next, non-ErrCnt device enum should start at 100
};
class Device {
-2
Просмотреть файл
@@ -1,6 +1,4 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
+56 -2
Просмотреть файл
@@ -1,6 +1,4 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
@@ -433,6 +431,62 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
CATCH
}
// Parse one line of an error-count file.
//
// The line is expected to have the form "<label> <count>", e.g. "ue: 3".
// On success the parsed count is stored in *count and true is returned. If the
// label differs from |label| or no unsigned integer follows it, *count is left
// untouched and false is returned.
static bool parse_err_count_line(const std::string &line, const char *label,
                                                           uint64_t *count) {
  std::istringstream line_stream(line);
  std::string found_label;
  uint64_t value = 0;

  line_stream >> found_label >> value;
  if (line_stream.fail() || found_label != label) {
    return false;
  }

  *count = value;
  return true;
}

// Retrieve the correctable/uncorrectable error counts for one GPU block.
//
// Returns:
//   RSMI_STATUS_SUCCESS       - *ec holds the counts read for |block|.
//   RSMI_STATUS_INVALID_ARGS  - |ec| is null or |block| is greater than
//                               RSMI_GPU_BLOCK_LAST.
//   RSMI_STATUS_NOT_SUPPORTED - get_dev_value_vec() reported
//                               RSMI_STATUS_FILE_ERROR for the block's file,
//                               or |block| has no mapping below.
//   RSMI_STATUS_FILE_ERROR    - the file was read but does not consist of
//                               exactly the two lines "ue: <n>" and "ce: <n>".
//   Any other status returned by get_dev_value_vec() is passed through.
//
// *ec is written only when RSMI_STATUS_SUCCESS is returned.
rsmi_status_t
rsmi_dev_error_count_get(uint32_t dv_ind, rsmi_gpu_block block,
                                                  rsmi_error_count_t *ec) {
  std::vector<std::string> val_vec;
  rsmi_status_t ret;

  TRY
  if (ec == nullptr || block > RSMI_GPU_BLOCK_LAST) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  amd::smi::DevInfoTypes type;
  switch (block) {
    case RSMI_GPU_BLOCK_UMC:
      type = amd::smi::kDevErrCntUMC;
      break;

    case RSMI_GPU_BLOCK_SDMA:
      type = amd::smi::kDevErrCntSDMA;
      break;

    case RSMI_GPU_BLOCK_GFX:
      type = amd::smi::kDevErrCntGFX;
      break;

    default:
      // Only reachable if a new block is added to rsmi_gpu_block without a
      // matching case above; that is a programming error, hence the assert.
      assert(!"Unsupported block provided to rsmi_dev_error_count_get()");
      return RSMI_STATUS_NOT_SUPPORTED;
  }

  ret = get_dev_value_vec(type, dv_ind, &val_vec);

  // A file error here is reported to the caller as "not supported".
  if (ret == RSMI_STATUS_FILE_ERROR) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

  // The file contents come from outside this library, so they are validated
  // at run time rather than with assert(): with NDEBUG defined an assert
  // vanishes and a short file would lead to out-of-range vector accesses.
  if (val_vec.size() != 2) {
    return RSMI_STATUS_FILE_ERROR;
  }

  // Parse into a scratch structure so that *ec is never left half-updated.
  rsmi_error_count_t parsed;
  if (!parse_err_count_line(val_vec[0], "ue:", &parsed.uncorrectable_err) ||
      !parse_err_count_line(val_vec[1], "ce:", &parsed.correctable_err)) {
    return RSMI_STATUS_FILE_ERROR;
  }

  *ec = parsed;
  return ret;
  CATCH
}
rsmi_status_t
rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
TRY
+12
Просмотреть файл
@@ -59,6 +59,7 @@
namespace amd {
namespace smi {
// Sysfs file names
static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
static const char *kDevDevIDFName = "device";
static const char *kDevOverDriveLevelFName = "pp_sclk_od";
@@ -70,6 +71,11 @@ static const char *kDevPowerODVoltageFName = "pp_od_clk_voltage";
static const char *kDevUsageFName = "gpu_busy_percent";
static const char *kDevVBiosVerFName = "vbios_version";
static const char *kDevPCIEThruPutFName = "pcie_bw";
// Error-count file names, one per GPU block (SDMA, UMC, GFX). Unlike the
// names above, these include a "ras/" sub-directory component.
// NOTE(review): presumably resolved relative to the device's sysfs directory
// like the other file names here -- confirm against the path-building code.
static const char *kDevErrCntSDMAFName = "ras/sdma_err_count";
static const char *kDevErrCntUMCFName = "ras/umc_err_count";
static const char *kDevErrCntGFXFName = "ras/gfx_err_count";
// Strings that are found within sysfs files
static const char *kDevPerfLevelAutoStr = "auto";
static const char *kDevPerfLevelLowStr = "low";
static const char *kDevPerfLevelHighStr = "high";
@@ -92,6 +98,9 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevPowerODVoltage, kDevPowerODVoltageFName},
{kDevVBiosVer, kDevVBiosVerFName},
{kDevPCIEThruPut, kDevPCIEThruPutFName},
{kDevErrCntSDMA, kDevErrCntSDMAFName},
{kDevErrCntUMC, kDevErrCntUMCFName},
{kDevErrCntGFX, kDevErrCntGFXFName},
};
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
@@ -322,6 +331,9 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
case kDevPCIEClk:
case kDevPowerProfileMode:
case kDevPowerODVoltage:
case kDevErrCntSDMA:
case kDevErrCntUMC:
case kDevErrCntGFX:
return readDevInfoMultiLineStr(type, val);
break;
-3
Просмотреть файл
@@ -1,7 +1,4 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
+116
Просмотреть файл
@@ -0,0 +1,116 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2019, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#include <stdint.h>
#include <stddef.h>
#include <iostream>
#include "gtest/gtest.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi_test/functional/err_cnt_read.h"
#include "rocm_smi_test/test_common.h"
// Constructs the test and registers the title and description it reports.
TestErrCntRead::TestErrCntRead() : TestBase() {
  set_title("RSMI Error Count Read Test");
  set_description(
      "The Error Count Read tests verifies that "
      "error counts can be read properly.");
}
// Nothing test-specific to release; the compiler-generated destructor suffices.
TestErrCntRead::~TestErrCntRead() = default;
// Performs only the common set-up provided by TestBase; this test adds no
// set-up steps of its own.
void TestErrCntRead::SetUp() {
  TestBase::SetUp();
}
// Forwards to TestBase::DisplayTestInfo(); no extra information is added.
void TestErrCntRead::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
// Forwards to TestBase::DisplayResults(); the counts themselves are printed
// while Run() executes.
void TestErrCntRead::DisplayResults() const {
  TestBase::DisplayResults();
}
// Per the original note, TestBase::Close() closes handles opened within
// rsmitst utility calls and calls rsmi_shut_down(), so any test-specific
// cleanup has to happen before this call.
void TestErrCntRead::Close() {
  TestBase::Close();
}
void TestErrCntRead::Run(void) {
rsmi_status_t err;
rsmi_error_count_t ec;
TestBase::Run();
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
PrintDeviceHeader(i);
for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; ++b) {
err = rsmi_dev_error_count_get(i, static_cast<rsmi_gpu_block>(b), &ec);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout << "\t**Error Count for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block>(b)) <<
": Not supported on this machine" << std::endl;
} else {
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**Error counts for " <<
GetBlockNameStr(static_cast<rsmi_gpu_block>(b)) << " block: "
<< std::endl;
std::cout << "\t\tCorrectable errors: " << ec.correctable_err
<< std::endl;
std::cout << "\t\tUncorrectable errors: " << ec.uncorrectable_err
<< std::endl;
}
}
}
}
}
+71
Просмотреть файл
@@ -0,0 +1,71 @@
/*
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2019, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_
#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_
#include "rocm_smi_test/test_base.h"
// Functional test that reads the error counts of each GPU block on every
// monitored device.
class TestErrCntRead : public TestBase {
 public:
  // Sets the test's title and description.
  TestErrCntRead();

  // Destroys the test case.
  virtual ~TestErrCntRead();

  // Prepares the environment before the test is run.
  virtual void SetUp();

  // Executes the test: reads and prints the per-block error counts.
  virtual void Run();

  // Cleans up and releases the resources used by the test.
  virtual void Close();

  // Displays the results of the test.
  virtual void DisplayResults() const;

  // Displays information about what this test does.
  virtual void DisplayTestInfo();
};
#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_
+5 -1
Просмотреть файл
@@ -68,6 +68,7 @@
#include "functional/power_read_write.h"
#include "functional/power_cap_read_write.h"
#include "functional/version_read.h"
#include "functional/err_cnt_read.h"
static RSMITstGlobals *sRSMIGlvalues = nullptr;
@@ -181,7 +182,10 @@ TEST(rsmitstReadWrite, TestPowerCapReadWrite) {
TestPowerCapReadWrite tst;
RunGenericTest(&tst);
}
// Runs the error-count read test as part of the read-only test group.
TEST(rsmitstReadOnly, TestErrCntRead) {
  TestErrCntRead err_cnt_test;
  RunGenericTest(&err_cnt_test);
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
+14
Просмотреть файл
@@ -49,9 +49,17 @@
#include <iostream>
#include <string>
#include <map>
#include "rocm_smi_test/test_base.h"
#include "rocm_smi_test/test_common.h"
#include "rocm_smi/rocm_smi.h"
// Printable name for each rsmi_gpu_block value; looked up by
// GetBlockNameStr().
static const std::map<rsmi_gpu_block, const char *> kBlockNameMap{
    {RSMI_GPU_BLOCK_UMC, "UMC"},
    {RSMI_GPU_BLOCK_SDMA, "SDMA"},
    {RSMI_GPU_BLOCK_GFX, "GFX"}};
static const struct option long_options[] = {
{"iterations", required_argument, nullptr, 'i'},
@@ -123,6 +131,12 @@ uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list) {
return 0;
}
// Returns the printable name registered for |id| in kBlockNameMap.
// std::map::at() is used deliberately: an id with no entry raises
// std::out_of_range rather than yielding a null or empty name.
const char *GetBlockNameStr(rsmi_gpu_block id) {
  const char *block_name = kBlockNameMap.at(id);
  return block_name;
}
#if ENABLE_SMI
void DumpMonitorInfo(const TestBase *test) {
int ret = 0;
+2 -2
Просмотреть файл
@@ -48,9 +48,8 @@
#include <memory>
#include <vector>
#if ENABLE_SMI
#include "rocm_smi/rocm_smi.h"
#endif
struct RSMITstGlobals {
uint32_t verbosity;
@@ -62,6 +61,7 @@ struct RSMITstGlobals {
uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list);
void PrintTestHeader(uint32_t dv_ind);
// Returns the printable name ("UMC", "SDMA" or "GFX") of the GPU block |id|.
// The lookup uses std::map::at(), so an id without an entry throws
// std::out_of_range.
const char *GetBlockNameStr(rsmi_gpu_block id);
#if ENABLE_SMI
void DumpMonitorInfo(const TestBase *test);