@@ -206,6 +206,28 @@ typedef enum {
|
||||
RSMI_PWR_PROF_PRST_INVALID = 0xFFFFFFFFFFFFFFFF
|
||||
} rsmi_power_profile_preset_masks;
|
||||
|
||||
/**
 * @brief Identifiers for the GPU blocks that can be referred to in RSMI calls.
 *
 * ::RSMI_GPU_BLOCK_FIRST and ::RSMI_GPU_BLOCK_LAST alias the lowest and the
 * highest block value, so the full set of blocks can be walked with a simple
 * loop from FIRST to LAST (inclusive).
 */
typedef enum {
  RSMI_GPU_BLOCK_FIRST = 0,                   //!< Lowest valid block value

  RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST,  //!< UMC block
  RSMI_GPU_BLOCK_SDMA,                        //!< SDMA block
  RSMI_GPU_BLOCK_GFX,                         //!< GFX block

  RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX,   //!< Highest valid block value
} rsmi_gpu_block;
|
||||
|
||||
/**
 * @brief The values of this enum are used as frequency identifiers.
 */
typedef enum {
  RSMI_FREQ_IND_MIN = 0,              //!< Selects the minimum frequency value
  RSMI_FREQ_IND_MAX = 1,              //!< Selects the maximum frequency value
  RSMI_FREQ_IND_INVALID = 0xFFFFFFFF  //!< Marks an invalid frequency index
} rsmi_freq_ind;
|
||||
|
||||
/**
|
||||
* @brief Bitfield used in various RSMI calls
|
||||
*/
|
||||
@@ -342,13 +364,13 @@ typedef struct {
|
||||
} rsmi_od_volt_freq_data;
|
||||
|
||||
/**
|
||||
* @brief This values of this enum are used as frequency identifiers.
|
||||
* @brief This structure holds error counts.
|
||||
*/
|
||||
typedef enum {
  RSMI_FREQ_IND_MIN = 0,              //!< Selects the minimum frequency value
  RSMI_FREQ_IND_MAX = 1,              //!< Selects the maximum frequency value
  RSMI_FREQ_IND_INVALID = 0xFFFFFFFF  //!< Marks an invalid frequency index
} rsmi_freq_ind;
|
||||
typedef struct {
  uint64_t correctable_err;    //!< Running total of correctable errors
  uint64_t uncorrectable_err;  //!< Running total of uncorrectable errors
} rsmi_error_count_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Initialize ROCm SMI.
|
||||
@@ -487,9 +509,8 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid);
|
||||
*
|
||||
* @param[inout] max_pkt_sz a pointer to uint64_t to which the maximum packet
|
||||
* size will be written. If pointer is NULL, it will be ignored.
|
||||
*
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent,
|
||||
uint64_t *received, uint64_t *max_pkt_sz);
|
||||
@@ -687,6 +708,9 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len);
|
||||
* specified temperature sensor on the specified device.
|
||||
*
|
||||
* @details Given a device index @p dv_ind, a 0-based sensor index
|
||||
* @p sensor_ind, a ::rsmi_temperature_metric @p metric and a pointer to an
|
||||
* int64_t @p temperature, this function will write the value of the metric
|
||||
* indicated by @p metric to the memory location @p temperature.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
@@ -704,6 +728,27 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len);
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind,
|
||||
rsmi_temperature_metric metric, int64_t *temperature);
|
||||
|
||||
/**
|
||||
* @brief Retrieve the error counts for a GPU block
|
||||
*
|
||||
* @details Given a device index @p dv_ind, an ::rsmi_gpu_block @p block and a
|
||||
* pointer to an ::rsmi_error_count_t @p ec, this function will write the error
|
||||
* count values for the GPU block indicated by @p block to memory pointed to by
|
||||
* @p ec.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[in] block The block for which error counts should be retrieved
|
||||
*
|
||||
* @param[inout] ec A pointer to an ::rsmi_error_count_t to which the error
|
||||
* counts should be written
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_error_count_get(uint32_t dv_ind,
|
||||
rsmi_gpu_block block, rsmi_error_count_t *ec);
|
||||
/**
|
||||
* @brief Reset the fan to automatic driver control
|
||||
*
|
||||
|
||||
@@ -70,6 +70,11 @@ enum DevInfoTypes {
|
||||
kDevPowerODVoltage,
|
||||
kDevVBiosVer,
|
||||
kDevPCIEThruPut,
|
||||
kDevErrCntSDMA,
|
||||
kDevErrCntUMC,
|
||||
kDevErrCntGFX,
|
||||
// Reserve spots for future ErrCnt blocks.
|
||||
// Next, non-ErrCnt device enum should start at 100
|
||||
};
|
||||
|
||||
class Device {
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
@@ -433,6 +431,62 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
|
||||
CATCH
|
||||
}
|
||||
|
||||
// Parses one line of an error count file of the form "<label> <count>".
// Returns true and stores the count in *count only when the first token equals
// |label| and an unsigned integer follows it; otherwise *count is untouched.
static bool parse_err_count_line(const std::string &line, const char *label,
                                 uint64_t *count) {
  std::istringstream fs(line);
  std::string tag;
  uint64_t value;

  if (!(fs >> tag >> value) || tag != label) {
    return false;
  }
  *count = value;
  return true;
}

// Reads the error counts of GPU block |block| on device |dv_ind| into |*ec|.
//
// The block is mapped to its error count device-info type and the
// corresponding file is read as a list of lines. Exactly two lines are
// expected: "ue: <n>" (uncorrectable) followed by "ce: <n>" (correctable).
//
// Returns:
//   RSMI_STATUS_INVALID_ARGS   if |ec| is null or |block| is past the last
//                              known block.
//   RSMI_STATUS_NOT_SUPPORTED  if the block has no mapping or the file could
//                              not be read.
//   RSMI_STATUS_FILE_ERROR     if the file does not have the expected layout.
//   Any other failure status from get_dev_value_vec() is passed through.
//
// |*ec| is written only when the whole file parsed successfully.
rsmi_status_t
rsmi_dev_error_count_get(uint32_t dv_ind, rsmi_gpu_block block,
                                                  rsmi_error_count_t *ec) {
  std::vector<std::string> val_vec;
  rsmi_status_t ret;

  TRY
  if (ec == nullptr || block > RSMI_GPU_BLOCK_LAST) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  amd::smi::DevInfoTypes type;
  switch (block) {
    case RSMI_GPU_BLOCK_UMC:
      type = amd::smi::kDevErrCntUMC;
      break;

    case RSMI_GPU_BLOCK_SDMA:
      type = amd::smi::kDevErrCntSDMA;
      break;

    case RSMI_GPU_BLOCK_GFX:
      type = amd::smi::kDevErrCntGFX;
      break;

    default:
      assert(!"Unsupported block provided to rsmi_dev_error_count_get()");
      return RSMI_STATUS_NOT_SUPPORTED;
  }
  ret = get_dev_value_vec(type, dv_ind, &val_vec);

  // A file that cannot be read is reported as "not supported".
  if (ret == RSMI_STATUS_FILE_ERROR) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

  // The asserts flag unexpected content in debug builds. The explicit checks
  // keep builds with NDEBUG from indexing past the end of val_vec or from
  // reporting success for content that was never parsed.
  assert(val_vec.size() == 2);
  if (val_vec.size() != 2) {
    return RSMI_STATUS_FILE_ERROR;
  }

  // Parse into a temporary so that *ec is never left half-updated.
  rsmi_error_count_t counts;
  const bool parsed =
      parse_err_count_line(val_vec[0], "ue:", &counts.uncorrectable_err) &&
      parse_err_count_line(val_vec[1], "ce:", &counts.correctable_err);

  assert(parsed);
  if (!parsed) {
    return RSMI_STATUS_FILE_ERROR;
  }

  *ec = counts;
  return ret;
  CATCH
}
|
||||
rsmi_status_t
|
||||
rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
|
||||
TRY
|
||||
|
||||
@@ -59,6 +59,7 @@
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
|
||||
// Sysfs file names
|
||||
static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
|
||||
static const char *kDevDevIDFName = "device";
|
||||
static const char *kDevOverDriveLevelFName = "pp_sclk_od";
|
||||
@@ -70,6 +71,11 @@ static const char *kDevPowerODVoltageFName = "pp_od_clk_voltage";
|
||||
static const char *kDevUsageFName = "gpu_busy_percent";
|
||||
static const char *kDevVBiosVerFName = "vbios_version";
|
||||
static const char *kDevPCIEThruPutFName = "pcie_bw";
|
||||
static const char *kDevErrCntSDMAFName = "ras/sdma_err_count";
|
||||
static const char *kDevErrCntUMCFName = "ras/umc_err_count";
|
||||
static const char *kDevErrCntGFXFName = "ras/gfx_err_count";
|
||||
|
||||
// Strings that are found within sysfs files
|
||||
static const char *kDevPerfLevelAutoStr = "auto";
|
||||
static const char *kDevPerfLevelLowStr = "low";
|
||||
static const char *kDevPerfLevelHighStr = "high";
|
||||
@@ -92,6 +98,9 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
|
||||
{kDevPowerODVoltage, kDevPowerODVoltageFName},
|
||||
{kDevVBiosVer, kDevVBiosVerFName},
|
||||
{kDevPCIEThruPut, kDevPCIEThruPutFName},
|
||||
{kDevErrCntSDMA, kDevErrCntSDMAFName},
|
||||
{kDevErrCntUMC, kDevErrCntUMCFName},
|
||||
{kDevErrCntGFX, kDevErrCntGFXFName},
|
||||
};
|
||||
|
||||
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
|
||||
@@ -322,6 +331,9 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
|
||||
case kDevPCIEClk:
|
||||
case kDevPowerProfileMode:
|
||||
case kDevPowerODVoltage:
|
||||
case kDevErrCntSDMA:
|
||||
case kDevErrCntUMC:
|
||||
case kDevErrCntGFX:
|
||||
return readDevInfoMultiLineStr(type, val);
|
||||
break;
|
||||
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
|
||||
Исполняемый файл
+116
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2019, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi_test/functional/err_cnt_read.h"
|
||||
#include "rocm_smi_test/test_common.h"
|
||||
|
||||
// Sets the title and the description that are reported for this test.
TestErrCntRead::TestErrCntRead() : TestBase() {
  set_title("RSMI Error Count Read Test");
  set_description("The Error Count Read test verifies that error counts"
                  " can be read properly.");
}
|
||||
|
||||
// Nothing to release here.
TestErrCntRead::~TestErrCntRead() {}
|
||||
|
||||
// Runs the common TestBase set-up; this test adds no set-up of its own.
void TestErrCntRead::SetUp() {
  TestBase::SetUp();
}
|
||||
|
||||
// Forwards to TestBase::DisplayTestInfo().
void TestErrCntRead::DisplayTestInfo() {
  TestBase::DisplayTestInfo();
}
|
||||
|
||||
// Forwards to TestBase::DisplayResults().
void TestErrCntRead::DisplayResults() const {
  TestBase::DisplayResults();
}
|
||||
|
||||
void TestErrCntRead::Close(void) {
  // TestBase::Close() closes the handles opened within rsmitst utility calls
  // and calls rsmi_shut_down(), so it has to come after any other cleanup.
  TestBase::Close();
}
|
||||
|
||||
|
||||
void TestErrCntRead::Run(void) {
|
||||
rsmi_status_t err;
|
||||
rsmi_error_count_t ec;
|
||||
|
||||
TestBase::Run();
|
||||
|
||||
|
||||
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
|
||||
PrintDeviceHeader(i);
|
||||
|
||||
for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; ++b) {
|
||||
err = rsmi_dev_error_count_get(i, static_cast<rsmi_gpu_block>(b), &ec);
|
||||
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Error Count for " <<
|
||||
GetBlockNameStr(static_cast<rsmi_gpu_block>(b)) <<
|
||||
": Not supported on this machine" << std::endl;
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Error counts for " <<
|
||||
GetBlockNameStr(static_cast<rsmi_gpu_block>(b)) << " block: "
|
||||
<< std::endl;
|
||||
std::cout << "\t\tCorrectable errors: " << ec.correctable_err
|
||||
<< std::endl;
|
||||
std::cout << "\t\tUncorrectable errors: " << ec.uncorrectable_err
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Исполняемый файл
+71
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2019, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_
|
||||
#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_
|
||||
|
||||
#include "rocm_smi_test/test_base.h"
|
||||
|
||||
// Test case that reads the error counts through the RSMI API.
class TestErrCntRead : public TestBase {
 public:
  TestErrCntRead();

  // @Brief: Destructor for the TestErrCntRead test case
  virtual ~TestErrCntRead();

  // @Brief: Set up the environment for the test
  virtual void SetUp();

  // @Brief: Core test execution
  virtual void Run();

  // @Brief: Clean up and release the resources
  virtual void Close();

  // @Brief: Display results
  virtual void DisplayResults() const;

  // @Brief: Display information about what this test does
  virtual void DisplayTestInfo(void);
};
|
||||
|
||||
#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_
|
||||
@@ -68,6 +68,7 @@
|
||||
#include "functional/power_read_write.h"
|
||||
#include "functional/power_cap_read_write.h"
|
||||
#include "functional/version_read.h"
|
||||
#include "functional/err_cnt_read.h"
|
||||
|
||||
static RSMITstGlobals *sRSMIGlvalues = nullptr;
|
||||
|
||||
@@ -181,7 +182,10 @@ TEST(rsmitstReadWrite, TestPowerCapReadWrite) {
|
||||
TestPowerCapReadWrite tst;
|
||||
RunGenericTest(&tst);
|
||||
}
|
||||
|
||||
// Runs the error count read test through the generic test driver.
TEST(rsmitstReadOnly, TestErrCntRead) {
  TestErrCntRead err_cnt_test;
  RunGenericTest(&err_cnt_test);
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
|
||||
@@ -49,9 +49,17 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
#include "rocm_smi_test/test_base.h"
|
||||
#include "rocm_smi_test/test_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
static const std::map<rsmi_gpu_block, const char *> kBlockNameMap = {
|
||||
{RSMI_GPU_BLOCK_UMC, "UMC"},
|
||||
{RSMI_GPU_BLOCK_SDMA, "SDMA"},
|
||||
{RSMI_GPU_BLOCK_GFX, "GFX"},
|
||||
};
|
||||
|
||||
static const struct option long_options[] = {
|
||||
{"iterations", required_argument, nullptr, 'i'},
|
||||
@@ -123,6 +131,12 @@ uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Returns the printable name stored for |id| in kBlockNameMap.
// The lookup uses std::map::at(), which throws std::out_of_range when |id|
// has no entry in the map.
const char *GetBlockNameStr(rsmi_gpu_block id) {
  const char *block_name = kBlockNameMap.at(id);
  return block_name;
}
|
||||
|
||||
|
||||
|
||||
#if ENABLE_SMI
|
||||
void DumpMonitorInfo(const TestBase *test) {
|
||||
int ret = 0;
|
||||
|
||||
@@ -48,9 +48,8 @@
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#if ENABLE_SMI
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#endif
|
||||
|
||||
struct RSMITstGlobals {
|
||||
uint32_t verbosity;
|
||||
@@ -62,6 +61,7 @@ struct RSMITstGlobals {
|
||||
uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list);
|
||||
|
||||
void PrintTestHeader(uint32_t dv_ind);
|
||||
const char *GetBlockNameStr(rsmi_gpu_block id);
|
||||
|
||||
#if ENABLE_SMI
|
||||
void DumpMonitorInfo(const TestBase *test);
|
||||
|
||||
Ссылка в новой задаче
Block a user