From 0f27774440380db3312fbbf3bc43e8bbd3b459f3 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Fri, 1 Mar 2019 16:33:11 -0600 Subject: [PATCH] Added rsmi_dev_error_count_get() [ROCm/amdsmi commit: fb5f41fc10fb65c3931ff7470aeb586b4ee0b1a7] --- projects/amdsmi/include/rocm_smi/rocm_smi.h | 61 +++++++-- .../amdsmi/include/rocm_smi/rocm_smi_device.h | 5 + .../amdsmi/include/rocm_smi/rocm_smi_utils.h | 2 - projects/amdsmi/src/rocm_smi.cc | 58 ++++++++- projects/amdsmi/src/rocm_smi_device.cc | 12 ++ projects/amdsmi/src/rocm_smi_main.cc | 3 - .../rocm_smi_test/functional/err_cnt_read.cc | 116 ++++++++++++++++++ .../rocm_smi_test/functional/err_cnt_read.h | 71 +++++++++++ projects/amdsmi/tests/rocm_smi_test/main.cc | 6 +- .../amdsmi/tests/rocm_smi_test/test_common.cc | 14 +++ .../amdsmi/tests/rocm_smi_test/test_common.h | 4 +- 11 files changed, 334 insertions(+), 18 deletions(-) create mode 100755 projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc create mode 100755 projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.h diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index 6fc2a41644..303607e4b0 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -206,6 +206,28 @@ typedef enum { RSMI_PWR_PROF_PRST_INVALID = 0xFFFFFFFFFFFFFFFF } rsmi_power_profile_preset_masks; +/** + * @brief This enum is used to identify different GPU blocks. + */ +typedef enum { + RSMI_GPU_BLOCK_FIRST = 0, + + RSMI_GPU_BLOCK_UMC = RSMI_GPU_BLOCK_FIRST, + RSMI_GPU_BLOCK_SDMA, + RSMI_GPU_BLOCK_GFX, + + RSMI_GPU_BLOCK_LAST = RSMI_GPU_BLOCK_GFX, +} rsmi_gpu_block; + +/** + * @brief The values of this enum are used as frequency identifiers. 
+ */ +typedef enum { + RSMI_FREQ_IND_MIN = 0, //!< Index used for the minimum frequency value + RSMI_FREQ_IND_MAX = 1, //!< Index used for the maximum frequency value + RSMI_FREQ_IND_INVALID = 0xFFFFFFFF //!< An invalid frequency index +} rsmi_freq_ind; + /** * @brief Bitfield used in various RSMI calls */ @@ -342,13 +364,13 @@ typedef struct { } rsmi_od_volt_freq_data; /** - * @brief This values of this enum are used as frequency identifiers. + * @brief This structure holds error counts. */ -typedef enum { - RSMI_FREQ_IND_MIN = 0, //!< Index used for the minimum frequency value - RSMI_FREQ_IND_MAX = 1, //!< Index used for the maximum frequency value - RSMI_FREQ_IND_INVALID = 0xFFFFFFFF //!< An invalid frequency index -} rsmi_freq_ind; +typedef struct { + uint64_t correctable_err; //!< Accumulated correctable errors + uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors +} rsmi_error_count_t; + /** * @brief Initialize ROCm SMI. @@ -487,9 +509,8 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid); * * @param[inout] max_pkt_sz a pointer to uint64_t to which the maximum packet * size will be written. If pointer is NULL, it will be ignored. - * + * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - */ rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz); @@ -687,6 +708,9 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len); * specified temperature sensor on the specified device. * * @details Given a device index @p dv_ind, a 0-based sensor index + * @p sensor_ind, a ::rsmi_temperature_metric @p metric and a pointer to an + * int64_t @p temperature, this function will write the value of the metric + * indicated by @p metric to the memory location @p temperature. 
* * @param[in] dv_ind a device index * @@ -704,6 +728,27 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len); */ rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind, rsmi_temperature_metric metric, int64_t *temperature); + +/** + * @brief Retrieve the error counts for a GPU block + * + * @details Given a device index @p dv_ind, an ::rsmi_gpu_block @p block and a + * pointer to an ::rsmi_error_count_t @p ec, this function will write the error + * count values for the GPU block indicated by @p block to memory pointed to by + * @p ec. + * + * @param[in] dv_ind a device index + * + * @param[in] block The block for which error counts should be retrieved + * + * @param[inout] ec A pointer to an ::rsmi_error_count_t to which the error + * counts should be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * + */ +rsmi_status_t rsmi_dev_error_count_get(uint32_t dv_ind, + rsmi_gpu_block block, rsmi_error_count_t *ec); /** * @brief Reset the fan to automatic driver control * diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index 426c326832..2cb1f48ee2 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -70,6 +70,11 @@ enum DevInfoTypes { kDevPowerODVoltage, kDevVBiosVer, kDevPCIEThruPut, + kDevErrCntSDMA, + kDevErrCntUMC, + kDevErrCntGFX, + // Reserve spots for future ErrCnt blocks. 
+ // Next, non-ErrCnt device enum should start at 100 }; class Device { diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index 57c0754536..15bae95e14 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -1,6 +1,4 @@ /* - * ============================================================================= - * ROC Runtime Conformance Release License * ============================================================================= * The University of Illinois/NCSA * Open Source License (NCSA) diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 44449fa1bc..382533c1ea 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -1,6 +1,4 @@ /* - * ============================================================================= - * ROC Runtime Conformance Release License * ============================================================================= * The University of Illinois/NCSA * Open Source License (NCSA) @@ -433,6 +431,62 @@ rsmi_num_monitor_devices(uint32_t *num_devices) { CATCH } +rsmi_status_t +rsmi_dev_error_count_get(uint32_t dv_ind, rsmi_gpu_block block, + rsmi_error_count_t *ec) { + std::vector val_vec; + rsmi_status_t ret; + + TRY + if (ec == nullptr || block > RSMI_GPU_BLOCK_LAST) { + return RSMI_STATUS_INVALID_ARGS; + } + + amd::smi::DevInfoTypes type; + switch (block) { + case RSMI_GPU_BLOCK_UMC: + type = amd::smi::kDevErrCntUMC; + break; + + case RSMI_GPU_BLOCK_SDMA: + type = amd::smi::kDevErrCntSDMA; + break; + + case RSMI_GPU_BLOCK_GFX: + type = amd::smi::kDevErrCntGFX; + break; + + default: + assert(!"Unsupported block provided to rsmi_dev_error_count_get()"); + return RSMI_STATUS_NOT_SUPPORTED; + } + ret = get_dev_value_vec(type, dv_ind, &val_vec); + + if (ret == RSMI_STATUS_FILE_ERROR) { + return RSMI_STATUS_NOT_SUPPORTED; + } + if (ret != RSMI_STATUS_SUCCESS) { + 
return ret; + } + + assert(val_vec.size() == 2); + + std::string junk; + std::istringstream fs1(val_vec[0]); + + fs1 >> junk; + assert(junk == "ue:"); + fs1 >> ec->uncorrectable_err; + + std::istringstream fs2(val_vec[1]); + + fs2 >> junk; + assert(junk == "ce:"); + fs2 >> ec->correctable_err; + + return ret; + CATCH +} rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { TRY diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 47bcacbb9e..e5f86be33c 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -59,6 +59,7 @@ namespace amd { namespace smi { +// Sysfs file names static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; static const char *kDevDevIDFName = "device"; static const char *kDevOverDriveLevelFName = "pp_sclk_od"; @@ -70,6 +71,11 @@ static const char *kDevPowerODVoltageFName = "pp_od_clk_voltage"; static const char *kDevUsageFName = "gpu_busy_percent"; static const char *kDevVBiosVerFName = "vbios_version"; static const char *kDevPCIEThruPutFName = "pcie_bw"; +static const char *kDevErrCntSDMAFName = "ras/sdma_err_count"; +static const char *kDevErrCntUMCFName = "ras/umc_err_count"; +static const char *kDevErrCntGFXFName = "ras/gfx_err_count"; + +// Strings that are found within sysfs files static const char *kDevPerfLevelAutoStr = "auto"; static const char *kDevPerfLevelLowStr = "low"; static const char *kDevPerfLevelHighStr = "high"; @@ -92,6 +98,9 @@ static const std::map kDevAttribNameMap = { {kDevPowerODVoltage, kDevPowerODVoltageFName}, {kDevVBiosVer, kDevVBiosVerFName}, {kDevPCIEThruPut, kDevPCIEThruPutFName}, + {kDevErrCntSDMA, kDevErrCntSDMAFName}, + {kDevErrCntUMC, kDevErrCntUMCFName}, + {kDevErrCntGFX, kDevErrCntGFXFName}, }; static const std::map kDevPerfLvlMap = { @@ -322,6 +331,9 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { case kDevPCIEClk: case kDevPowerProfileMode: case 
kDevPowerODVoltage: + case kDevErrCntSDMA: + case kDevErrCntUMC: + case kDevErrCntGFX: return readDevInfoMultiLineStr(type, val); break; diff --git a/projects/amdsmi/src/rocm_smi_main.cc b/projects/amdsmi/src/rocm_smi_main.cc index fc78e77d40..4c7d5f96d5 100755 --- a/projects/amdsmi/src/rocm_smi_main.cc +++ b/projects/amdsmi/src/rocm_smi_main.cc @@ -1,7 +1,4 @@ /* - * ============================================================================= - * ROC Runtime Conformance Release License - * ============================================================================= * The University of Illinois/NCSA * Open Source License (NCSA) * diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc new file mode 100755 index 0000000000..efa0f2e520 --- /dev/null +++ b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc @@ -0,0 +1,116 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2019, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#include +#include + +#include + +#include "gtest/gtest.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi_test/functional/err_cnt_read.h" +#include "rocm_smi_test/test_common.h" + +TestErrCntRead::TestErrCntRead() : TestBase() { + set_title("RSMI Error Count Read Test"); + set_description("The Error Count Read tests verifies that error counts" + " can be read properly."); +} + +TestErrCntRead::~TestErrCntRead(void) { +} + +void TestErrCntRead::SetUp(void) { + TestBase::SetUp(); + + return; +} + +void TestErrCntRead::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestErrCntRead::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestErrCntRead::Close() { + // This will close handles opened within rsmitst utility calls and call + // rsmi_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + + +void TestErrCntRead::Run(void) { + rsmi_status_t err; + rsmi_error_count_t ec; + + TestBase::Run(); + + + for (uint32_t i = 0; i < num_monitor_devs(); ++i) { + PrintDeviceHeader(i); + + for 
(uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; ++b) { + err = rsmi_dev_error_count_get(i, static_cast(b), &ec); + + if (err == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << "\t**Error Count for " << + GetBlockNameStr(static_cast(b)) << + ": Not supported on this machine" << std::endl; + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Error counts for " << + GetBlockNameStr(static_cast(b)) << " block: " + << std::endl; + std::cout << "\t\tCorrectable errors: " << ec.correctable_err + << std::endl; + std::cout << "\t\tUncorrectable errors: " << ec.uncorrectable_err + << std::endl; + } + } + } + } +} diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.h b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.h new file mode 100755 index 0000000000..3653ebebe0 --- /dev/null +++ b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.h @@ -0,0 +1,71 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2019, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ +#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_ +#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_ + +#include "rocm_smi_test/test_base.h" + +class TestErrCntRead : public TestBase { + public: + TestErrCntRead(); + + // @Brief: Destructor for test case of TestErrCntRead + virtual ~TestErrCntRead(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_ERR_CNT_READ_H_ diff --git a/projects/amdsmi/tests/rocm_smi_test/main.cc b/projects/amdsmi/tests/rocm_smi_test/main.cc index 0ff9b48307..369671cfea 100755 --- a/projects/amdsmi/tests/rocm_smi_test/main.cc +++ b/projects/amdsmi/tests/rocm_smi_test/main.cc @@ -68,6 +68,7 @@ #include "functional/power_read_write.h" #include 
"functional/power_cap_read_write.h" #include "functional/version_read.h" +#include "functional/err_cnt_read.h" static RSMITstGlobals *sRSMIGlvalues = nullptr; @@ -181,7 +182,10 @@ TEST(rsmitstReadWrite, TestPowerCapReadWrite) { TestPowerCapReadWrite tst; RunGenericTest(&tst); } - +TEST(rsmitstReadOnly, TestErrCntRead) { + TestErrCntRead tst; + RunGenericTest(&tst); +} int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/projects/amdsmi/tests/rocm_smi_test/test_common.cc b/projects/amdsmi/tests/rocm_smi_test/test_common.cc index 4869cbf158..b662dc9c12 100755 --- a/projects/amdsmi/tests/rocm_smi_test/test_common.cc +++ b/projects/amdsmi/tests/rocm_smi_test/test_common.cc @@ -49,9 +49,17 @@ #include #include +#include #include "rocm_smi_test/test_base.h" #include "rocm_smi_test/test_common.h" +#include "rocm_smi/rocm_smi.h" + +static const std::map kBlockNameMap = { + {RSMI_GPU_BLOCK_UMC, "UMC"}, + {RSMI_GPU_BLOCK_SDMA, "SDMA"}, + {RSMI_GPU_BLOCK_GFX, "GFX"}, +}; static const struct option long_options[] = { {"iterations", required_argument, nullptr, 'i'}, @@ -123,6 +131,12 @@ uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list) { return 0; } +const char *GetBlockNameStr(rsmi_gpu_block id) { + return kBlockNameMap.at(id); +} + + + #if ENABLE_SMI void DumpMonitorInfo(const TestBase *test) { int ret = 0; diff --git a/projects/amdsmi/tests/rocm_smi_test/test_common.h b/projects/amdsmi/tests/rocm_smi_test/test_common.h index 5769153f05..58368beb83 100755 --- a/projects/amdsmi/tests/rocm_smi_test/test_common.h +++ b/projects/amdsmi/tests/rocm_smi_test/test_common.h @@ -48,9 +48,8 @@ #include #include -#if ENABLE_SMI + #include "rocm_smi/rocm_smi.h" -#endif struct RSMITstGlobals { uint32_t verbosity; @@ -62,6 +61,7 @@ struct RSMITstGlobals { uint32_t ProcessCmdline(RSMITstGlobals* test, int arg_cnt, char** arg_list); void PrintTestHeader(uint32_t dv_ind); +const char *GetBlockNameStr(rsmi_gpu_block id); #if ENABLE_SMI 
void DumpMonitorInfo(const TestBase *test);