diff --git a/projects/amdsmi/docs/ROCm_SMI_Manual.pdf b/projects/amdsmi/docs/ROCm_SMI_Manual.pdf index 7f65a42ac1..955d870b61 100644 Binary files a/projects/amdsmi/docs/ROCm_SMI_Manual.pdf and b/projects/amdsmi/docs/ROCm_SMI_Manual.pdf differ diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index f270bcd36d..49fa374d4a 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -402,7 +402,7 @@ typedef enum { } rsmi_memory_type_t; /** - * @brief This values of this enum are used as frequency identifiers. + * @brief The values of this enum are used as frequency identifiers. */ typedef enum { RSMI_FREQ_IND_MIN = 0, //!< Index used for the minimum frequency value @@ -413,6 +413,15 @@ typedef enum { typedef rsmi_freq_ind_t rsmi_freq_ind; /// \endcond +/** + * @brief XGMI Status + */ +typedef enum { + RSMI_XGMI_STATUS_NO_ERRORS = 0, + RSMI_XGMI_STATUS_ERROR, + RSMI_XGMI_STATUS_MULTIPLE_ERRORS, +} rsmi_xgmi_status_t; + /** * @brief Bitfield used in various RSMI calls */ @@ -1915,6 +1924,48 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_process_info_t *proc); /** @} */ // end of SysInfo +/*****************************************************************************/ +/** @defgroup XGMIInfo XGMI Functions + * These functions are used to configure, query and control XGMI. + * @{ + */ + +/** + * @brief Retrieve the XGMI error status for a device + * + * @details Given a device index @p dv_ind, and a pointer to an + * ::rsmi_xgmi_status_t @p status, this function will write the current XGMI + * error state ::rsmi_xgmi_status_t for the device @p dv_ind to the memory + * pointed to by @p status. + * + * @param[in] dv_ind a device index + * + * @param[inout] status A pointer to an ::rsmi_xgmi_status_t to which the + * XGMI error state should be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
+ * + */ +rsmi_status_t +rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status); + +/** + * @brief Reset the XGMI error status for a device + * + * @details Given a device index @p dv_ind, this function will reset the + * current XGMI error state ::rsmi_xgmi_status_t for the device @p dv_ind to + * rsmi_xgmi_status_t::RSMI_XGMI_STATUS_NO_ERRORS + * + * @param[in] dv_ind a device index + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * + */ +rsmi_status_t +rsmi_dev_xgmi_error_reset(uint32_t dv_ind); + +/** @} */ // end of XGMIInfo + #ifdef __cplusplus } #endif // __cplusplus diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index d92e3014a9..d773453c88 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -96,6 +96,7 @@ enum DevInfoTypes { kDevUniqueId, kDevDFCountersAvailable, kDevMemBusyPercent, + kDevXGMIError, }; class Device { diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 777c298c94..72c260c824 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -2146,6 +2146,10 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) { DEVICE_MUTEX rsmi_status_t ret; + if (unique_id == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id); return ret; @@ -2349,3 +2353,62 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid, CATCH } + +rsmi_status_t +rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) { + TRY + DEVICE_MUTEX + + if (status == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + + rsmi_status_t ret; + uint64_t status_code; + + ret = get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &status_code); + + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + switch (status_code) { + case 0: + *status = RSMI_XGMI_STATUS_NO_ERRORS; 
+ break; + + case 1: + *status = RSMI_XGMI_STATUS_ERROR; + break; + + case 2: + *status = RSMI_XGMI_STATUS_MULTIPLE_ERRORS; + break; + + default: + assert(!"Unexpected XGMI error status read"); + return RSMI_STATUS_UNKNOWN_ERROR; + } + return RSMI_STATUS_SUCCESS; + + CATCH +} + +rsmi_status_t +rsmi_dev_xgmi_error_reset(uint32_t dv_ind) { + TRY + DEVICE_MUTEX + + rsmi_status_t ret; + uint64_t status_code; + /* NOTE(review): this only reads kDevXGMIError and discards the value; no write is issued here. Presumably the read itself clears the error state - TODO confirm. */ + ret = get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &status_code); + + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + return RSMI_STATUS_SUCCESS; + + CATCH +} diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 7065d9486f..a4fcd1160f 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -101,6 +101,7 @@ static const char *kDevPCIEReplayCountFName = "pcie_replay_count"; static const char *kDevUniqueIdFName = "unique_id"; static const char *kDevDFCountersAvailableFName = "df_cntr_avail"; static const char *kDevMemBusyPercentFName = "mem_busy_percent"; +static const char *kDevXGMIErrorFName = "xgmi_error"; // Strings that are found within sysfs files static const char *kDevPerfLevelAutoStr = "auto"; @@ -145,6 +146,7 @@ static const std::map kDevAttribNameMap = { {kDevPCIEReplayCount, kDevPCIEReplayCountFName}, {kDevUniqueId, kDevUniqueIdFName}, {kDevDFCountersAvailable, kDevDFCountersAvailableFName}, + {kDevXGMIError, kDevXGMIErrorFName}, }; static const std::map kDevPerfLvlMap = { @@ -380,6 +382,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevPCIEReplayCount: case kDevDFCountersAvailable: case kDevMemBusyPercent: + case kDevXGMIError: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); *val = std::stoul(tempStr, 0); diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/xgmi_read_write.cc b/projects/amdsmi/tests/rocm_smi_test/functional/xgmi_read_write.cc new file mode 100755 index 0000000000..747ac927c8 --- /dev/null 
+++ b/projects/amdsmi/tests/rocm_smi_test/functional/xgmi_read_write.cc @@ -0,0 +1,123 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2019, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#include +#include + +#include + +#include "gtest/gtest.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi_test/functional/xgmi_read_write.h" +#include "rocm_smi_test/test_common.h" + +TestXGMIReadWrite::TestXGMIReadWrite() : TestBase() { + set_title("RSMI XGMI Read/Write Test"); + set_description("This test verifies that XGMI error counts can be read" + " properly, and that the count can be reset."); +} + +TestXGMIReadWrite::~TestXGMIReadWrite(void) { +} + +void TestXGMIReadWrite::SetUp(void) { + TestBase::SetUp(); + + return; +} + +void TestXGMIReadWrite::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestXGMIReadWrite::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestXGMIReadWrite::Close() { + // This will close handles opened within rsmitst utility calls and call + // rsmi_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + + +void TestXGMIReadWrite::Run(void) { + rsmi_status_t err; + rsmi_xgmi_status_t err_stat; + + TestBase::Run(); + + for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { + PrintDeviceHeader(dv_ind); + + err = rsmi_dev_xgmi_error_status(dv_ind, &err_stat); + + if (err != RSMI_STATUS_SUCCESS) { + if (err == RSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**XGMI Error Status: Not supported on this machine" + << std::endl; + return; + } + } else { + CHK_ERR_ASRT(err) + } + } else { + IF_VERB(STANDARD) { + std::cout << "\t**XGMI Error Status: " << + static_cast(err_stat) << std::endl; + } + } + + // TODO(cfree) We need to find a way to generate xgmi errors so this + // test won't be meaningless + err = rsmi_dev_xgmi_error_reset(dv_ind); 
+ CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Successfully reset XGMI Error Status: " << std::endl; + } + } +} diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/xgmi_read_write.h b/projects/amdsmi/tests/rocm_smi_test/functional/xgmi_read_write.h new file mode 100755 index 0000000000..4541f7a346 --- /dev/null +++ b/projects/amdsmi/tests/rocm_smi_test/functional/xgmi_read_write.h @@ -0,0 +1,73 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2019, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ +#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_ +#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_ + +#include "rocm_smi_test/test_base.h" + +class TestXGMIReadWrite : public TestBase { + public: + TestXGMIReadWrite(); + + // @Brief: Destructor for test case of TestXGMIReadWrite + virtual ~TestXGMIReadWrite(); + + // @Brief: Setup the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // TESTS_ROCM_SMI_TEST_FUNCTIONAL_XGMI_READ_WRITE_H_ diff --git a/projects/amdsmi/tests/rocm_smi_test/main.cc b/projects/amdsmi/tests/rocm_smi_test/main.cc index afce6643e7..2d8b954f09 100755 --- a/projects/amdsmi/tests/rocm_smi_test/main.cc +++ b/projects/amdsmi/tests/rocm_smi_test/main.cc @@ -74,6 +74,7 @@ #include "functional/id_info_read.h" #include "functional/perf_cntr_read_write.h" #include "functional/process_info_read.h" +#include "functional/xgmi_read_write.h" static RSMITstGlobals *sRSMIGlvalues = nullptr; @@ -208,6 +209,10 @@ TEST(rsmitstReadOnly, TestProcInfoRead) { TestProcInfoRead tst; RunGenericTest(&tst); } +TEST(rsmitstReadWrite, TestXGMIReadWrite) { + TestXGMIReadWrite tst; + RunGenericTest(&tst); +} int main(int argc, char** argv) { 
::testing::InitGoogleTest(&argc, argv);