diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h
index 007d9cb930..4434e93f39 100755
--- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h
+++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h
@@ -463,6 +463,27 @@ typedef uint64_t rsmi_bit_field_t;
 typedef rsmi_bit_field_t rsmi_bit_field;
 /// \endcond
 
+/**
+ * @brief States reported for a reserved ("retired") memory page
+ */
+typedef enum {
+  RSMI_MEM_PAGE_STATUS_RESERVED = 0,  //!< Reserved (code 'R'). This gpu page
+                                      //!<  is not available for use
+  RSMI_MEM_PAGE_STATUS_PENDING,       //!< Pending (code 'P'). This gpu page is
+                                      //!<  marked as bad and will be marked
+                                      //!<  reserved at the next window.
+  RSMI_MEM_PAGE_STATUS_UNRESERVABLE   //!< Unable to reserve this page ('F')
+} rsmi_memory_page_status_t;
+
+/**
+ * @brief One reserved ("retired") memory page record
+ */
+typedef struct {
+  uint64_t page_address;                  //!< Start address of page
+  uint64_t page_size;                     //!< Page size
+  rsmi_memory_page_status_t status;       //!< Page "reserved" status
+} rsmi_retired_page_record_t;
+
 /**
  * @brief Number of possible power profiles that a system could support
  */
@@ -1228,6 +1249,38 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
 rsmi_status_t
 rsmi_dev_memory_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent);
 
+/**
+ * @brief Get information about reserved ("retired") memory pages
+ *
+ * @details Given a device index @p dv_ind, this function returns retired page
+ * information @p records corresponding to the device with the provided device
+ * index @p dv_ind. The number of retired page records is returned through @p
+ * num_pages. @p records may be NULL on input. In this case, the number of
+ * records available for retrieval will be returned through @p num_pages.
+ *
+ * @param[in] dv_ind a device index
+ *
+ * @param[inout] num_pages a pointer to a uint32.
As input, the value passed
+ * through this parameter is the number of ::rsmi_retired_page_record_t's that
+ * may be safely written to the memory pointed to by @p records. This is the
+ * limit on how many records will be written to @p records. On return, @p
+ * num_pages will contain the number of records written to @p records, or the
+ * number of records that could have been written if enough memory had been
+ * provided.
+ *
+ * @param[out] records A pointer to a block of memory to which the
+ * ::rsmi_retired_page_record_t values will be written. This value may be NULL.
+ * In this case, this function can be used to query how many records are
+ * available to read.
+ *
+ * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
+ *
+ * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if more records were
+ * available than allowed by the provided, allocated memory.
+ */
+rsmi_status_t
+rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages,
+                                         rsmi_retired_page_record_t *records);
 /** @} */  // end of MemQuer
 
 /** @defgroup PhysQuer Physical State Queries
diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h
index 9905835582..73ec27de69 100755
--- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h
+++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h
@@ -119,6 +119,7 @@ enum DevInfoTypes {
   kDevFwVersionVce,
   kDevFwVersionVcn,
   kDevSerialNumber,
+  kDevMemPageBad,
 };
 
 class Device {
diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc
index dc510bcd90..2b3953af80 100755
--- a/projects/rocm-smi-lib/src/rocm_smi.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi.cc
@@ -2477,6 +2477,85 @@ rsmi_compute_process_info_get(rsmi_process_info_t *procs,
   CATCH
 }
 
+rsmi_status_t
+rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages,
+                                       rsmi_retired_page_record_t *records) {
+  TRY
+
+  if (num_pages == nullptr) {
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+
+  // Raw text lines read from the device's bad-page file
+  std::vector<std::string> val_vec;
+  rsmi_status_t ret =
+           get_dev_value_vec(amd::smi::kDevMemPageBad, dv_ind, &val_vec);
+
+  // A file error is reported to the caller as "not supported"
+  if (ret == RSMI_STATUS_FILE_ERROR) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+  if (ret != RSMI_STATUS_SUCCESS) {
+    return ret;
+  }
+
+  const uint32_t num_avail = static_cast<uint32_t>(val_vec.size());
+
+  // Query-only call: just report how many records are available
+  if (records == nullptr) {
+    *num_pages = num_avail;
+    return RSMI_STATUS_SUCCESS;
+  }
+  // Never write more records than were read
+  if (*num_pages > num_avail) {
+    *num_pages = num_avail;
+  }
+
+  for (uint32_t i = 0; i < *num_pages; ++i) {
+    // Each line is parsed as "<hex address> : <hex size> : <status char>"
+    std::istringstream fs1(val_vec[i]);
+    std::string sep1;
+    std::string sep2;
+    char status_code = '\0';
+
+    fs1 >> std::hex >> records[i].page_address >> sep1;
+    fs1 >> std::hex >> records[i].page_size >> sep2;
+    fs1 >> status_code;
+
+    // Validate at run time: assert() is compiled out when NDEBUG is defined,
+    // and a failed extraction would otherwise leave garbage in the record.
+    if (fs1.fail() || sep1 != ":" || sep2 != ":") {
+      return RSMI_STATUS_UNKNOWN_ERROR;
+    }
+
+    switch (status_code) {
+      case 'P':
+        records[i].status = RSMI_MEM_PAGE_STATUS_PENDING;
+        break;
+
+      case 'F':
+        records[i].status = RSMI_MEM_PAGE_STATUS_UNRESERVABLE;
+        break;
+
+      case 'R':
+        records[i].status = RSMI_MEM_PAGE_STATUS_RESERVED;
+        break;
+
+      default:
+        return RSMI_STATUS_UNKNOWN_ERROR;
+    }
+  }
+
+  // The caller's buffer could not hold every available record
+  if (*num_pages < num_avail) {
+    return RSMI_STATUS_INSUFFICIENT_SIZE;
+  }
+
+  return RSMI_STATUS_SUCCESS;
+
+  CATCH
+}
+
 rsmi_status_t
 rsmi_compute_process_info_by_pid_get(uint32_t pid,
                                               rsmi_process_info_t *proc) {
diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc
index 5090244efe..5d4ce010d2 100755
--- a/projects/rocm-smi-lib/src/rocm_smi_device.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc
@@ -91,6 +91,7 @@ static const char *kDevErrCntSDMAFName = "ras/sdma_err_count";
 static const char *kDevErrCntUMCFName = "ras/umc_err_count";
 static const char *kDevErrCntGFXFName = "ras/gfx_err_count";
 static const char *kDevErrCntFeaturesFName = "ras/features";
+static const char *kDevMemPageBadFName = "ras/gpu_vram_bad_pages";
static const char *kDevMemTotGTTFName = "mem_info_gtt_total"; static const char *kDevMemTotVisVRAMFName = "mem_info_vis_vram_total"; static const char *kDevMemTotVRAMFName = "mem_info_vram_total"; @@ -194,6 +195,7 @@ static const std::map kDevAttribNameMap = { {kDevFwVersionVce, kDevFwVersionVceFName}, {kDevFwVersionVcn, kDevFwVersionVcnFName}, {kDevSerialNumber, kDevSerialNumberFName}, + {kDevMemPageBad, kDevMemPageBadFName}, }; static const std::map kDevPerfLvlMap = { @@ -394,7 +396,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type, } if (retVec->size() == 0) { - return EPERM; + return 0; } // Remove any *trailing* empty (whitespace) lines while (retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) { @@ -484,6 +486,7 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { case kDevErrCntUMC: case kDevErrCntGFX: case kDevErrCntFeatures: + case kDevMemPageBad: return readDevInfoMultiLineStr(type, val); break; diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mem_page_info_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mem_page_info_read.cc new file mode 100755 index 0000000000..874bb38c71 --- /dev/null +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mem_page_info_read.cc @@ -0,0 +1,157 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2019, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. 
+ *
+ *                 www.amd.com
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal with the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ *  THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ *  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ *  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *  DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "rocm_smi/rocm_smi.h"
+#include "rocm_smi_test/functional/mem_page_info_read.h"
+#include "rocm_smi_test/test_common.h"
+
+TestMemPageInfoRead::TestMemPageInfoRead() : TestBase() {
+  set_title("RSMI Memory Page Info Test");
+  set_description("The Memory Page Info. test verifies that we can read "
+      "memory page information, and then displays the information read");
+}
+
+TestMemPageInfoRead::~TestMemPageInfoRead(void) {
+}
+
+void TestMemPageInfoRead::SetUp(void) {
+  TestBase::SetUp();
+
+  return;
+}
+
+void TestMemPageInfoRead::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void TestMemPageInfoRead::DisplayResults(void) const {
+  TestBase::DisplayResults();
+  return;
+}
+
+void TestMemPageInfoRead::Close() {
+  // This will close handles opened within rsmitst utility calls and call
+  // rsmi_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+void TestMemPageInfoRead::Run(void) {
+  rsmi_status_t err;
+  uint32_t num_pages = 0;
+
+  TestBase::Run();
+
+  for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
+    PrintDeviceHeader(i);
+
+    // A null record buffer makes this call report only the record count
+    err = rsmi_dev_memory_reserved_pages_get(i, &num_pages, nullptr);
+
+    if (err == RSMI_STATUS_NOT_SUPPORTED) {
+      std::cout <<
+         "\t**Memory page information is not supported for this device"
+                                                               << std::endl;
+      continue;
+    } else {
+      CHK_ERR_ASRT(err)
+      IF_VERB(STANDARD) {
+        std::cout << "\tNumber of memory page records: " << num_pages <<
+                                                                 std::endl;
+      }
+    }
+
+    if (num_pages == 0) {
+      continue;
+    }
+
+    // The vector frees its storage on every way out of this iteration; the
+    // earlier new[]/delete[] pair leaked on the "continue" path below.
+    std::vector<rsmi_retired_page_record_t> records(num_pages);
+
+    err = rsmi_dev_memory_reserved_pages_get(i, &num_pages, records.data());
+    if (err == RSMI_STATUS_NOT_SUPPORTED) {
+      std::cout << "\t**Getting Memory Page Retirement Status not "
+                                   "supported for this device" << std::endl;
+      continue;
+    } else {
+      CHK_ERR_ASRT(err)
+    }
+
+    IF_VERB(STANDARD) {
+      std::cout.setf(std::ios::hex, std::ios::basefield);
+      std::string page_state;
+
+      for (uint32_t p = 0; p < num_pages; ++p) {
+        std::cout << "\t\tAddress: 0x" << records[p].page_address;
+        std::cout << " Size: " << records[p].page_size;
+
+        switch (records[p].status) {
+          case RSMI_MEM_PAGE_STATUS_RESERVED:
+            page_state = "Retired";
+            break;
+
+          case RSMI_MEM_PAGE_STATUS_PENDING:
+            page_state = "Pending";
+            break;
+
+          case RSMI_MEM_PAGE_STATUS_UNRESERVABLE:
+            page_state = "Unreservable";
+            break;
+
+          default:
+            ASSERT_EQ(0, 1) << "Unexpected memory page status";
+        }
+        std::cout << " Status: " << page_state << std::endl;
+      }
+      std::cout.setf(std::ios::dec, std::ios::basefield);
+    }
+  }
+}
diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mem_page_info_read.h b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mem_page_info_read.h
new file mode 100755
index 0000000000..76a25c349b
--- /dev/null
+++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mem_page_info_read.h
@@ -0,0 +1,71 @@
+/*
+ * =============================================================================
+ *   The University of Illinois/NCSA
+ *   Open Source License (NCSA)
+ *
+ *  Copyright (c) 2019, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ *  Permission is hereby granted, free of charge, to any person obtaining a copy
+ *  of this software and associated documentation files (the "Software"), to
+ *  deal with the Software without restriction, including without limitation
+ *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ *  and/or sell copies of the Software, and to permit persons to whom the
+ *  Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ *  THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ *  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ *  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ *  DEALINGS WITH THE SOFTWARE.
+ *
+ */
+#ifndef TESTS_ROCM_SMI_TEST_FUNCTIONAL_MEM_PAGE_INFO_READ_H_
+#define TESTS_ROCM_SMI_TEST_FUNCTIONAL_MEM_PAGE_INFO_READ_H_
+
+#include "rocm_smi_test/test_base.h"
+
+class TestMemPageInfoRead : public TestBase {
+ public:
+    TestMemPageInfoRead();
+
+  // @Brief: Destructor for test case of TestMemPageInfoRead
+  virtual ~TestMemPageInfoRead();
+
+  // @Brief: Set up the environment for the test (delegates to TestBase)
+  virtual void SetUp();
+
+  // @Brief: Query and display retired memory page records for each device
+  virtual void Run();
+
+  // @Brief: Clean up and release the resources used by the test
+  virtual void Close();
+
+  // @Brief: Display results
+  virtual void DisplayResults() const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+};
+
+#endif  // TESTS_ROCM_SMI_TEST_FUNCTIONAL_MEM_PAGE_INFO_READ_H_
diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc
index 2d8b954f09..5a3290692f 100755
--- a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc
+++ b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc
@@ -75,6 +75,7 @@
 #include "functional/perf_cntr_read_write.h"
 #include "functional/process_info_read.h"
 #include "functional/xgmi_read_write.h"
+#include "functional/mem_page_info_read.h"
 
 static RSMITstGlobals *sRSMIGlvalues = nullptr;
 
@@
-213,6 +214,10 @@ TEST(rsmitstReadWrite, TestXGMIReadWrite) {
   TestXGMIReadWrite tst;
   RunGenericTest(&tst);
 }
+TEST(rsmitstReadOnly, TestMemPageInfoRead) {  // retired memory page query
+  TestMemPageInfoRead tst;
+  RunGenericTest(&tst);
+}
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);