From f504f697e383a6ee9700a1631c74e9d1d103ef0b Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Tue, 4 May 2021 09:49:55 -0400 Subject: [PATCH] The Diagnostic API interface The API interface defines how the caller will use the API. An example also shows how the API can be used. It also defines the RdcDiagnostic module which can load the library dynamically and then dispatch diagnostic test to run. Change-Id: I1e041aab86f7e19338860f5ba65262977f4ea9cb [ROCm/rdc commit: eab3625d65479b037f613b1adb8dc188064aa994] --- projects/rdc/example/CMakeLists.txt | 10 ++ projects/rdc/example/diagnostic_example.cc | 157 ++++++++++++++++++ projects/rdc/include/rdc/rdc.h | 119 +++++++++++++ projects/rdc/include/rdc_lib/RdcDiagnostic.h | 63 +++++++ .../rdc_lib/RdcDiagnosticLibInterface.h | 52 ++++++ projects/rdc/include/rdc_lib/RdcHandler.h | 6 + projects/rdc/include/rdc_lib/RdcModuleMgr.h | 2 + .../rdc_lib/impl/RdcDiagnosticModule.h | 78 +++++++++ .../include/rdc_lib/impl/RdcEmbeddedHandler.h | 5 + .../include/rdc_lib/impl/RdcModuleMgrImpl.h | 5 +- projects/rdc/include/rdc_lib/impl/RdcRasLib.h | 22 ++- projects/rdc/include/rdc_lib/impl/RdcSmiLib.h | 24 ++- .../rdc_lib/impl/RdcStandaloneHandler.h | 5 + .../include/rdc_lib/impl/RdcTelemetryModule.h | 3 +- projects/rdc/rdc_libs/CMakeLists.txt | 7 +- .../rdc_libs/bootstrap/src/RdcBootStrap.cc | 27 +++ .../rdc_libs/rdc/src/RdcDiagnosticModule.cc | 146 ++++++++++++++++ .../rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 16 ++ .../rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc | 24 ++- projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc | 40 +++++ projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc | 39 +++++ .../rdc_libs/rdc/src/RdcTelemetryModule.cc | 5 +- .../rdc_client/src/RdcStandaloneHandler.cc | 11 ++ 23 files changed, 856 insertions(+), 10 deletions(-) create mode 100644 projects/rdc/example/diagnostic_example.cc create mode 100644 projects/rdc/include/rdc_lib/RdcDiagnostic.h create mode 100644 projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h 
create mode 100644 projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h create mode 100644 projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc diff --git a/projects/rdc/example/CMakeLists.txt b/projects/rdc/example/CMakeLists.txt index fa8dc2c740..a3576b6aac 100755 --- a/projects/rdc/example/CMakeLists.txt +++ b/projects/rdc/example/CMakeLists.txt @@ -86,6 +86,16 @@ add_executable(${FIELDVALUE_EXAMPLE_EXE} "${FIELDVALUE_EXAMPLE_SRC_LIST}") target_link_libraries(${FIELDVALUE_EXAMPLE_EXE} pthread dl rdc_bootstrap) +set(DIAGNOSTIC_EXAMPLE_SRC_LIST "${SRC_DIR}/diagnostic_example.cc") +message("DIAGNOSTIC_EXAMPLE_SRC_LIST=${DIAGNOSTIC_EXAMPLE_SRC_LIST}") +set(DIAGNOSTIC_EXAMPLE_EXE "diagnostic") + +link_directories(${LIB_BOOSTRAP_DIR}) + +add_executable(${DIAGNOSTIC_EXAMPLE_EXE} "${DIAGNOSTIC_EXAMPLE_SRC_LIST}") + +target_link_libraries(${DIAGNOSTIC_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") diff --git a/projects/rdc/example/diagnostic_example.cc b/projects/rdc/example/diagnostic_example.cc new file mode 100644 index 0000000000..eb47f008e1 --- /dev/null +++ b/projects/rdc/example/diagnostic_example.cc @@ -0,0 +1,157 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include "rdc/rdc.h" + +static std::string get_test_name(rdc_diag_test_cases_t test_case) { + const std::map test_desc = { + {RDC_DIAG_COMPUTE_PROCESS, "No compute process"}, + {RDC_DIAG_SDMA_QUEUE, "SDMA Queue ready"}, + {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, + {RDC_DIAG_VRAM_CHECK, "VRAM check"}, + {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, + {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, + {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, + {RDC_DIAG_TEST_LAST, "Unknown"} + }; + + auto test_name = test_desc.find(test_case); + if (test_name == test_desc.end()) { + return "Unknown Test"; + } + return test_name->second; +} + +int main(int, char **) { + rdc_status_t result; + rdc_handle_t rdc_handle; + bool standalone = false; + char hostIpAddress[] = {"127.0.0.1:50051"}; + char group_name[] = {"diag_group"}; + + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone? + "Standalone mode selected.\n":"Embedded mode selected.\n"); + + // Init the rdc + result = rdc_init(0); + + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. 
Return: " << + rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle, + nullptr, nullptr, nullptr); + if ( result != RDC_ST_OK ) { + std::cout << "Error connecting to remote rdcd. Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); + if (result != RDC_ST_OK) { + std::cout << "Error starting embedded RDC engine. Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } + + // Now we can use the same API for both standalone and embedded + // (1) create group for all GPUs + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_DEFAULT, + group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " + << rdc_status_string(result); + goto cleanup; + } + + // (2) start to run short diagnostic. + rdc_diag_response_t response; + result = rdc_diagnostic_run(rdc_handle, group_id, + RDC_DIAG_LVL_SHORT, &response); + + if (result != RDC_ST_OK) { + std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. 
Return: " + << rdc_status_string(result); + goto cleanup; + } + + // (3) Check diagnostic results + for (uint32_t i=0 ; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = + response.diag_info[i]; + std::cout << std::setw(22) << std::left + << get_test_name(test_result.test_case) + ":" + << rdc_diagnostic_result_string(test_result.status) << "\n"; + } + + // (4) diagnostic detail information + std::cout <<" =============== Diagnostic Details ==================\n"; + for (uint32_t i=0 ; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = + response.diag_info[i]; + if (test_result.info[0] != '\0') { + std::cout << std::setw(22) << std::left + << get_test_name(test_result.test_case) + ":" + << test_result.info << "\n"; + } + for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) { + const rdc_diag_per_gpu_result_t& gpu_result + = test_result.gpu_results[j]; + if (strlen(gpu_result.gpu_result.msg) > 0) { + std::cout << " GPU " << gpu_result.gpu_index + << " " << gpu_result.gpu_result.msg << "\n"; + } + } + } + // Cleanup consists of shutting down RDC. 
+ cleanup: + std::cout << "Cleaning up.\n"; + if (standalone) + rdc_disconnect(rdc_handle); + else + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; +} diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index e25148abb6..4ef192ecd4 100755 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -394,6 +394,91 @@ typedef struct { uint64_t stop_time; //!< job stop time } rdc_job_group_info_t; +/** + * @brief type of diagnostic level + */ +typedef enum { + RDC_DIAG_LVL_INVALID = 0, //!< invalid level + RDC_DIAG_LVL_SHORT, //!< take a few seconds to run + RDC_DIAG_LVL_MED, //!< take less than 2 minutes to run + RDC_DIAG_LVL_LONG //!< take up to 15 minutes to run +} rdc_diag_level_t; + +/** + * @brief type of diagnostic result + */ +typedef enum { + RDC_DIAG_RESULT_PASS, //!< The diagnostic test passed + RDC_DIAG_RESULT_SKIP, //!< The diagnostic test skipped + RDC_DIAG_RESULT_WARN, //!< The diagnostic test has warnings + RDC_DIAG_RESULT_FAIL //!< The diagnostic test failed +} rdc_diag_result_t; + +/** + * @brief The test cases to run + */ +typedef enum { + RDC_DIAG_TEST_FIRST = 0, + //!< No compute process + RDC_DIAG_COMPUTE_PROCESS = RDC_DIAG_TEST_FIRST, + RDC_DIAG_SDMA_QUEUE, //!< The SDMA Queue is ready + RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready + RDC_DIAG_VRAM_CHECK, //!< Check VRAM + RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory + RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology + RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range + RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS +} rdc_diag_test_cases_t; + +/** + * @brief The maximum test cases to run + */ +const uint32_t MAX_TEST_CASES = RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1; + +/** + * @brief The maximum length of the diagnostic messages + */ +#define MAX_DIAG_MSG_LENGTH 4096 + +/** + * @brief details of the diagnostic errors + */ +typedef struct { + char msg[MAX_DIAG_MSG_LENGTH]; //!< The test result details + uint32_t 
code; //!< The low level error code +} rdc_diag_detail_t; + +/** + * @brief details of the per gpu diagnostic results + */ +typedef struct { + uint32_t gpu_index; //!< The GPU index + rdc_diag_detail_t gpu_result; //!< The detail results +} rdc_diag_per_gpu_result_t; + +/** + * @brief The diagnostic results for all GPUs + */ +typedef struct { + rdc_diag_result_t status; //!< The diagnostic result + rdc_diag_detail_t details; //!< The summary details + rdc_diag_test_cases_t test_case; //!< The test case to run + + uint32_t per_gpu_result_count; //!< How many gpu_results + //!< Result details + rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES]; + + char info[MAX_DIAG_MSG_LENGTH]; //!< Detail information +} rdc_diag_test_result_t; + +/** + * @brief The diagnostic responses for test cases + */ +typedef struct { + uint32_t results_count; + rdc_diag_test_result_t diag_info[MAX_TEST_CASES]; +} rdc_diag_response_t; + /** * @brief Initialize ROCm RDC. * @@ -872,6 +957,29 @@ rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id); +/** + * @brief Run the diagnostic test cases + * + * @details Run the diagnostic test cases at different levels. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[in] level The level decides how long the test will run. + * The RDC_DIAG_LVL_SHORT only takes a few seconds, and + * the RDC_DIAG_LVL_LONG may take up to 15 minutes. + * + * @param[inout] response The detail results of the tests run. + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_diagnostic_run( + rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response); + /** * @brief Get a description of a provided RDC error status * @@ -905,6 +1013,17 @@ const char* field_id_string(rdc_field_t field_id); */ rdc_field_t get_field_id_from_name(const char* name); +/** + * @brief Get a description of a diagnostic result. + * + * @details return the string in human readable format. + * + * @param[in] result The RDC diagnostic result. + * + * @retval The string to describe the RDC diagnostic result. + */ +const char* rdc_diagnostic_result_string(rdc_diag_result_t result); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/projects/rdc/include/rdc_lib/RdcDiagnostic.h b/projects/rdc/include/rdc_lib/RdcDiagnostic.h new file mode 100644 index 0000000000..0f1fef4154 --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcDiagnostic.h @@ -0,0 +1,63 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ +#define INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ + +#include +#include "rdc/rdc.h" +#include "rdc_lib/RdcDiagnosticLibInterface.h" + +namespace amd { +namespace rdc { + +class RdcDiagnostic { + public: + // get support test cases + virtual rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) = 0; + + // Run a specific test case + virtual rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) = 0; + + // Run multiple test cases + virtual rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) = 0; + + virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0; + virtual rdc_status_t rdc_diag_destroy() = 0; + + virtual ~RdcDiagnostic() {} +}; +typedef std::shared_ptr RdcDiagnosticPtr; + +} // namespace rdc +} // namespace amd + + +#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ diff --git a/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h b/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h new file mode 100644 index 0000000000..d833d36648 --- /dev/null +++ b/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h @@ -0,0 +1,52 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ +#define INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ + +// The diagnostic interface for libraries, for example, RAS. 
+#include + + +extern "C" { + +// The library will implement the functions below + +// Which test cases are supported in the library +rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count); + +// Run a specific test case +rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result); + +rdc_status_t rdc_diag_init(uint64_t flags); + +rdc_status_t rdc_diag_destroy(); + +} + + +#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h index 08a570b7f1..a51eb90add 100644 --- a/projects/rdc/include/rdc_lib/RdcHandler.h +++ b/projects/rdc/include/rdc_lib/RdcHandler.h @@ -80,6 +80,12 @@ class RdcHandler { virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) = 0; + // Diagnostic API + virtual rdc_status_t rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) = 0; + // Control API virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0; diff --git a/projects/rdc/include/rdc_lib/RdcModuleMgr.h b/projects/rdc/include/rdc_lib/RdcModuleMgr.h index ac71057fde..cfb577cf1f 100644 --- a/projects/rdc/include/rdc_lib/RdcModuleMgr.h +++ b/projects/rdc/include/rdc_lib/RdcModuleMgr.h @@ -26,6 +26,7 @@ THE SOFTWARE. 
#include "rdc_lib/rdc_common.h" #include "rdc/rdc.h" #include "rdc_lib/RdcTelemetry.h" +#include "rdc_lib/RdcDiagnostic.h" namespace amd { namespace rdc { @@ -33,6 +34,7 @@ namespace rdc { class RdcModuleMgr { public: virtual RdcTelemetryPtr get_telemetry_module() = 0; + virtual RdcDiagnosticPtr get_diagnostic_module() = 0; }; typedef std::shared_ptr RdcModuleMgrPtr; diff --git a/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h b/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h new file mode 100644 index 0000000000..9ad1debb14 --- /dev/null +++ b/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h @@ -0,0 +1,78 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ +#define INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ + +#include +#include +#include +#include +#include "rdc_lib/RdcDiagnostic.h" +#include "rdc_lib/impl/RdcRasLib.h" +#include "rdc_lib/impl/RdcSmiLib.h" + +namespace amd { +namespace rdc { + +class RdcDiagnosticModule : public RdcDiagnostic { + public: + rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; + + // Run a specific test case + rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) override; + + rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; + + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; + + explicit RdcDiagnosticModule(const RdcSmiLibPtr& smi_lib, + const RdcRasLibPtr& ras_module); + + private: + //< Helper function to dispatch fields to module + void get_fields_for_module( + rdc_gpu_field_t* fields, + uint32_t fields_count, + std::map> + & fields_in_module, + std::vector& unsupport_fields); // NOLINT + std::list diagnostic_modules_; + std::map testcases_to_module_; +}; + +typedef std::shared_ptr RdcDiagnosticModulePtr; + +} // namespace rdc +} // namespace amd + + +#endif // INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index 8fa54e4723..12113aad20 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -86,6 +86,11 @@ class RdcEmbeddedHandler: public RdcHandler { uint64_t *next_since_time_stamp, rdc_field_value* value) override; rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; + // 
Diagnostic API + rdc_status_t rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; // Control API rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; diff --git a/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h b/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h index e1467e5e0d..e7bffaad24 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h @@ -27,6 +27,7 @@ THE SOFTWARE. #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" +#include "rdc_lib/impl/RdcSmiLib.h" namespace amd { namespace rdc { @@ -34,14 +35,16 @@ namespace rdc { class RdcModuleMgrImpl: public RdcModuleMgr { public: RdcTelemetryPtr get_telemetry_module() override; + RdcDiagnosticPtr get_diagnostic_module() override; explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher); private: // Function module RdcTelemetryPtr rdc_telemetry_module_; + RdcDiagnosticPtr rdc_diagnostic_module_; // Domain module RdcRasLibPtr ras_lib_; - + RdcSmiLibPtr smi_lib_; RdcMetricFetcherPtr fetcher_; }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h index ae8255e8e7..4864b0bcf1 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h @@ -30,11 +30,12 @@ THE SOFTWARE. 
#include #include "rdc_lib/RdcLibraryLoader.h" #include "rdc_lib/RdcTelemetry.h" +#include "rdc_lib/RdcDiagnostic.h" namespace amd { namespace rdc { -class RdcRasLib: public RdcTelemetry { +class RdcRasLib: public RdcTelemetry, public RdcDiagnostic { public: // get support field ids rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], @@ -51,6 +52,25 @@ class RdcRasLib: public RdcTelemetry { rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) override; + rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; + + // Run a specific test case + rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) override; + + rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; + + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; + explicit RdcRasLib(const char* lib_name); ~RdcRasLib(); diff --git a/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h b/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h index 5f311b6a2b..bed6d7f7e4 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h @@ -26,11 +26,12 @@ THE SOFTWARE. 
#include #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetry.h" +#include "rdc_lib/RdcDiagnostic.h" namespace amd { namespace rdc { -class RdcSmiLib : public RdcTelemetry { +class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic { public: // get support field ids rdc_status_t rdc_telemetry_fields_query( @@ -46,12 +47,33 @@ class RdcSmiLib : public RdcTelemetry { rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) override; + rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; + + // Run a specific test case + rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) override; + + rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; + + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; + explicit RdcSmiLib(const RdcMetricFetcherPtr& mf); private: RdcMetricFetcherPtr metric_fetcher_; }; +typedef std::shared_ptr RdcSmiLibPtr; + } // namespace rdc } // namespace amd diff --git a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h index 439bbbe128..009775d35a 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -80,6 +80,11 @@ class RdcStandaloneHandler: public RdcHandler { uint64_t *next_since_time_stamp, rdc_field_value* value) override; rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; + // Diagnostic API + rdc_status_t rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; // Control RdcAPI rdc_status_t rdc_field_update_all(uint32_t 
wait_for_update) override; diff --git a/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h b/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h index 62c588a374..c9255d6043 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h +++ b/projects/rdc/include/rdc_lib/impl/RdcTelemetryModule.h @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" +#include "rdc_lib/impl/RdcSmiLib.h" #include "rdc_lib/RdcMetricFetcher.h" namespace amd { @@ -48,7 +49,7 @@ class RdcTelemetryModule : public RdcTelemetry { rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count); - RdcTelemetryModule(const RdcMetricFetcherPtr& fetcher, + RdcTelemetryModule(const RdcSmiLibPtr& smi_lib, const RdcRasLibPtr& ras_module); private: diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index e4b244b1e7..6d27f93b01 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -135,6 +135,7 @@ add_library(${BOOTSTRAP_LIB} SHARED ${BOOTSTRAP_LIB_SRC_LIST} ${BOOTSTRAP_LIB_IN target_link_libraries(${BOOTSTRAP_LIB} pthread dl) target_include_directories(${BOOTSTRAP_LIB} PRIVATE "${RSMI_INC_DIR}" + "${ROCM_DIR}/include" "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}" @@ -159,6 +160,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcWatchTableImpl.c set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcRasLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcTelemetryModule.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcDiagnosticModule.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} 
"${SRC_DIR}/rdc/src/RsmiUtils.cc") @@ -182,6 +184,9 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcMod set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcModuleMgr.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcTelemetry.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcTelemetryModule.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnostic.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnosticLibInterface.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcDiagnosticModule.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotification.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h") @@ -190,7 +195,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") -link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64") +link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64" "${ROCM_DIR}/lib") add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST}) target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap) target_include_directories(${RDC_LIB} PRIVATE diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index 624c1ff9a9..db1600bc95 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -322,6 +322,18 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, rdc_group_field_destroy(rdc_field_group_id); } +rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + 
rdc_diag_response_t* response) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_diagnostic_run(group_id, level, response); +} + const char* rdc_status_string(rdc_status_t result) { switch (result) { case RDC_ST_OK: @@ -361,6 +373,21 @@ const char* rdc_status_string(rdc_status_t result) { } } +const char* rdc_diagnostic_result_string(rdc_diag_result_t result) { + switch (result) { + case RDC_DIAG_RESULT_PASS: + return "Pass"; + case RDC_DIAG_RESULT_SKIP: + return "Skip"; + case RDC_DIAG_RESULT_WARN: + return "Warn"; + case RDC_DIAG_RESULT_FAIL: + return "Fail"; + default: + return "Unknown"; + } +} + const char* field_id_string(rdc_field_t field_id) { amd::rdc::fld_id2name_map_t &field_id_to_descript = amd::rdc::get_field_id_description_from_id(); diff --git a/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc b/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc new file mode 100644 index 0000000000..39e65572c4 --- /dev/null +++ b/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc @@ -0,0 +1,146 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "rdc_lib/impl/RdcDiagnosticModule.h" +#include +#include +#include +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/impl/RdcSmiLib.h" +#include "rdc_lib/impl/RdcRasLib.h" + +namespace amd { +namespace rdc { + +rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + auto ite = diagnostic_modules_.begin(); + *test_case_count = 0; + for (; ite != diagnostic_modules_.end(); ite++) { + uint32_t count = 0; + rdc_status_t status = (*ite)->rdc_diag_test_cases_query( + &(test_cases[*test_case_count]), &count); + if (status == RDC_ST_OK) { + *test_case_count += count; + } + } + return RDC_ST_OK; +} + +rdc_status_t RdcDiagnosticModule::rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + // Init test status + auto ite = testcases_to_module_.find(test_case); + if (ite == testcases_to_module_.end()) { + result->status = RDC_DIAG_RESULT_SKIP; + strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH); + return RDC_ST_NOT_SUPPORTED; + } + return ite->second->rdc_test_case_run(test_case, + gpu_index, gpu_count, result); +} + +rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + if (response == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + std::vector rdc_runs; + if (level >= RDC_DIAG_LVL_SHORT) { // Short run and above + rdc_runs.push_back(RDC_DIAG_COMPUTE_PROCESS); + 
rdc_runs.push_back(RDC_DIAG_NODE_TOPOLOGY); + rdc_runs.push_back(RDC_DIAG_GPU_PARAMETERS); + rdc_runs.push_back(RDC_DIAG_COMPUTE_QUEUE); + rdc_runs.push_back(RDC_DIAG_SDMA_QUEUE); + rdc_runs.push_back(RDC_DIAG_VRAM_CHECK); + rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK); + } + + response->results_count = 0; + for (unsigned int i=0; i < rdc_runs.size(); i++) { + response->diag_info[i].test_case = rdc_runs[i]; + rdc_test_case_run(rdc_runs[i], + const_cast(gpus.entity_ids), + gpus.count, &(response->diag_info[i])); + response->results_count++; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcDiagnosticModule::rdc_diag_init(uint64_t flag) { + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + (*ite)->rdc_diag_init(flag); + } + return RDC_ST_OK; +} + +rdc_status_t RdcDiagnosticModule::RdcDiagnosticModule::rdc_diag_destroy() { + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + (*ite)->rdc_diag_destroy(); + } + return RDC_ST_OK; +} + +RdcDiagnosticModule::RdcDiagnosticModule(const RdcSmiLibPtr& smi_lib, + const RdcRasLibPtr& ras_module) { + if (smi_lib) { + diagnostic_modules_.push_back(smi_lib); + } + + if (ras_module) { + diagnostic_modules_.push_back(ras_module); + } + + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES]; + uint32_t test_count = 0; + rdc_status_t status = (*ite)-> + rdc_diag_test_cases_query(test_cases, &test_count); + if (status == RDC_ST_OK) { + for (uint32_t index = 0; index < test_count; index++) { + testcases_to_module_.insert({test_cases[index], (*ite)}); + } + } + } +} + + +} // namespace rdc +} // namespace amd + diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 40b390e597..6843699075 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ 
b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -381,6 +381,22 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, return watch_table_->rdc_field_unwatch(group_id, field_group_id); } +// Diagnostic API +rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + + // Get GPU group information + rdc_group_info_t rdc_group_info; + rdc_status_t status = rdc_group_gpu_get_info( + group_id, &rdc_group_info); + if (status != RDC_ST_OK) return status; + + auto diag = rdc_module_mgr_->get_diagnostic_module(); + return diag->rdc_diagnostic_run(rdc_group_info, level, response); +} + // Control API rdc_status_t RdcEmbeddedHandler::rdc_field_update_all( uint32_t wait_for_update) { diff --git a/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc index 2dc0368935..1c7330d5fc 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc @@ -21,13 +21,15 @@ THE SOFTWARE. */ #include "rdc_lib/impl/RdcModuleMgrImpl.h" #include "rdc_lib/impl/RdcTelemetryModule.h" +#include "rdc_lib/impl/RdcDiagnosticModule.h" #include "rdc_lib/impl/RdcRasLib.h" namespace amd { namespace rdc { RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher): - fetcher_(fetcher) { + smi_lib_(std::make_shared(fetcher)) { + // The smi_lib_ always need to be loaded. 
} @@ -42,12 +44,30 @@ RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() { } if (!rdc_telemetry_module_) { - rdc_telemetry_module_.reset(new RdcTelemetryModule(fetcher_, ras_lib_)); + rdc_telemetry_module_.reset(new RdcTelemetryModule(smi_lib_, ras_lib_)); } return rdc_telemetry_module_; } +RdcDiagnosticPtr RdcModuleMgrImpl::get_diagnostic_module() { + if (rdc_diagnostic_module_) { + return rdc_diagnostic_module_; + } + + // Delay load + if (!ras_lib_) { + ras_lib_.reset(new RdcRasLib("librdc_ras.so")); + } + + if (!rdc_diagnostic_module_) { + rdc_diagnostic_module_.reset( + new RdcDiagnosticModule(smi_lib_, ras_lib_)); + } + + return rdc_diagnostic_module_; +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc index ab44ce2bcf..cca4e2e063 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc @@ -149,6 +149,46 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, return status; } + +rdc_status_t RdcRasLib::rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + (void)test_cases; + (void)test_case_count; + return RDC_ST_NOT_SUPPORTED; +} + + // Run a specific test case +rdc_status_t RdcRasLib::rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + (void)test_case; + (void)gpu_index; + (void)result; + (void)gpu_count; + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcRasLib::rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + (void)gpus; + (void)level; + (void)response; + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcRasLib::rdc_diag_init(uint64_t flags) { + (void)flags; + return RDC_ST_NOT_SUPPORTED; +} +rdc_status_t RdcRasLib::rdc_diag_destroy() { + return 
RDC_ST_NOT_SUPPORTED; +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 01e4d5b444..3b1b84eb69 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -24,6 +24,8 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcSmiLib.h" + + namespace amd { namespace rdc { @@ -161,6 +163,43 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query( return RDC_ST_OK; } +rdc_status_t RdcSmiLib::rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + return RDC_ST_NOT_SUPPORTED; +} + +// Run a specific test case +rdc_status_t RdcSmiLib::rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcSmiLib::rdc_diagnostic_run( + const rdc_group_info_t&, + rdc_diag_level_t, + rdc_diag_response_t*) { + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcSmiLib::rdc_diag_init(uint64_t) { + return RDC_ST_OK; +} + +rdc_status_t RdcSmiLib::rdc_diag_destroy() { + return RDC_ST_OK; +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc b/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc index 0f6aa7469b..f0291aaaa6 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcTelemetryModule.cc @@ -93,10 +93,9 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch( } RdcTelemetryModule::RdcTelemetryModule( - const RdcMetricFetcherPtr& fetcher, + const RdcSmiLibPtr& smi_lib, const RdcRasLibPtr& ras_module) { - auto smi_telemetry_module = std::make_shared(fetcher); - 
telemetry_modules_.push_back(smi_telemetry_module); + telemetry_modules_.push_back(smi_lib); if (ras_module) { telemetry_modules_.push_back(ras_module); } diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 0938fd1b6e..2a5d132d52 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -554,6 +554,17 @@ rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, return error_handle(status, reply.status()); } +// Diagnostic API +rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + (void)group_id; + (void)level; + (void)response; + return RDC_ST_NOT_SUPPORTED; +} + // Control RdcAPI rdc_status_t RdcStandaloneHandler::rdc_field_update_all(