diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index fa8dc2c740..a3576b6aac 100755 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -86,6 +86,16 @@ add_executable(${FIELDVALUE_EXAMPLE_EXE} "${FIELDVALUE_EXAMPLE_SRC_LIST}") target_link_libraries(${FIELDVALUE_EXAMPLE_EXE} pthread dl rdc_bootstrap) +set(DIAGNOSTIC_EXAMPLE_SRC_LIST "${SRC_DIR}/diagnostic_example.cc") +message("DIAGNOSTIC_EXAMPLE_SRC_LIST=${DIAGNOSTIC_EXAMPLE_SRC_LIST}") +set(DIAGNOSTIC_EXAMPLE_EXE "diagnostic") + +link_directories(${LIB_BOOSTRAP_DIR}) + +add_executable(${DIAGNOSTIC_EXAMPLE_EXE} "${DIAGNOSTIC_EXAMPLE_SRC_LIST}") + +target_link_libraries(${DIAGNOSTIC_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") diff --git a/example/diagnostic_example.cc b/example/diagnostic_example.cc new file mode 100644 index 0000000000..eb47f008e1 --- /dev/null +++ b/example/diagnostic_example.cc @@ -0,0 +1,157 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include +#include "rdc/rdc.h" + +static std::string get_test_name(rdc_diag_test_cases_t test_case) { + const std::map test_desc = { + {RDC_DIAG_COMPUTE_PROCESS, "No compute process"}, + {RDC_DIAG_SDMA_QUEUE, "SDMA Queue ready"}, + {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, + {RDC_DIAG_VRAM_CHECK, "VRAM check"}, + {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, + {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, + {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, + {RDC_DIAG_TEST_LAST, "Unknown"} + }; + + auto test_name = test_desc.find(test_case); + if (test_name == test_desc.end()) { + return "Unknown Test"; + } + return test_name->second; +} + +int main(int, char **) { + rdc_status_t result; + rdc_handle_t rdc_handle; + bool standalone = false; + char hostIpAddress[] = {"127.0.0.1:50051"}; + char group_name[] = {"diag_group"}; + + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone? + "Standalone mode selected.\n":"Embedded mode selected.\n"); + + // Init the rdc + result = rdc_init(0); + + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. 
Return: " << + rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle, + nullptr, nullptr, nullptr); + if ( result != RDC_ST_OK ) { + std::cout << "Error connecting to remote rdcd. Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); + if (result != RDC_ST_OK) { + std::cout << "Error starting embedded RDC engine. Return: " + << rdc_status_string(result) << std::endl; + goto cleanup; + } + } + + // Now we can use the same API for both standalone and embedded + // (1) create group for all GPUs + rdc_gpu_group_t group_id; + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_DEFAULT, + group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " + << rdc_status_string(result); + goto cleanup; + } + + // (2) start to run short diagnostic. + rdc_diag_response_t response; + result = rdc_diagnostic_run(rdc_handle, group_id, + RDC_DIAG_LVL_SHORT, &response); + + if (result != RDC_ST_OK) { + std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. 
Return: " + << rdc_status_string(result); + goto cleanup; + } + + // (3) Check diagnostic results + for (uint32_t i=0 ; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = + response.diag_info[i]; + std::cout << std::setw(22) << std::left + << get_test_name(test_result.test_case) + ":" + << rdc_diagnostic_result_string(test_result.status) << "\n"; + } + + // (4) diagnostic detail information + std::cout <<" =============== Diagnostic Details ==================\n"; + for (uint32_t i=0 ; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = + response.diag_info[i]; + if (test_result.info[0] != '\0') { + std::cout << std::setw(22) << std::left + << get_test_name(test_result.test_case) + ":" + << test_result.info << "\n"; + } + for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) { + const rdc_diag_per_gpu_result_t& gpu_result + = test_result.gpu_results[j]; + if (strlen(gpu_result.gpu_result.msg) > 0) { + std::cout << " GPU " << gpu_result.gpu_index + << " " << gpu_result.gpu_result.msg << "\n"; + } + } + } + // Cleanup consists of shutting down RDC. 
+ cleanup: + std::cout << "Cleaning up.\n"; + if (standalone) + rdc_disconnect(rdc_handle); + else + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; +} diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index e25148abb6..4ef192ecd4 100755 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -394,6 +394,91 @@ typedef struct { uint64_t stop_time; //!< job stop time } rdc_job_group_info_t; +/** + * @brief type of diagnostic level + */ +typedef enum { + RDC_DIAG_LVL_INVALID = 0, //!< invalid level + RDC_DIAG_LVL_SHORT, //!< take a few seconds to run + RDC_DIAG_LVL_MED, //!< take less than 2 minutes to run + RDC_DIAG_LVL_LONG //!< take up to 15 minutes to run +} rdc_diag_level_t; + +/** + * @brief type of diagnostic result + */ +typedef enum { + RDC_DIAG_RESULT_PASS, //!< The diagnostic test pass + RDC_DIAG_RESULT_SKIP, //!< The diagnostic test skipped + RDC_DIAG_RESULT_WARN, //!< The diagnostic test has warnings + RDC_DIAG_RESULT_FAIL //!< The diagnostic test fail +} rdc_diag_result_t; + +/** + * @brief The test cases to run + */ +typedef enum { + RDC_DIAG_TEST_FIRST = 0, + //!< The diagnostic test pass + RDC_DIAG_COMPUTE_PROCESS = RDC_DIAG_TEST_FIRST, + RDC_DIAG_SDMA_QUEUE, //!< The SDMA Queue is ready + RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready + RDC_DIAG_VRAM_CHECK, //!< Check VRAM + RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory + RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology + RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range + RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS +} rdc_diag_test_cases_t; + +/** + * @brief The maximum test cases to run + */ +const uint32_t MAX_TEST_CASES = RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1; + +/** + * @brief The maximum length of the diagnostic messages + */ +#define MAX_DIAG_MSG_LENGTH 4096 + +/** + * @brief details of the diagnostic errors + */ +typedef struct { + char msg[MAX_DIAG_MSG_LENGTH]; //!< The test result details + uint32_t code; //!< The low level error code +} 
rdc_diag_detail_t; + +/** + * @brief details of the per gpu diagnostic results + */ +typedef struct { + uint32_t gpu_index; //!< The GPU index + rdc_diag_detail_t gpu_result; //!< The detail results +} rdc_diag_per_gpu_result_t; + +/** + * @brief The diagnostic results for all GPUs + */ +typedef struct { + rdc_diag_result_t status; //!< The diagnostic result + rdc_diag_detail_t details; //!< The summary details + rdc_diag_test_cases_t test_case; //!< The test case to run + + uint32_t per_gpu_result_count; //!< How many gpu_results + //!< Result details + rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES]; + + char info[MAX_DIAG_MSG_LENGTH]; //!< Detail information +} rdc_diag_test_result_t; + +/** + * @brief The diagnostic responses for test cases + */ +typedef struct { + uint32_t results_count; + rdc_diag_test_result_t diag_info[MAX_TEST_CASES]; +} rdc_diag_response_t; + /** * @brief Initialize ROCm RDC. * @@ -872,6 +957,29 @@ rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id); +/** + * @brief Run the diagnostic test cases + * + * @details Run the diagnostic test cases at different levels. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[in] level The level decides how long the test will run. + * The RDC_DIAG_LVL_SHORT only takes a few seconds, and + * the RDC_DIAG_LVL_LONG may take up to 15 minutes. + * + * @param[inout] response The detail results of the tests run. + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_diagnostic_run( + rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response); + /** * @brief Get a description of a provided RDC error status * @@ -905,6 +1013,17 @@ const char* field_id_string(rdc_field_t field_id); */ rdc_field_t get_field_id_from_name(const char* name); +/** + * @brief Get a description of a diagnostic result. + * + * @details return the string in human readable format. + * + * @param[in] result The RDC diagnostic result. + * + * @retval The string to describe the RDC diagnostic result. + */ +const char* rdc_diagnostic_result_string(rdc_diag_result_t result); + #ifdef __cplusplus } #endif // __cplusplus diff --git a/include/rdc_lib/RdcDiagnostic.h b/include/rdc_lib/RdcDiagnostic.h new file mode 100644 index 0000000000..0f1fef4154 --- /dev/null +++ b/include/rdc_lib/RdcDiagnostic.h @@ -0,0 +1,63 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ +#define INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ + +#include +#include "rdc/rdc.h" +#include "rdc_lib/RdcDiagnosticLibInterface.h" + +namespace amd { +namespace rdc { + +class RdcDiagnostic { + public: + // get support test cases + virtual rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) = 0; + + // Run a specific test case + virtual rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) = 0; + + // Run multiple test cases + virtual rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) = 0; + + virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0; + virtual rdc_status_t rdc_diag_destroy() = 0; + + virtual ~RdcDiagnostic() {} +}; +typedef std::shared_ptr RdcDiagnosticPtr; + +} // namespace rdc +} // namespace amd + + +#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_ diff --git a/include/rdc_lib/RdcDiagnosticLibInterface.h b/include/rdc_lib/RdcDiagnosticLibInterface.h new file mode 100644 index 0000000000..d833d36648 --- /dev/null +++ b/include/rdc_lib/RdcDiagnosticLibInterface.h @@ -0,0 +1,52 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ +#define INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ + +// The diagnostic interface for libraries, for example, RAS. 
+#include + + +extern "C" { + +// The library will implement below function + +// Which test cases are supported in the library +rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count); + +// Run a specific test case +rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result); + +rdc_status_t rdc_diag_init(uint64_t flags); + +rdc_status_t rdc_diag_destroy(); + +} + + +#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_ diff --git a/include/rdc_lib/RdcHandler.h b/include/rdc_lib/RdcHandler.h index 08a570b7f1..a51eb90add 100644 --- a/include/rdc_lib/RdcHandler.h +++ b/include/rdc_lib/RdcHandler.h @@ -80,6 +80,12 @@ class RdcHandler { virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) = 0; + // Diagnostic API + virtual rdc_status_t rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) = 0; + // Control API virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0; diff --git a/include/rdc_lib/RdcModuleMgr.h b/include/rdc_lib/RdcModuleMgr.h index ac71057fde..cfb577cf1f 100644 --- a/include/rdc_lib/RdcModuleMgr.h +++ b/include/rdc_lib/RdcModuleMgr.h @@ -26,6 +26,7 @@ THE SOFTWARE. 
#include "rdc_lib/rdc_common.h" #include "rdc/rdc.h" #include "rdc_lib/RdcTelemetry.h" +#include "rdc_lib/RdcDiagnostic.h" namespace amd { namespace rdc { @@ -33,6 +34,7 @@ namespace rdc { class RdcModuleMgr { public: virtual RdcTelemetryPtr get_telemetry_module() = 0; + virtual RdcDiagnosticPtr get_diagnostic_module() = 0; }; typedef std::shared_ptr RdcModuleMgrPtr; diff --git a/include/rdc_lib/impl/RdcDiagnosticModule.h b/include/rdc_lib/impl/RdcDiagnosticModule.h new file mode 100644 index 0000000000..9ad1debb14 --- /dev/null +++ b/include/rdc_lib/impl/RdcDiagnosticModule.h @@ -0,0 +1,78 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ +#define INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ + +#include +#include +#include +#include +#include "rdc_lib/RdcDiagnostic.h" +#include "rdc_lib/impl/RdcRasLib.h" +#include "rdc_lib/impl/RdcSmiLib.h" + +namespace amd { +namespace rdc { + +class RdcDiagnosticModule : public RdcDiagnostic { + public: + rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; + + // Run a specific test case + rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) override; + + rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; + + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; + + explicit RdcDiagnosticModule(const RdcSmiLibPtr& smi_lib, + const RdcRasLibPtr& ras_module); + + private: + //< Helper function to dispatch fields to module + void get_fields_for_module( + rdc_gpu_field_t* fields, + uint32_t fields_count, + std::map> + & fields_in_module, + std::vector& unsupport_fields); // NOLINT + std::list diagnostic_modules_; + std::map testcases_to_module_; +}; + +typedef std::shared_ptr RdcDiagnosticModulePtr; + +} // namespace rdc +} // namespace amd + + +#endif // INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_ diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index 8fa54e4723..12113aad20 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -86,6 +86,11 @@ class RdcEmbeddedHandler: public RdcHandler { uint64_t *next_since_time_stamp, rdc_field_value* value) override; rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; + // Diagnostic API + rdc_status_t rdc_diagnostic_run( + 
rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; // Control API rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; diff --git a/include/rdc_lib/impl/RdcModuleMgrImpl.h b/include/rdc_lib/impl/RdcModuleMgrImpl.h index e1467e5e0d..e7bffaad24 100644 --- a/include/rdc_lib/impl/RdcModuleMgrImpl.h +++ b/include/rdc_lib/impl/RdcModuleMgrImpl.h @@ -27,6 +27,7 @@ THE SOFTWARE. #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" +#include "rdc_lib/impl/RdcSmiLib.h" namespace amd { namespace rdc { @@ -34,14 +35,16 @@ namespace rdc { class RdcModuleMgrImpl: public RdcModuleMgr { public: RdcTelemetryPtr get_telemetry_module() override; + RdcDiagnosticPtr get_diagnostic_module() override; explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher); private: // Function module RdcTelemetryPtr rdc_telemetry_module_; + RdcDiagnosticPtr rdc_diagnostic_module_; // Domain module RdcRasLibPtr ras_lib_; - + RdcSmiLibPtr smi_lib_; RdcMetricFetcherPtr fetcher_; }; diff --git a/include/rdc_lib/impl/RdcRasLib.h b/include/rdc_lib/impl/RdcRasLib.h index ae8255e8e7..4864b0bcf1 100644 --- a/include/rdc_lib/impl/RdcRasLib.h +++ b/include/rdc_lib/impl/RdcRasLib.h @@ -30,11 +30,12 @@ THE SOFTWARE. 
#include #include "rdc_lib/RdcLibraryLoader.h" #include "rdc_lib/RdcTelemetry.h" +#include "rdc_lib/RdcDiagnostic.h" namespace amd { namespace rdc { -class RdcRasLib: public RdcTelemetry { +class RdcRasLib: public RdcTelemetry, public RdcDiagnostic { public: // get support field ids rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], @@ -51,6 +52,25 @@ class RdcRasLib: public RdcTelemetry { rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) override; + rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; + + // Run a specific test case + rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) override; + + rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; + + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; + explicit RdcRasLib(const char* lib_name); ~RdcRasLib(); diff --git a/include/rdc_lib/impl/RdcSmiLib.h b/include/rdc_lib/impl/RdcSmiLib.h index 5f311b6a2b..bed6d7f7e4 100644 --- a/include/rdc_lib/impl/RdcSmiLib.h +++ b/include/rdc_lib/impl/RdcSmiLib.h @@ -26,11 +26,12 @@ THE SOFTWARE. 
#include #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetry.h" +#include "rdc_lib/RdcDiagnostic.h" namespace amd { namespace rdc { -class RdcSmiLib : public RdcTelemetry { +class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic { public: // get support field ids rdc_status_t rdc_telemetry_fields_query( @@ -46,12 +47,33 @@ class RdcSmiLib : public RdcTelemetry { rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) override; + rdc_status_t rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; + + // Run a specific test case + rdc_status_t rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) override; + + rdc_status_t rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; + + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; + explicit RdcSmiLib(const RdcMetricFetcherPtr& mf); private: RdcMetricFetcherPtr metric_fetcher_; }; +typedef std::shared_ptr RdcSmiLibPtr; + } // namespace rdc } // namespace amd diff --git a/include/rdc_lib/impl/RdcStandaloneHandler.h b/include/rdc_lib/impl/RdcStandaloneHandler.h index 439bbbe128..009775d35a 100644 --- a/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -80,6 +80,11 @@ class RdcStandaloneHandler: public RdcHandler { uint64_t *next_since_time_stamp, rdc_field_value* value) override; rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; + // Diagnostic API + rdc_status_t rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) override; // Control RdcAPI rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; diff --git 
a/include/rdc_lib/impl/RdcTelemetryModule.h b/include/rdc_lib/impl/RdcTelemetryModule.h index 62c588a374..c9255d6043 100644 --- a/include/rdc_lib/impl/RdcTelemetryModule.h +++ b/include/rdc_lib/impl/RdcTelemetryModule.h @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/impl/RdcRasLib.h" +#include "rdc_lib/impl/RdcSmiLib.h" #include "rdc_lib/RdcMetricFetcher.h" namespace amd { @@ -48,7 +49,7 @@ class RdcTelemetryModule : public RdcTelemetry { rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count); - RdcTelemetryModule(const RdcMetricFetcherPtr& fetcher, + RdcTelemetryModule(const RdcSmiLibPtr& smi_lib, const RdcRasLibPtr& ras_module); private: diff --git a/rdc_libs/CMakeLists.txt b/rdc_libs/CMakeLists.txt index e4b244b1e7..6d27f93b01 100755 --- a/rdc_libs/CMakeLists.txt +++ b/rdc_libs/CMakeLists.txt @@ -135,6 +135,7 @@ add_library(${BOOTSTRAP_LIB} SHARED ${BOOTSTRAP_LIB_SRC_LIST} ${BOOTSTRAP_LIB_IN target_link_libraries(${BOOTSTRAP_LIB} pthread dl) target_include_directories(${BOOTSTRAP_LIB} PRIVATE "${RSMI_INC_DIR}" + "${ROCM_DIR}/include" "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}" @@ -159,6 +160,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcWatchTableImpl.c set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcRasLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcTelemetryModule.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcDiagnosticModule.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc") @@ -182,6 +184,9 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcMod set(RDC_LIB_INC_LIST 
${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcModuleMgr.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcTelemetry.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcTelemetryModule.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnostic.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnosticLibInterface.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcDiagnosticModule.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotification.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h") @@ -190,7 +195,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") -link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64") +link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64" "${ROCM_DIR}/lib") add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST}) target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap) target_include_directories(${RDC_LIB} PRIVATE diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index 624c1ff9a9..db1600bc95 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -322,6 +322,18 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, rdc_group_field_destroy(rdc_field_group_id); } +rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_diagnostic_run(group_id, level, response); +} + const char* 
rdc_status_string(rdc_status_t result) { switch (result) { case RDC_ST_OK: @@ -361,6 +373,21 @@ const char* rdc_status_string(rdc_status_t result) { } } +const char* rdc_diagnostic_result_string(rdc_diag_result_t result) { + switch (result) { + case RDC_DIAG_RESULT_PASS: + return "Pass"; + case RDC_DIAG_RESULT_SKIP: + return "Skip"; + case RDC_DIAG_RESULT_WARN: + return "Warn"; + case RDC_DIAG_RESULT_FAIL: + return "Fail"; + default: + return "Unknown"; + } +} + const char* field_id_string(rdc_field_t field_id) { amd::rdc::fld_id2name_map_t &field_id_to_descript = amd::rdc::get_field_id_description_from_id(); diff --git a/rdc_libs/rdc/src/RdcDiagnosticModule.cc b/rdc_libs/rdc/src/RdcDiagnosticModule.cc new file mode 100644 index 0000000000..39e65572c4 --- /dev/null +++ b/rdc_libs/rdc/src/RdcDiagnosticModule.cc @@ -0,0 +1,146 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include "rdc_lib/impl/RdcDiagnosticModule.h" +#include +#include +#include +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/impl/RdcSmiLib.h" +#include "rdc_lib/impl/RdcRasLib.h" + +namespace amd { +namespace rdc { + +rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_cases == nullptr || test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + auto ite = diagnostic_modules_.begin(); + *test_case_count = 0; + for (; ite != diagnostic_modules_.end(); ite++) { + uint32_t count = 0; + rdc_status_t status = (*ite)->rdc_diag_test_cases_query( + &(test_cases[*test_case_count]), &count); + if (status == RDC_ST_OK) { + *test_case_count += count; + } + } + return RDC_ST_OK; +} + +rdc_status_t RdcDiagnosticModule::rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + result->per_gpu_result_count = 0; // Init: callers iterate on this count + result->info[0] = '\0'; // Init: callers test info[0] before printing + auto ite = testcases_to_module_.find(test_case); + if (ite == testcases_to_module_.end()) { + result->status = RDC_DIAG_RESULT_SKIP; + strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH); + return RDC_ST_NOT_SUPPORTED; + } + return ite->second->rdc_test_case_run(test_case, + gpu_index, gpu_count, result); +} + +rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + if (response == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + std::vector rdc_runs; + if (level >= RDC_DIAG_LVL_SHORT) { // Short run and above + rdc_runs.push_back(RDC_DIAG_COMPUTE_PROCESS); + rdc_runs.push_back(RDC_DIAG_NODE_TOPOLOGY); + rdc_runs.push_back(RDC_DIAG_GPU_PARAMETERS); + 
rdc_runs.push_back(RDC_DIAG_COMPUTE_QUEUE); + rdc_runs.push_back(RDC_DIAG_SDMA_QUEUE); + rdc_runs.push_back(RDC_DIAG_VRAM_CHECK); + 
rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK); + } + + response->results_count = 0; + for (unsigned int i=0; i < rdc_runs.size(); i++) { + response->diag_info[i].test_case = rdc_runs[i]; + rdc_test_case_run(rdc_runs[i], + const_cast(gpus.entity_ids), + gpus.count, &(response->diag_info[i])); + response->results_count++; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcDiagnosticModule::rdc_diag_init(uint64_t flag) { + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + (*ite)->rdc_diag_init(flag); + } + return RDC_ST_OK; +} + +rdc_status_t RdcDiagnosticModule::rdc_diag_destroy() { + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + (*ite)->rdc_diag_destroy(); + } + return RDC_ST_OK; +} + +RdcDiagnosticModule::RdcDiagnosticModule(const RdcSmiLibPtr& smi_lib, + const RdcRasLibPtr& ras_module) { + if (smi_lib) { + diagnostic_modules_.push_back(smi_lib); + } + + if (ras_module) { + diagnostic_modules_.push_back(ras_module); + } + + auto ite = diagnostic_modules_.begin(); + for (; ite != diagnostic_modules_.end(); ite++) { + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES]; + uint32_t test_count = 0; + rdc_status_t status = (*ite)-> + rdc_diag_test_cases_query(test_cases, &test_count); + if (status == RDC_ST_OK) { + for (uint32_t index = 0; index < test_count; index++) { + testcases_to_module_.insert({test_cases[index], (*ite)}); + } + } + } +} + + +} // namespace rdc +} // namespace amd + diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 40b390e597..6843699075 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -381,6 +381,22 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, return watch_table_->rdc_field_unwatch(group_id, field_group_id); } +// Diagnostic API +rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run( + rdc_gpu_group_t group_id, + 
rdc_diag_level_t level, + rdc_diag_response_t* response) { + + // Get GPU group information + rdc_group_info_t rdc_group_info; + rdc_status_t status = rdc_group_gpu_get_info( + group_id, &rdc_group_info); + if (status != RDC_ST_OK) return status; + + auto diag = rdc_module_mgr_->get_diagnostic_module(); + return diag->rdc_diagnostic_run(rdc_group_info, level, response); +} + // Control API rdc_status_t RdcEmbeddedHandler::rdc_field_update_all( uint32_t wait_for_update) { diff --git a/rdc_libs/rdc/src/RdcModuleMgrImpl.cc b/rdc_libs/rdc/src/RdcModuleMgrImpl.cc index 2dc0368935..1c7330d5fc 100644 --- a/rdc_libs/rdc/src/RdcModuleMgrImpl.cc +++ b/rdc_libs/rdc/src/RdcModuleMgrImpl.cc @@ -21,13 +21,15 @@ THE SOFTWARE. */ #include "rdc_lib/impl/RdcModuleMgrImpl.h" #include "rdc_lib/impl/RdcTelemetryModule.h" +#include "rdc_lib/impl/RdcDiagnosticModule.h" #include "rdc_lib/impl/RdcRasLib.h" namespace amd { namespace rdc { RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher): - fetcher_(fetcher) { + smi_lib_(std::make_shared<RdcSmiLib>(fetcher)) { + // The smi_lib_ always needs to be loaded. 
} @@ -42,12 +44,30 @@ RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() { } if (!rdc_telemetry_module_) { - rdc_telemetry_module_.reset(new RdcTelemetryModule(fetcher_, ras_lib_)); + rdc_telemetry_module_.reset(new RdcTelemetryModule(smi_lib_, ras_lib_)); } return rdc_telemetry_module_; } +RdcDiagnosticPtr RdcModuleMgrImpl::get_diagnostic_module() { + if (rdc_diagnostic_module_) { + return rdc_diagnostic_module_; + } + + // Delay load + if (!ras_lib_) { + ras_lib_.reset(new RdcRasLib("librdc_ras.so")); + } + + if (!rdc_diagnostic_module_) { + rdc_diagnostic_module_.reset( + new RdcDiagnosticModule(smi_lib_, ras_lib_)); + } + + return rdc_diagnostic_module_; +} + } // namespace rdc } // namespace amd diff --git a/rdc_libs/rdc/src/RdcRasLib.cc b/rdc_libs/rdc/src/RdcRasLib.cc index ab44ce2bcf..cca4e2e063 100644 --- a/rdc_libs/rdc/src/RdcRasLib.cc +++ b/rdc_libs/rdc/src/RdcRasLib.cc @@ -149,6 +149,46 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, return status; } + +rdc_status_t RdcRasLib::rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + (void)test_cases; + (void)test_case_count; + return RDC_ST_NOT_SUPPORTED; +} + + // Run a specific test case +rdc_status_t RdcRasLib::rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + (void)test_case; + (void)gpu_index; + (void)result; + (void)gpu_count; + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcRasLib::rdc_diagnostic_run( + const rdc_group_info_t& gpus, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + (void)gpus; + (void)level; + (void)response; + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcRasLib::rdc_diag_init(uint64_t flags) { + (void)flags; + return RDC_ST_NOT_SUPPORTED; +} +rdc_status_t RdcRasLib::rdc_diag_destroy() { + return RDC_ST_NOT_SUPPORTED; +} + } // namespace rdc } // namespace amd 
diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc index 01e4d5b444..3b1b84eb69 100644 --- a/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/rdc_libs/rdc/src/RdcSmiLib.cc @@ -24,6 +24,8 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rdc_lib/impl/RdcSmiLib.h" + + namespace amd { namespace rdc { @@ -161,6 +163,43 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query( return RDC_ST_OK; } +rdc_status_t RdcSmiLib::rdc_diag_test_cases_query( + rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + return RDC_ST_NOT_SUPPORTED; +} + +// Run a specific test case +rdc_status_t RdcSmiLib::rdc_test_case_run( + rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcSmiLib::rdc_diagnostic_run( + const rdc_group_info_t&, + rdc_diag_level_t, + rdc_diag_response_t*) { + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcSmiLib::rdc_diag_init(uint64_t) { + return RDC_ST_OK; +} + +rdc_status_t RdcSmiLib::rdc_diag_destroy() { + return RDC_ST_OK; +} + } // namespace rdc } // namespace amd diff --git a/rdc_libs/rdc/src/RdcTelemetryModule.cc b/rdc_libs/rdc/src/RdcTelemetryModule.cc index 0f6aa7469b..f0291aaaa6 100644 --- a/rdc_libs/rdc/src/RdcTelemetryModule.cc +++ b/rdc_libs/rdc/src/RdcTelemetryModule.cc @@ -93,10 +93,9 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch( } RdcTelemetryModule::RdcTelemetryModule( - const RdcMetricFetcherPtr& fetcher, + const RdcSmiLibPtr& smi_lib, const RdcRasLibPtr& ras_module) { - auto smi_telemetry_module = std::make_shared<RdcSmiLib>(fetcher); - telemetry_modules_.push_back(smi_telemetry_module); + telemetry_modules_.push_back(smi_lib); if (ras_module) { telemetry_modules_.push_back(ras_module); } diff --git 
a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 0938fd1b6e..2a5d132d52 100644 --- a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -554,6 +554,17 @@ rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, return error_handle(status, reply.status()); } +// Diagnostic API +rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run( + rdc_gpu_group_t group_id, + rdc_diag_level_t level, + rdc_diag_response_t* response) { + (void)group_id; + (void)level; + (void)response; + return RDC_ST_NOT_SUPPORTED; +} + // Control RdcAPI rdc_status_t RdcStandaloneHandler::rdc_field_update_all(