From 38c60ff90b3ef7c2fffd4e7571df291d9cba2c75 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 7 Jun 2023 19:46:35 -0500 Subject: [PATCH] RVS: Finish initial RVS integration NOTE: RVS Build is disabled by default due to CI build issues. Change-Id: I1593f0fe22075a9f86f54afa3ac151e109f1f7bd Signed-off-by: Galantsev, Dmitrii [ROCm/rdc commit: eaa1862a80ab9c9969d639320e6b7fb92dd45b6e] --- projects/rdc/.editorconfig | 10 +- projects/rdc/CMakeLists.txt | 21 ++- projects/rdc/cmake_modules/Findrvs.cmake | 38 +++++ .../cmake_modules/rdc-backward-compat.cmake | 2 + projects/rdc/example/diagnostic_example.cc | 6 +- projects/rdc/include/rdc/rdc.h | 27 +++- projects/rdc/include/rdc_lib/RdcDiagnostic.h | 5 +- .../rdc_lib/RdcDiagnosticLibInterface.h | 1 + projects/rdc/include/rdc_lib/RdcHandler.h | 2 + .../rdc_lib/impl/RdcDiagnosticModule.h | 2 + .../include/rdc_lib/impl/RdcEmbeddedHandler.h | 2 + .../include/rdc_lib/impl/RdcModuleMgrImpl.h | 3 - projects/rdc/include/rdc_lib/impl/RdcRVSLib.h | 71 +++++++++ projects/rdc/include/rdc_lib/impl/RdcRasLib.h | 2 + .../rdc/include/rdc_lib/impl/RdcRocrLib.h | 4 +- projects/rdc/include/rdc_lib/impl/RdcSmiLib.h | 2 + .../rdc_lib/impl/RdcStandaloneHandler.h | 2 + .../rdc/include/rdc_modules/rdc_rvs/RvsBase.h | 38 +++++ projects/rdc/protos/rdc.proto | 16 +- projects/rdc/rdc_libs/CMakeLists.txt | 4 + .../rdc_libs/bootstrap/src/RdcBootStrap.cc | 10 +- projects/rdc/rdc_libs/rdc/CMakeLists.txt | 2 + .../rdc_libs/rdc/src/RdcDiagnosticModule.cc | 24 ++- .../rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 10 +- projects/rdc/rdc_libs/rdc/src/RdcRVSLib.cc | 138 ++++++++++++++++++ projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc | 8 +- projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc | 9 +- projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc | 7 +- .../rdc_client/src/RdcStandaloneHandler.cc | 8 +- .../rdc_modules/rdc_rocr/CMakeLists.txt | 1 - .../rdc_modules/rdc_rocr/RdcDiagnosticLib.cc | 3 +- .../rdc_modules/rdc_rvs/CMakeLists.txt | 64 ++++++++ 
.../rdc_modules/rdc_rvs/RdcDiagnosticLib.cc | 94 ++++++++++++ .../rdc_libs/rdc_modules/rdc_rvs/RvsBase.cc | 118 +++++++++++++++ projects/rdc/rdci/include/RdciSubSystem.h | 1 + projects/rdc/rdci/src/RdciDiagSubSystem.cc | 16 +- projects/rdc/server/src/rdc_api_service.cc | 14 +- 37 files changed, 729 insertions(+), 56 deletions(-) create mode 100644 projects/rdc/cmake_modules/Findrvs.cmake create mode 100644 projects/rdc/include/rdc_lib/impl/RdcRVSLib.h create mode 100644 projects/rdc/include/rdc_modules/rdc_rvs/RvsBase.h create mode 100644 projects/rdc/rdc_libs/rdc/src/RdcRVSLib.cc create mode 100644 projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt create mode 100644 projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc create mode 100644 projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RvsBase.cc diff --git a/projects/rdc/.editorconfig b/projects/rdc/.editorconfig index e1c63b8bb7..73278b1892 100644 --- a/projects/rdc/.editorconfig +++ b/projects/rdc/.editorconfig @@ -4,10 +4,6 @@ # top-most EditorConfig file root = true -# Unix-style newlines with a newline ending every file and no stray whitespaces -[*] -end_of_line = lf - # Matches multiple files with brace expansion notation # Set default charset [*.{c,cc,cpp,h,hh,hpp}] @@ -18,3 +14,9 @@ indent_size = 2 [*.py] indent_style = space indent_size = 4 + +[*.proto] +charset = utf-8 +indent_style = space +indent_size = 2 + diff --git a/projects/rdc/CMakeLists.txt b/projects/rdc/CMakeLists.txt index 97a3fc2c6e..d95aa1d188 100755 --- a/projects/rdc/CMakeLists.txt +++ b/projects/rdc/CMakeLists.txt @@ -26,6 +26,9 @@ cmake_minimum_required(VERSION 3.15) set(RDC "rdc" CACHE INTERNAL "") set(RDC_PACKAGE ${RDC} CACHE STRING "") +# Default libdir to "lib"; this stops GNUInstallDirs from guessing a value when it is unset: +set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory") + set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/" CACHE INTERNAL "Default module path.") # Include
common cmake modules include(utils) @@ -53,16 +56,23 @@ option(BUILD_ROCRTEST "Build targets for librdc_rocr.so" ON) # which requires the Rocm profiler. option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" OFF) +# When cmake -DBUILD_RVS=off (the default), it will not build librdc_rvs.so, +# which requires the ROCm Validation Suite (RVS). +option(BUILD_RVS "Build targets for librdc_rvs.so" OFF) + # When cmake -DBUILD_TESTS=off, it will not build RDC tests. option(BUILD_TESTS "Build test suite" OFF) # Enable shared libraries for gtest option(BUILD_SHARED_LIBS "Build shared library (.so) or not." ON) +# Enable address sanitizer +option(ADDRESS_SANITIZER "Enable address sanitizer" OFF) + # File reorganization enable/disable option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" OFF) -option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" ON) +option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" OFF) option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON) @@ -77,7 +87,8 @@ include(GNUInstallDirs) # ROCM_DIR should be passed in via command line; it will be used # in sub-projects.
Override with -DROCM_DIR= -set(ROCM_DIR "/opt/rocm" CACHE STRING "ROCm directory.") +set(ROCM_DIR "/opt/rocm" CACHE PATH "ROCm directory.") +set(ROCM_PATH "${ROCM_DIR}" CACHE PATH "ROCm directory.") set(COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/common") @@ -89,8 +100,7 @@ set(RDC_SHARE_INSTALL_PREFIX "share/${RDC}" CACHE INTERNAL "Tests and Example in set(CMAKE_INSTALL_PREFIX ${ROCM_DIR} CACHE STRING "Default installation directory.") # add package search paths -set(CMAKE_PREFIX_PATH - ${CMAKE_PREFIX_PATH} +list(APPEND CMAKE_PREFIX_PATH ${GRPC_ROOT} /usr/local ) @@ -213,7 +223,8 @@ if(BUILD_STANDALONE) --cpp_out=${PROTOB_OUT_DIR} ${file} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE PROTOB_RESULT - OUTPUT_VARIABLE PROTOB_OUT_VAR) + OUTPUT_VARIABLE PROTOB_OUT_VAR + COMMAND_ERROR_IS_FATAL ANY) message("protoc command returned: ${PROTOB_RESULT}") message("GRPC_PLUGIN=${GRPC_PLUGIN})") diff --git a/projects/rdc/cmake_modules/Findrvs.cmake b/projects/rdc/cmake_modules/Findrvs.cmake new file mode 100644 index 0000000000..79641def04 --- /dev/null +++ b/projects/rdc/cmake_modules/Findrvs.cmake @@ -0,0 +1,38 @@ +# This module provides the rvs::rvs imported target +# You can specify the ROCm directory by setting ROCM_DIR + +set(NAME rvs) + +if(NOT DEFINED ROCM_DIR) + set(ROCM_DIR "/opt/rocm") +endif() + +find_library( + ${NAME}_LIBRARY + NAMES ${NAME} ${NAME}64 + HINTS "${ROCM_DIR}" + REGISTRY_VIEW BOTH + PATH_SUFFIXES lib) + +if(NOT DEFINED (${NAME}_INCLUDE_DIR)) + find_path( + ${NAME}_INCLUDE_DIR + NAMES ${NAME}.h + HINTS "${ROCM_DIR}/include" + PATH_SUFFIXES ${NAME} ${NAME}/inc) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(${NAME} + FOUND_VAR ${NAME}_FOUND + REQUIRED_VARS + ${NAME}_LIBRARY + ${NAME}_INCLUDE_DIR) + +if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME}) + add_library(${NAME}::${NAME} UNKNOWN IMPORTED) + set_target_properties(${NAME}::${NAME} PROPERTIES + IMPORTED_LOCATION "${${NAME}_LIBRARY}" + INTERFACE_COMPILE_OPTIONS
"${PC_${NAME}_CFLAGS_OTHER}" + INTERFACE_INCLUDE_DIRECTORIES "${${NAME}_INCLUDE_DIR}") +endif() diff --git a/projects/rdc/cmake_modules/rdc-backward-compat.cmake b/projects/rdc/cmake_modules/rdc-backward-compat.cmake index 25ba07ca1f..ed3fb50484 100644 --- a/projects/rdc/cmake_modules/rdc-backward-compat.cmake +++ b/projects/rdc/cmake_modules/rdc-backward-compat.cmake @@ -60,11 +60,13 @@ function(create_library_symlink) set(LIB_RDC_ROCR "librdc_rocr.so") set(LIB_RDC_ROCP "librdc_rocp.so") set(LIB_RDC_RAS "librdc_ras.so") + set(LIB_RDC_RVS "librdc_rvs.so") set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so") set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" ) set(library_files "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" ) set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" ) set(library_files "${library_files}" "${LIB_RDC_RAS}") + set(library_files "${library_files}" "${LIB_RDC_RVS}") foreach(file_name ${library_files}) add_custom_target(link_${file_name} ALL diff --git a/projects/rdc/example/diagnostic_example.cc b/projects/rdc/example/diagnostic_example.cc index 34dd99c0df..e1d1c0c730 100644 --- a/projects/rdc/example/diagnostic_example.cc +++ b/projects/rdc/example/diagnostic_example.cc @@ -36,6 +36,7 @@ static std::string get_test_name(rdc_diag_test_cases_t test_case) { {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, + {RDC_DIAG_RVS_TEST, "RVS check"}, {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, {RDC_DIAG_TEST_LAST, "Unknown"}}; @@ -102,7 +103,7 @@ int main(int, char**) { // (2) start to run short diagnostic. 
rdc_diag_response_t response; - result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, &response); + result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, nullptr, 0, &response); if (result != RDC_ST_OK) { std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. Return: " << rdc_status_string(result); @@ -135,7 +136,8 @@ int main(int, char**) { // (5) run one test case std::cout << " ============== Run individual diagnostic test ===========\n"; rdc_diag_test_result_t test_result; - result = rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, &test_result); + result = + rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, nullptr, 0, &test_result); if (result != RDC_ST_OK) { std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: " diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index fdb941946c..c94c3f9633 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -28,10 +28,20 @@ extern "C" { #endif // __cplusplus #ifdef __cplusplus + +// cstddef include causes issues on older GCC +// use stddef.h instead +#if __GNUC__ < 9 +#include +#else +#include +#endif // __GNUC__ + #include #else +#include #include -#endif +#endif // __cplusplus /** \file rdc.h * Main header file for the ROCm RDC library. @@ -434,6 +444,7 @@ typedef enum { RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology + RDC_DIAG_RVS_TEST, //!< TODO: Replace with real RVS tests RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS } rdc_diag_test_cases_t; @@ -972,12 +983,17 @@ rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_ * The RDC_DIAG_LVL_SHORT only take a few seconds, and the * the RDC_DIAG_LVL_LONG may take up to 15 minutes. * + * @param[in] config Implementation specific configuration. 
+ * + * @param[in] config_size Length of the configuration. + * * @param[inout] response The detail results of the tests run. * * @retval ::RDC_ST_OK is returned upon successful call. */ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, - rdc_diag_level_t level, rdc_diag_response_t* response); + rdc_diag_level_t level, const char* config, size_t config_size, + rdc_diag_response_t* response); /** * @brief Run one diagnostic test case @@ -990,12 +1006,17 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group * * @param[in] test_case The test case to run. * + * @param[in] config Implementation specific configuration. + * + * @param[in] config_size Length of the configuration. + * * @param[inout] result The results of the test. * * @retval ::RDC_ST_OK is returned upon successful call. */ rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result); + rdc_diag_test_cases_t test_case, const char* config, + size_t config_size, rdc_diag_test_result_t* result); /** * @brief Get a description of a provided RDC error status diff --git a/projects/rdc/include/rdc_lib/RdcDiagnostic.h b/projects/rdc/include/rdc_lib/RdcDiagnostic.h index d915334280..1ea94a5ae2 100644 --- a/projects/rdc/include/rdc_lib/RdcDiagnostic.h +++ b/projects/rdc/include/rdc_lib/RdcDiagnostic.h @@ -25,7 +25,6 @@ THE SOFTWARE. 
#include #include "rdc/rdc.h" -#include "rdc_lib/RdcDiagnosticLibInterface.h" namespace amd { namespace rdc { @@ -39,10 +38,12 @@ class RdcDiagnostic { // Run a specific test case virtual rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, rdc_diag_test_result_t* result) = 0; + uint32_t gpu_count, const char* config, size_t config_size, + rdc_diag_test_result_t* result) = 0; // Run multiple test cases virtual rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) = 0; virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0; diff --git a/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h b/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h index a9739c658f..9088cce673 100644 --- a/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h +++ b/projects/rdc/include/rdc_lib/RdcDiagnosticLibInterface.h @@ -37,6 +37,7 @@ rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const char* config, size_t config_size, rdc_diag_test_result_t* result); rdc_status_t rdc_diag_init(uint64_t flags); diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h index 0e76740327..01fa29eae5 100644 --- a/projects/rdc/include/rdc_lib/RdcHandler.h +++ b/projects/rdc/include/rdc_lib/RdcHandler.h @@ -77,9 +77,11 @@ class RdcHandler { // Diagnostic API virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) = 0; virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + const char* config, size_t config_size, rdc_diag_test_result_t* result) = 0; // Control API diff 
--git a/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h b/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h index dafef2cf0d..d89e7854f4 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h +++ b/projects/rdc/include/rdc_lib/impl/RdcDiagnosticModule.h @@ -42,9 +42,11 @@ class RdcDiagnosticModule : public RdcDiagnostic { // Run a specific test case rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const char* config, size_t config_size, rdc_diag_test_result_t* result) override; rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) override; rdc_status_t rdc_diag_init(uint64_t flags) override; diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index edbb6c1bc4..c7af1f73f6 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -81,8 +81,10 @@ class RdcEmbeddedHandler : public RdcHandler { rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; // Diagnostic API rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) override; rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + const char* config, size_t config_size, rdc_diag_test_result_t* result) override; // Control API diff --git a/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h b/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h index ed19f2e824..1237f5943a 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcModuleMgrImpl.h @@ -27,9 +27,6 @@ THE SOFTWARE. 
#include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcModuleMgr.h" #include "rdc_lib/RdcTelemetry.h" -#include "rdc_lib/impl/RdcRasLib.h" -#include "rdc_lib/impl/RdcRocrLib.h" -#include "rdc_lib/impl/RdcSmiLib.h" namespace amd { namespace rdc { diff --git a/projects/rdc/include/rdc_lib/impl/RdcRVSLib.h b/projects/rdc/include/rdc_lib/impl/RdcRVSLib.h new file mode 100644 index 0000000000..570bccd4dd --- /dev/null +++ b/projects/rdc/include/rdc_lib/impl/RdcRVSLib.h @@ -0,0 +1,71 @@ +/* +Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_ + +#include +#include + +#include "rdc/rdc.h" +#include "rdc_lib/RdcDiagnostic.h" +#include "rdc_lib/RdcLibraryLoader.h" + +namespace amd { +namespace rdc { + +class RdcRVSLib : public RdcDiagnostic { + public: + rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) override; + + // Run a specific test case + rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const char* config, size_t config_size, + rdc_diag_test_result_t* result) override; + + rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, + rdc_diag_response_t* response) override; + + rdc_status_t rdc_diag_init(uint64_t flags) override; + rdc_status_t rdc_diag_destroy() override; + + RdcRVSLib(); + + ~RdcRVSLib() override; + + private: + RdcLibraryLoader lib_loader_; + rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t, + const char* config, size_t config_size, rdc_diag_test_result_t*); + rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*); + rdc_status_t (*diag_init_)(uint64_t); + rdc_status_t (*diag_destroy_)(); +}; + +typedef std::shared_ptr RdcRVSLibPtr; + +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_ diff --git a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h index 9d38ab91d1..329e4ba0ec 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcRasLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRasLib.h @@ -56,9 +56,11 @@ class RdcRasLib : public RdcTelemetry, public RdcDiagnostic { // Run a specific test case rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const 
char* config, size_t config_size, rdc_diag_test_result_t* result) override; rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) override; rdc_status_t rdc_diag_init(uint64_t flags) override; diff --git a/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h b/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h index 423cd31e3e..b87d2528ad 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcRocrLib.h @@ -39,9 +39,11 @@ class RdcRocrLib : public RdcDiagnostic { // Run a specific test case rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const char* config, size_t config_size, rdc_diag_test_result_t* result) override; rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) override; rdc_status_t rdc_diag_init(uint64_t flags) override; @@ -54,7 +56,7 @@ class RdcRocrLib : public RdcDiagnostic { private: RdcLibraryLoader lib_loader_; rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t, - rdc_diag_test_result_t*); + const char*, size_t, rdc_diag_test_result_t*); rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*); rdc_status_t (*diag_init_)(uint64_t); rdc_status_t (*diag_destroy_)(); diff --git a/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h b/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h index 943ceda29e..dc43823bb0 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h +++ b/projects/rdc/include/rdc_lib/impl/RdcSmiLib.h @@ -53,9 +53,11 @@ class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic { // Run a specific test case rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const char* 
config, size_t config_size, rdc_diag_test_result_t* result) override; rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) override; rdc_status_t rdc_diag_init(uint64_t flags) override; diff --git a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h index 065c989025..1ba8793984 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -76,8 +76,10 @@ class RdcStandaloneHandler : public RdcHandler { rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; // Diagnostic API rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) override; rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + const char* config, size_t config_size, rdc_diag_test_result_t* result) override; // Control RdcAPI diff --git a/projects/rdc/include/rdc_modules/rdc_rvs/RvsBase.h b/projects/rdc/include/rdc_modules/rdc_rvs/RvsBase.h new file mode 100644 index 0000000000..5aa407af45 --- /dev/null +++ b/projects/rdc/include/rdc_modules/rdc_rvs/RvsBase.h @@ -0,0 +1,38 @@ +/* +Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDC_MODULES_RDC_RVS_RVSBASE_H_ +#define RDC_MODULES_RDC_RVS_RVSBASE_H_ + +#include + +#include "rvs/rvs.h" + +namespace amd { +namespace rdc { + +void session_callback(rvs_session_id_t session_id, const rvs_results_t* results); + +rvs_status_t run_rvs_app(const char* config, size_t config_size); +} // namespace rdc +} // namespace amd + +#endif // RDC_MODULES_RDC_RVS_RVSBASE_H_ diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index 0f4e8d1d33..6d2bbc5363 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -218,12 +218,16 @@ service RdcAPI { // rdc_status_t rdc_diagnostic_run( // rdc_gpu_group_t group_id, // rdc_diag_level_t level, + // const char* config, + // size_t config_size, // rdc_diag_response_t* response); rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse) {} // rdc_status_t rdc_test_case_run( // rdc_gpu_group_t group_id, // rdc_diag_test_cases_t test_case, + // const char* config, + // size_t config_size, // rdc_diag_test_result_t* result); rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (DiagnosticTestCaseRunResponse) {} } @@ -476,6 +480,8 @@ message RemoveAllJobResponse { message DiagnosticRunRequest { uint32 group_id = 1; uint32 level = 2; + string config = 3; + uint32 config_size = 4; } message DiagnosticDetail { @@ -498,7 +504,8 @@ message DiagnosticTestResult { VRAM_CHECK = 3; SYS_MEM_CHECK = 4; NODE_TOPOLOGY = 5; - GPU_PARAMETERS = 6; + RVS_TEST = 6; + GPU_PARAMETERS = 7; }; DiagnosticTestCase test_case = 3; uint32 per_gpu_result_count = 4; @@ -525,12 +532,15 @@ message DiagnosticTestCaseRunRequest { VRAM_CHECK = 3; SYS_MEM_CHECK = 4; NODE_TOPOLOGY = 5; - GPU_PARAMETERS = 6; + RVS_TEST = 6; + GPU_PARAMETERS = 7; }; TestCaseType test_case = 2; + string config = 3; + uint32 config_size = 4; } message DiagnosticTestCaseRunResponse { uint32 status = 1; DiagnosticTestResult result = 2; -} \ No newline at end of file +} diff --git 
a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index 753a34f14c..1fecca1ed4 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -60,6 +60,7 @@ set(BOOTSTRAP_LIB "rdc_bootstrap") set(RDC_LIB "rdc") set(RDC_ROCR_LIB "rdc_rocr") set(RDC_ROCP_LIB "rdc_rocp") +set(RDC_RVS_LIB "rdc_rvs") set(RDCCLIENT_LIB "rdc_client") ################# Determine the library version ######################### @@ -96,6 +97,9 @@ add_subdirectory(rdc_modules/rdc_rocr) # add librdc_rocp.so to RDC_LIB_MODULES add_subdirectory(rdc_modules/rdc_rocp) +# add librdc_rvs.so to RDC_LIB_MODULES +add_subdirectory(rdc_modules/rdc_rvs) + if(BUILD_STANDALONE) # add librdc_client.so add_subdirectory(rdc_client) diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index 1758e8b5d1..469455bce9 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -294,23 +294,25 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, } rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, - rdc_diag_level_t level, rdc_diag_response_t* response) { + rdc_diag_level_t level, const char* config, size_t config_size, + rdc_diag_response_t* response) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } return static_cast(p_rdc_handle) - ->rdc_diagnostic_run(group_id, level, response); + ->rdc_diagnostic_run(group_id, level, config, config_size, response); } rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, - rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result) { + rdc_diag_test_cases_t test_case, const char* config, + size_t config_size, rdc_diag_test_result_t* result) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } return static_cast(p_rdc_handle) - ->rdc_test_case_run(group_id, test_case, result); + 
->rdc_test_case_run(group_id, test_case, config, config_size, result); } const char* rdc_status_string(rdc_status_t result) { diff --git a/projects/rdc/rdc_libs/rdc/CMakeLists.txt b/projects/rdc/rdc_libs/rdc/CMakeLists.txt index e11b0bbb42..be5ff033e2 100644 --- a/projects/rdc/rdc_libs/rdc/CMakeLists.txt +++ b/projects/rdc/rdc_libs/rdc/CMakeLists.txt @@ -21,6 +21,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/RdcRasLib.cc" "${SRC_DIR}/RdcRocpLib.cc" "${SRC_DIR}/RdcRocrLib.cc" + "${SRC_DIR}/RdcRVSLib.cc" "${SRC_DIR}/RdcSmiDiagnosticImpl.cc" "${SRC_DIR}/RdcSmiLib.cc" "${SRC_DIR}/RdcTelemetryModule.cc" @@ -53,6 +54,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${INC_DIR}/impl/RdcRasLib.h" "${INC_DIR}/impl/RdcRocpLib.h" "${INC_DIR}/impl/RdcRocrLib.h" + "${INC_DIR}/impl/RdcRVSLib.h" "${INC_DIR}/impl/RdcSmiDiagnosticImpl.h" "${INC_DIR}/impl/RdcSmiLib.h" "${INC_DIR}/impl/RdcTelemetryModule.h" diff --git a/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc b/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc index c465c3abd9..2dba6c26c0 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcDiagnosticModule.cc @@ -28,6 +28,7 @@ THE SOFTWARE. 
#include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcMetricFetcher.h" +#include "rdc_lib/impl/RdcRVSLib.h" #include "rdc_lib/impl/RdcRasLib.h" #include "rdc_lib/impl/RdcRocrLib.h" #include "rdc_lib/impl/RdcSmiLib.h" @@ -55,7 +56,8 @@ rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query( rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, + uint32_t gpu_count, const char* config, + size_t config_size, rdc_diag_test_result_t* result) { if (result == nullptr) { return RDC_ST_BAD_PARAMETER; @@ -68,11 +70,13 @@ rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_c strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH); return RDC_ST_NOT_SUPPORTED; } - return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, result); + return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, config, config_size, + result); } rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpus, - rdc_diag_level_t level, + rdc_diag_level_t level, const char* config, + size_t config_size, rdc_diag_response_t* response) { if (response == nullptr) { return RDC_ST_BAD_PARAMETER; @@ -87,11 +91,15 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK); } + if (level >= RDC_DIAG_LVL_MED) { // Medium run and above + rdc_runs.push_back(RDC_DIAG_RVS_TEST); + } + response->results_count = 0; for (unsigned int i = 0; i < rdc_runs.size(); i++) { response->diag_info[i].test_case = rdc_runs[i]; - rdc_test_case_run(rdc_runs[i], const_cast(gpus.entity_ids), gpus.count, - &(response->diag_info[i])); + rdc_test_case_run(rdc_runs[i], const_cast(gpus.entity_ids), gpus.count, config, + config_size, &(response->diag_info[i])); response->results_count++; } @@ -116,8 +124,9 @@ rdc_status_t RdcDiagnosticModule::RdcDiagnosticModule::rdc_diag_destroy() { 
RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) { const RdcSmiLibPtr smi_module = std::make_shared(fetcher); - const RdcRasLibPtr ras_module = std::make_shared(); const RdcRocrLibPtr rocr_module = std::make_shared(); + const RdcRasLibPtr ras_module = std::make_shared(); + const RdcRVSLibPtr rvs_module = std::make_shared(); if (smi_module) { diagnostic_modules_.push_back(smi_module); } @@ -127,6 +136,9 @@ RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) { if (ras_module) { diagnostic_modules_.push_back(ras_module); } + if (rvs_module) { + diagnostic_modules_.push_back(rvs_module); + } auto ite = diagnostic_modules_.begin(); for (; ite != diagnostic_modules_.end(); ite++) { diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 0239ec1b04..5a27eba907 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -350,7 +350,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, // Diagnostic API rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, - rdc_diag_level_t level, + rdc_diag_level_t level, const char* config, + size_t config_size, rdc_diag_response_t* response) { if (!response) { return RDC_ST_BAD_PARAMETER; @@ -362,11 +363,12 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, if (status != RDC_ST_OK) return status; auto diag = rdc_module_mgr_->get_diagnostic_module(); - return diag->rdc_diagnostic_run(rdc_group_info, level, response); + return diag->rdc_diagnostic_run(rdc_group_info, level, config, config_size, response); } rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + const char* config, size_t config_size, rdc_diag_test_result_t* result) { if (!result) { return RDC_ST_BAD_PARAMETER; @@ -377,8 +379,8 @@ rdc_status_t 
RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id, if (status != RDC_ST_OK) return status; auto diag = rdc_module_mgr_->get_diagnostic_module(); - return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count, - result); + return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count, config, + config_size, result); } // Control API diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRVSLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRVSLib.cc new file mode 100644 index 0000000000..8de22db160 --- /dev/null +++ b/projects/rdc/rdc_libs/rdc/src/RdcRVSLib.cc @@ -0,0 +1,138 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include "rdc_lib/impl/RdcRVSLib.h" + +#include + +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/rdc_common.h" + +namespace amd { +namespace rdc { + +RdcRVSLib::RdcRVSLib() + : test_case_run_(nullptr), + diag_test_cases_query_(nullptr), + diag_init_(nullptr), + diag_destroy_(nullptr) { + rdc_status_t status = lib_loader_.load("librdc_rvs.so"); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "RVS related function will not work."); + return; + } + + status = lib_loader_.load_symbol(&diag_init_, "rdc_diag_init"); + if (status != RDC_ST_OK) { + diag_init_ = nullptr; + return; + } + + status = diag_init_(0); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Fail to init librdc_rvs.so:" << rdc_status_string(status) + << ". RVS related function will not work."); + return; + } + + status = lib_loader_.load_symbol(&diag_destroy_, "rdc_diag_destroy"); + if (status != RDC_ST_OK) { + diag_destroy_ = nullptr; + } + + status = lib_loader_.load_symbol(&test_case_run_, "rdc_diag_test_case_run"); + if (status != RDC_ST_OK) { + test_case_run_ = nullptr; + } + status = lib_loader_.load_symbol(&diag_test_cases_query_, "rdc_diag_test_cases_query"); + if (status != RDC_ST_OK) { + diag_test_cases_query_ = nullptr; + } +} + +RdcRVSLib::~RdcRVSLib() { + if (diag_destroy_) { + diag_destroy_(); + } +} + +rdc_status_t RdcRVSLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + if (test_case_count == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!diag_test_cases_query_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + + rdc_status_t status = diag_test_cases_query_(test_cases, test_case_count); + RDC_LOG(RDC_DEBUG, + "Query " << *test_case_count << " test cases from RVS: " << rdc_status_string(status)); + return status; +} + +// Run a specific test case +rdc_status_t RdcRVSLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, const char* config, + size_t 
config_size, rdc_diag_test_result_t* result) { + if (result == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + if (!test_case_run_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + + rdc_status_t status = + test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result); + RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from RVS: " << rdc_status_string(status) + << " config[" << config_size << "]: " << (config ? config : "(null)")); // config is optional; streaming a null char* is undefined + return status; +} + +rdc_status_t RdcRVSLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, + rdc_diag_response_t* response) { + (void)gpus; + (void)level; + (void)config; + (void)config_size; + (void)response; + return RDC_ST_NOT_SUPPORTED; +} + +rdc_status_t RdcRVSLib::rdc_diag_init(uint64_t flags) { + if (!diag_init_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + + return diag_init_(flags); +} +rdc_status_t RdcRVSLib::rdc_diag_destroy() { + if (!diag_destroy_) { + return RDC_ST_FAIL_LOAD_MODULE; + } + + return diag_destroy_(); +} + +} // namespace rdc +} // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc index 4e0b85a77e..5946fb213f 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRasLib.cc @@ -151,18 +151,24 @@ rdc_status_t RdcRasLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas // Run a specific test case rdc_status_t RdcRasLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, rdc_diag_test_result_t* result) { + uint32_t gpu_count, const char* config, + size_t config_size, rdc_diag_test_result_t* result) { (void)test_case; (void)gpu_index; (void)result; (void)gpu_count; + (void)config; + (void)config_size; return RDC_ST_NOT_SUPPORTED; } rdc_status_t RdcRasLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, 
rdc_diag_level_t level, + const char* config, size_t config_size, 
rdc_diag_response_t* response) { (void)gpus; (void)level; + (void)config; + (void)config_size; (void)response; return RDC_ST_NOT_SUPPORTED; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc index 0744b97a30..da4ac7a4ae 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcRocrLib.cc @@ -92,7 +92,8 @@ rdc_status_t RdcRocrLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_ca // Run a specific test case rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, rdc_diag_test_result_t* result) { + uint32_t gpu_count, const char* config, + size_t config_size, rdc_diag_test_result_t* result) { if (result == nullptr) { return RDC_ST_BAD_PARAMETER; } @@ -100,15 +101,19 @@ rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, return RDC_ST_FAIL_LOAD_MODULE; } - rdc_status_t status = test_case_run_(test_case, gpu_index, gpu_count, result); + rdc_status_t status = + test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result); RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from Rocr: " << rdc_status_string(status)); return status; } rdc_status_t RdcRocrLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level, + const char* config, size_t config_size, rdc_diag_response_t* response) { (void)gpus; (void)level; + (void)config; + (void)config_size; (void)response; return RDC_ST_NOT_SUPPORTED; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 24d16a6ac3..440f8c6cc9 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -188,7 +188,8 @@ rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas // Run a specific test case rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, uint32_t 
gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, rdc_diag_test_result_t* result) { + uint32_t gpu_count, const char* /*config*/, + size_t /*config_size*/, rdc_diag_test_result_t* result) { if (result == nullptr) { return RDC_ST_BAD_PARAMETER; } @@ -204,8 +205,8 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, } } -rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t, - rdc_diag_response_t*) { +rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t, const char*, + size_t, rdc_diag_response_t*) { return RDC_ST_NOT_SUPPORTED; } diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 99ced43026..2def05c41f 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -530,7 +530,8 @@ rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, // Diagnostic API rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, - rdc_diag_level_t level, + rdc_diag_level_t level, const char* config, + size_t config_size, rdc_diag_response_t* response) { if (!response) { return RDC_ST_BAD_PARAMETER; @@ -541,6 +542,8 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, request.set_group_id(group_id); request.set_level(level); + request.set_config(config ? config : ""); // config is optional; a null char* must not reach the string setter + request.set_config_size(config ? 
config_size : 0); ::grpc::Status status = stub_->DiagnosticRun(&context, request, &reply); rdc_status_t err_status = error_handle(status, reply.status()); @@ -583,6 +586,7 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, + const char* config, size_t config_size, rdc_diag_test_result_t* to_result) { if (!to_result) { return 
RDC_ST_BAD_PARAMETER; @@ -592,6 +596,8 @@ rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id, ::grpc::ClientContext context; request.set_group_id(group_id); + request.set_config(config ? config : ""); // config is optional; a null char* must not reach the string setter + request.set_config_size(config ? config_size : 0); request.set_test_case(static_cast<::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case)); ::grpc::Status status = stub_->DiagnosticTestCaseRun(&context, request, &reply); diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt index 6ba7101ac6..6cd1868565 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt @@ -26,7 +26,6 @@ set(RDC_ROCR_LIB_INC_LIST "${INC_DIR}/base_rocr_utils.h" "${INC_DIR}/common.h" "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h" - "${RDC_LIB_INC_DIR}/RdcDiagnosticLibInterface.h" "${RDC_LIB_INC_DIR}/RdcLogger.h" "${RDC_LIB_INC_DIR}/rdc_common.h") diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc index 8f03449e06..c2347a924a 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/RdcDiagnosticLib.cc @@ -151,6 +151,7 @@ static rdc_status_t run_compute_queue_test(uint32_t gpu_index, rdc_diag_test_res rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const char* /*config*/, size_t /*config_size*/, rdc_diag_test_result_t* result) { if (result == nullptr || gpu_count == 0) { return RDC_ST_BAD_PARAMETER; @@ -178,7 +179,7 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, break; default: result->status = RDC_DIAG_RESULT_SKIP; - strncpy_with_null(result->info, "Not support yet", MAX_DIAG_MSG_LENGTH); + 
strncpy_with_null(result->info, "Not supported yet", MAX_DIAG_MSG_LENGTH); } } diff --git 
a/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt new file mode 100644 index 0000000000..57b3b5259b --- /dev/null +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt @@ -0,0 +1,69 @@ +message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") +message(" Cmake RDC Lib-RVS ") +message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") + +set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") +set(INC_DIR "${PROJECT_SOURCE_DIR}/include/rdc_modules/rdc_rvs") + +set(RDC_RVS_LIB_COMPONENT "lib${RDC_RVS_LIB}") +set(RDC_RVS_LIB_SRC_LIST + "${BOOTSTRAP_LIB_SRC_DIR}/RdcLogger.cc" + "${SRC_DIR}/RvsBase.cc" + "${SRC_DIR}/RdcDiagnosticLib.cc" + ) +set(RDC_RVS_LIB_INC_LIST + "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h" + "${RDC_LIB_INC_DIR}/RdcDiagnostic.h" + "${RDC_LIB_INC_DIR}/rdc_common.h" + "${RDC_LIB_INC_DIR}/RdcLogger.h" + "${INC_DIR}/RvsBase.h" + ) + +if(BUILD_RVS) + message("Build librdc_rvs.so is enabled, make sure RVS is installed.") + + message("RDC_RVS_LIB_INC_LIST=${RDC_RVS_LIB_INC_LIST}") + + # needed to find extra packages + list(APPEND CMAKE_PREFIX_PATH ${ROCM_DIR}) + find_package(yaml-cpp REQUIRED) + find_package(hip REQUIRED) + find_package(hsa-runtime64 REQUIRED) + find_package(rvs REQUIRED + HINTS ${ROCM_DIR}/lib/cmake) + # find_library(... 
REQUIRED) needs CMake 3.18, but this project only requires 3.15, + # so report a missing rvslib explicitly instead of failing later at link time. + find_library(rvslib + NAMES rvslib) + if(NOT rvslib) + message(FATAL_ERROR "rvslib not found; install RVS or configure with BUILD_RVS=OFF") + endif() + + ## additional libraries + set(COMBINED_LIBS rocblas hsakmt hsa-runtime64 hip::amdhip64 yaml-cpp) + + set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_RVS_LIB} PARENT_SCOPE) + add_library(${RDC_RVS_LIB} SHARED ${RDC_RVS_LIB_SRC_LIST} ${RDC_RVS_LIB_INC_LIST}) + target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvslib} pthread dl ${COMBINED_LIBS}) + target_include_directories(${RDC_RVS_LIB} PRIVATE + "${PROJECT_SOURCE_DIR}" + "${PROJECT_SOURCE_DIR}/include" + "${COMMON_DIR}" + "${RSMI_INC_DIR}" + "${ROCM_DIR}/include" + "${ROCM_DIR}/include/hsa" + 
"${ROCM_VALIDATION_SUITE_INCLUDE_DIR}") + + # Set the VERSION and SOVERSION values + set_property(TARGET ${RDC_RVS_LIB} PROPERTY + SOVERSION "${VERSION_MAJOR}") + set_property(TARGET ${RDC_RVS_LIB} PROPERTY + VERSION "${SO_VERSION_STRING}") + + # If the library is a release, strip the target library + if("${CMAKE_BUILD_TYPE}" STREQUAL Release) + add_custom_command( + TARGET ${RDC_RVS_LIB} + POST_BUILD COMMAND ${CMAKE_STRIP} ${RDC_RVS_LIB_COMPONENT}.so) + endif() +endif() diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc new file mode 100644 index 0000000000..f4c6d1989c --- /dev/null +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RdcDiagnosticLib.cc @@ -0,0 +1,94 @@ +/* +Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +#include +#include + +#include "rdc/rdc.h" +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/rdc_common.h" +#include "rdc_modules/rdc_rvs/RvsBase.h" + +// TODO: Replace with client-side feedback channel +#define RVS_LOG() RDC_LOG(RDC_DEBUG, "!HELLO_FROM_RVS! " << __FILE__ << ":" << __LINE__) + +rdc_status_t rdc_diag_init(uint64_t) { + RVS_LOG(); + return RDC_ST_OK; +} + +rdc_status_t rdc_diag_destroy() { + RVS_LOG(); + return RDC_ST_OK; +} + +rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], + uint32_t* test_case_count) { + RVS_LOG(); + if (test_case_count == nullptr || test_cases == nullptr) { // both are written below + return RDC_ST_BAD_PARAMETER; + } + + *test_case_count = 1; + test_cases[0] = RDC_DIAG_RVS_TEST; + + return RDC_ST_OK; +} + +rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case, + // TODO: use gpu_index + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + const char* config, size_t config_size, + rdc_diag_test_result_t* result) { + rvs_status_t rvs_status = RVS_STATUS_SUCCESS; + RVS_LOG(); + if (result == nullptr || gpu_count == 0) { + return RDC_ST_BAD_PARAMETER; + } + + if (test_case != RDC_DIAG_RVS_TEST) { + return RDC_ST_BAD_PARAMETER; + } + + // init the return data + *result = {}; + result->test_case = test_case; + result->status = RDC_DIAG_RESULT_PASS; + result->per_gpu_result_count = 0; + + switch (test_case) { + case RDC_DIAG_RVS_TEST: + strncpy_with_null(result->info, "Finished running RDC_DIAG_RVS_TEST!", MAX_DIAG_MSG_LENGTH); + rvs_status = amd::rdc::run_rvs_app(config, config_size); + break; + default: + result->status = RDC_DIAG_RESULT_SKIP; + strncpy_with_null(result->info, "Not supported yet", MAX_DIAG_MSG_LENGTH); + } + + if (rvs_status != RVS_STATUS_SUCCESS) { + result->status = RDC_DIAG_RESULT_FAIL; + } + + return RDC_ST_OK; +} diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RvsBase.cc 
b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RvsBase.cc new file mode 100644 index 0000000000..1428a20123 
--- /dev/null +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/RvsBase.cc @@ -0,0 +1,137 @@ +/* +Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include "rdc_modules/rdc_rvs/RvsBase.h" + +#include + +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/rdc_common.h" +#include "rvs/rvs.h" + +// TODO: Make generic test +// TODO: Allow for user to override defaults with a custom string +static constexpr size_t MAX_CONFIG_LENGTH = 1024; + +volatile rvs_session_state_t state = RVS_SESSION_STATE_IDLE; + +// Run one RVS session and wait until the session callback reports it completed. +// config/config_size: optional custom RVS action config; when config is nullptr +// or config_size is 0 the built-in gst_config is used instead. +// Returns RVS_STATUS_INVALID_ARGUMENT when the config does not fit the local +// buffer, otherwise the status of the first failing RVS call or of the final +// rvs_session_destroy(). +rvs_status_t amd::rdc::run_rvs_app(const char* config, const size_t config_size) { + char active_config[MAX_CONFIG_LENGTH]; + rvs_session_property_t session_property = {RVS_SESSION_TYPE_DEFAULT_CONF, {{RVS_MODULE_GST}}}; + rvs_session_id_t session_id; + rvs_status_t status; + const char mem_config[MAX_CONFIG_LENGTH] = + "{actions: [{name: action_1, device: all, module: mem, parallel: true, " + "count: 1, wait: 100, mapped_memory: false, mem_blocks: 128, " + "num_passes: 500, thrds_per_blk: 64, stress: true, num_iter: 50000, " + "exclude: '5 6 7 8 9 10 11'}]}"; + const char gst_config[MAX_CONFIG_LENGTH] = + "{actions: [{name: gpustress-9000-sgemm-false, device: all, " + "device_index: '0', module: gst, parallel: false, count: 1, duration: " + "10000, copy_matrix: false, target_stress: 9000, matrix_size_a: 8640, " + "matrix_size_b: 8640, matrix_size_c: 8640, ops_type: sgemm, lda: 8640, " + "ldb: 8640, ldc: 8640}]}"; + + if ((config == nullptr) || (config_size == 0)) { + RDC_LOG(RDC_INFO, "given config is NULL! Using predefined gst_config"); + strncpy_with_null(active_config, gst_config, MAX_CONFIG_LENGTH); + } else if (config_size >= MAX_CONFIG_LENGTH) { + RDC_LOG(RDC_ERROR, "given config size is too large! 
Expected at most " + << (MAX_CONFIG_LENGTH - 1) << ", got " << config_size << " instead."); + return RVS_STATUS_INVALID_ARGUMENT; + } else { + RDC_LOG(RDC_DEBUG, "given config is correct"); + // NOTE(review): assumes strncpy_with_null() takes the destination capacity (terminator + // included), as the gst_config call above does; config_size alone would drop the last byte. + strncpy_with_null(active_config, config, config_size + 1); + } + + status = rvs_initialize(); + if (status == RVS_STATUS_FAILED) { + RDC_LOG(RDC_ERROR, "rvs initialization failed"); + return status; + } + + /*******************************/ + + state = RVS_SESSION_STATE_IDLE; + + /* Using custom gst configuration in string format */ + + status = rvs_session_create(&session_id, amd::rdc::session_callback); + if (status != RVS_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "RVS session create failed with status: " << status); + return status; + } + + session_property.type = RVS_SESSION_TYPE_CUSTOM_ACTION; + session_property.custom_action.config = active_config; + + status = rvs_session_set_property(session_id, &session_property); + if (status != RVS_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "RVS session set property failed with status: " << status); + rvs_session_destroy(session_id); + return status; + } + + status = rvs_session_execute(session_id); + + if (status != RVS_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "RVS session execute failed with status: " << status); + rvs_session_destroy(session_id); + return status; + } + + // NOTE(review): this spins on a volatile flag that session_callback() updates; volatile is + // not a synchronization primitive, consider std::atomic or a condition variable. + while (state != RVS_SESSION_STATE_COMPLETED) { + }; + + status = rvs_session_destroy(session_id); + if (status != RVS_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "RVS session destroy failed with status: " << status); + } + + return status; +} + +void amd::rdc::session_callback(rvs_session_id_t session_id, const rvs_results_t* 
results) { + // NOTE: This is a placeholder! 
+ // TODO: Use GRPC to send message back to client periodically + printf( + "/*******************************************************************/" + "\n"); + printf("session id -> %d state -> %d\n", session_id, results->state); + printf("session id -> %d status -> %d\n", session_id, results->status); + printf("session id -> %d output -> %s\n", session_id, results->output_log); + printf( + "/*******************************************************************/" + "\n"); + + state = results->state; + printf("state -> %d\n", state); +} diff --git a/projects/rdc/rdci/include/RdciSubSystem.h b/projects/rdc/rdci/include/RdciSubSystem.h index 0c296cd793..d562b628a4 100644 --- a/projects/rdc/rdci/include/RdciSubSystem.h +++ b/projects/rdc/rdci/include/RdciSubSystem.h @@ -51,6 +51,7 @@ class RdciSubSystem { std::string ip_port_; bool use_auth_; + std::string config_test_; std::string root_ca_; std::string client_cert_; std::string client_key_; diff --git a/projects/rdc/rdci/src/RdciDiagSubSystem.cc b/projects/rdc/rdci/src/RdciDiagSubSystem.cc index d3bbc38635..a926b2ac13 100644 --- a/projects/rdc/rdci/src/RdciDiagSubSystem.cc +++ b/projects/rdc/rdci/src/RdciDiagSubSystem.cc @@ -54,6 +54,7 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) { const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS}, {"help", optional_argument, nullptr, 'h'}, {"unauth", optional_argument, nullptr, 'u'}, + {"config-test", required_argument, nullptr, 'c'}, // matches the "c:" short option; the handler always reads optarg {"run-level", required_argument, nullptr, 'r'}, {"group-id", required_argument, nullptr, 'g'}, {nullptr, 0, nullptr, 0}}; @@ -62,7 +63,7 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) { int option_index = 0; int opt = 0; - while ((opt = getopt_long(argc, argv, "hug:r:", long_options, &option_index)) != -1) { + while ((opt = getopt_long(argc, argv, "hug:r:c:", long_options, &option_index)) != -1) { switch (opt) { 
case HOST_OPTIONS: ip_port_ = optarg; @@ -73,6 +74,11 @@ void 
RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) { case 'u': use_auth_ = false; break; + case 'c': + config_test_ = optarg ? optarg : ""; // optarg is null when an optional long-option argument is omitted + printf("config_test_ = %s\n", config_test_.c_str()); + printf("config_test_.length = %zu\n", config_test_.length()); + break; case 'g': if (!IsNumber(optarg)) { show_help(); @@ -93,7 +99,6 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) { throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); } } - if (!group_id_set) { show_help(); throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id"); @@ -108,12 +113,13 @@ void RdciDiagSubSystem::show_help() const { // ***** 20 Chars **** std::cout << " diag -- Used to run diagnostic for GPUs.\n\n"; std::cout << "Usage\n"; - std::cout << " rdci diag [--host :port] [-u] -g " + std::cout << " rdci diag [--host :port] [-u] [-c config] -g " << " -r \n"; std::cout << "\nFlags:\n"; show_common_usage(); std::cout << " -g --group-id The GPU group to diagnose" << " on the specified host.\n"; + std::cout << " -c --config-test Set custom test config (RVS)\n"; std::cout << " -r --run-level level Integer representing test" << " run levels [default = 1].\n" << " level 1: Tests take a " @@ -130,6 +136,7 @@ std::string RdciDiagSubSystem::get_test_name(rdc_diag_test_cases_t test_case) co {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, + {RDC_DIAG_RVS_TEST, "Pre-defined config RVS check"}, {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, {RDC_DIAG_TEST_LAST, "Unknown"}}; @@ -148,7 +155,8 @@ void RdciDiagSubSystem::process() { rdc_status_t result; rdc_diag_response_t response; - result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, &response); + result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, config_test_.c_str(), + config_test_.length(), &response); if (result != 
RDC_ST_OK) { std::string error_msg = rdc_status_string(result); diff --git 
a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc index 83528d1df1..8ee687b237 100644 --- a/projects/rdc/server/src/rdc_api_service.cc +++ b/projects/rdc/server/src/rdc_api_service.cc @@ -556,9 +556,10 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, } rdc_diag_response_t diag_response; - rdc_status_t result = - rdc_diagnostic_run(rdc_handle_, request->group_id(), - static_cast(request->level()), &diag_response); + rdc_status_t result = rdc_diagnostic_run( + rdc_handle_, request->group_id(), static_cast(request->level()), + const_cast(request->config().c_str()), static_cast(request->config().length()), + &diag_response); reply->set_status(result); if (result != RDC_ST_OK) { @@ -607,9 +608,10 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, } rdc_diag_test_result_t test_result; - rdc_status_t result = - rdc_test_case_run(rdc_handle_, request->group_id(), - static_cast(request->test_case()), &test_result); + rdc_status_t result = rdc_test_case_run( + rdc_handle_, request->group_id(), static_cast(request->test_case()), + const_cast(request->config().c_str()), static_cast(request->config().length()), + &test_result); reply->set_status(result); if (result != RDC_ST_OK) {