RVS: Finish initial RVS integration
NOTE: RVS Build is disabled by default due to CI build issues.
Change-Id: I1593f0fe22075a9f86f54afa3ac151e109f1f7bd
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: eaa1862a80]
Цей коміт міститься в:
@@ -4,10 +4,6 @@
|
||||
# top-most EditorConfig file
|
||||
root = true
|
||||
|
||||
# Unix-style newlines with a newline ending every file and no stray whitespace
|
||||
[*]
|
||||
end_of_line = lf
|
||||
|
||||
# Matches multiple files with brace expansion notation
|
||||
# Set default charset
|
||||
[*.{c,cc,cpp,h,hh,hpp}]
|
||||
@@ -18,3 +14,9 @@ indent_size = 2
|
||||
[*.py]
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
|
||||
[*.proto]
|
||||
charset = utf-8
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
|
||||
|
||||
@@ -26,6 +26,9 @@ cmake_minimum_required(VERSION 3.15)
|
||||
set(RDC "rdc" CACHE INTERNAL "")
|
||||
set(RDC_PACKAGE ${RDC} CACHE STRING "")
|
||||
|
||||
# Default libdir to "lib"; this stops GNUInstallDirs from guessing a value when it is unset:
|
||||
set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory")
|
||||
|
||||
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/" CACHE INTERNAL "Default module path.")
|
||||
# Include common cmake modules
|
||||
include(utils)
|
||||
@@ -53,16 +56,23 @@ option(BUILD_ROCRTEST "Build targets for librdc_rocr.so" ON)
|
||||
# which requires the Rocm profiler.
|
||||
option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" OFF)
|
||||
|
||||
# When cmake -DBUILD_RVS=off, it will not build the librdc_rvs.so
|
||||
# which requires the RocmValidationSuite
|
||||
option(BUILD_RVS "Build targets for librdc_rvs.so" OFF)
|
||||
|
||||
# When cmake -DBUILD_TESTS=off, it will not build RDC tests.
|
||||
option(BUILD_TESTS "Build test suite" OFF)
|
||||
|
||||
# Enable shared libraries for gtest
|
||||
option(BUILD_SHARED_LIBS "Build shared library (.so) or not." ON)
|
||||
|
||||
# Enable address sanitizer
|
||||
option(ADDRESS_SANITIZER "Enable address sanitizer" OFF)
|
||||
|
||||
# File reorganization enable/disable
|
||||
option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" OFF)
|
||||
|
||||
option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" ON)
|
||||
option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" OFF)
|
||||
|
||||
option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON)
|
||||
|
||||
@@ -77,7 +87,8 @@ include(GNUInstallDirs)
|
||||
|
||||
# ROCM_DIR should be passed in via command line; it will be used
|
||||
# in sub-projects. Override with -DROCM_DIR=
|
||||
set(ROCM_DIR "/opt/rocm" CACHE STRING "ROCm directory.")
|
||||
set(ROCM_DIR "/opt/rocm" CACHE PATH "ROCm directory.")
|
||||
set(ROCM_PATH "${ROCM_DIR}" CACHE PATH "ROCm directory.")
|
||||
|
||||
set(COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/common")
|
||||
|
||||
@@ -89,8 +100,7 @@ set(RDC_SHARE_INSTALL_PREFIX "share/${RDC}" CACHE INTERNAL "Tests and Example in
|
||||
set(CMAKE_INSTALL_PREFIX ${ROCM_DIR} CACHE STRING "Default installation directory.")
|
||||
|
||||
# add package search paths
|
||||
set(CMAKE_PREFIX_PATH
|
||||
${CMAKE_PREFIX_PATH}
|
||||
list(APPEND CMAKE_PREFIX_PATH
|
||||
${GRPC_ROOT}
|
||||
/usr/local
|
||||
)
|
||||
@@ -213,7 +223,8 @@ if(BUILD_STANDALONE)
|
||||
--cpp_out=${PROTOB_OUT_DIR} ${file}
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
RESULT_VARIABLE PROTOB_RESULT
|
||||
OUTPUT_VARIABLE PROTOB_OUT_VAR)
|
||||
OUTPUT_VARIABLE PROTOB_OUT_VAR
|
||||
COMMAND_ERROR_IS_FATAL ANY)
|
||||
message("protoc command returned: ${PROTOB_RESULT}")
|
||||
|
||||
# Log the current value of GRPC_PLUGIN.
message("GRPC_PLUGIN=${GRPC_PLUGIN}")
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
# This module provides the rvs::rvs imported target
|
||||
# You can specify the ROCM directory by setting ROCM_DIR
|
||||
|
||||
set(NAME rvs)
|
||||
|
||||
if(NOT DEFINED ROCM_DIR)
|
||||
set(ROCM_DIR "/opt/rocm")
|
||||
endif()
|
||||
|
||||
find_library(
|
||||
${NAME}_LIBRARY
|
||||
NAMES ${NAME} ${NAME}64
|
||||
HINTS "${ROCM_DIR}"
|
||||
REGISTRY_VIEW BOTH
|
||||
PATH_SUFFIXES lib)
|
||||
|
||||
# Search for the directory containing ${NAME}.h only when the caller has not
# already defined ${NAME}_INCLUDE_DIR (for example via -D on the command line).
# DEFINED takes a bare variable name: a parenthesised operand is evaluated as a
# nested condition first, so the test would no longer look at the variable.
if(NOT DEFINED ${NAME}_INCLUDE_DIR)
    find_path(
        ${NAME}_INCLUDE_DIR
        NAMES ${NAME}.h
        HINTS "${ROCM_DIR}/include"
        PATH_SUFFIXES ${NAME} ${NAME}/inc)
endif()
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(${NAME}
|
||||
FOUND_VAR ${NAME}_FOUND
|
||||
REQUIRED_VARS
|
||||
${NAME}_LIBRARY
|
||||
${NAME}_INCLUDE_DIR)
|
||||
|
||||
if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME})
|
||||
add_library(${NAME}::${NAME} UNKNOWN IMPORTED)
|
||||
set_target_properties(${NAME}::${NAME} PROPERTIES
|
||||
IMPORTED_LOCATION "${${NAME}_LIBRARY}"
|
||||
INTERFACE_COMPILE_OPTIONS "${PC_${NAME}_CFLAGS_OTHER}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${${NAME}_INCLUDE_DIR}")
|
||||
endif()
|
||||
@@ -60,11 +60,13 @@ function(create_library_symlink)
|
||||
set(LIB_RDC_ROCR "librdc_rocr.so")
|
||||
set(LIB_RDC_ROCP "librdc_rocp.so")
|
||||
set(LIB_RDC_RAS "librdc_ras.so")
|
||||
set(LIB_RDC_RVS "librdc_rvs.so")
|
||||
set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so")
|
||||
set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" )
|
||||
set(library_files "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" )
|
||||
set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" )
|
||||
set(library_files "${library_files}" "${LIB_RDC_RAS}")
|
||||
set(library_files "${library_files}" "${LIB_RDC_RVS}")
|
||||
|
||||
foreach(file_name ${library_files})
|
||||
add_custom_target(link_${file_name} ALL
|
||||
|
||||
@@ -36,6 +36,7 @@ static std::string get_test_name(rdc_diag_test_cases_t test_case) {
|
||||
{RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
|
||||
{RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
|
||||
{RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
|
||||
{RDC_DIAG_RVS_TEST, "RVS check"},
|
||||
{RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"},
|
||||
{RDC_DIAG_TEST_LAST, "Unknown"}};
|
||||
|
||||
@@ -102,7 +103,7 @@ int main(int, char**) {
|
||||
|
||||
// (2) start to run short diagnostic.
|
||||
rdc_diag_response_t response;
|
||||
result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, &response);
|
||||
result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, nullptr, 0, &response);
|
||||
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. Return: " << rdc_status_string(result);
|
||||
@@ -135,7 +136,8 @@ int main(int, char**) {
|
||||
// (5) run one test case
|
||||
std::cout << " ============== Run individual diagnostic test ===========\n";
|
||||
rdc_diag_test_result_t test_result;
|
||||
result = rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, &test_result);
|
||||
result =
|
||||
rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, nullptr, 0, &test_result);
|
||||
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: "
|
||||
|
||||
@@ -28,10 +28,20 @@ extern "C" {
|
||||
#endif // __cplusplus
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
// cstddef include causes issues on older GCC
|
||||
// use stddef.h instead
|
||||
#if __GNUC__ < 9
|
||||
#include <stddef.h>
|
||||
#else
|
||||
#include <cstddef>
|
||||
#endif // __GNUC__
|
||||
|
||||
#include <cstdint>
|
||||
#else
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#endif // __cplusplus
|
||||
|
||||
/** \file rdc.h
|
||||
* Main header file for the ROCm RDC library.
|
||||
@@ -434,6 +444,7 @@ typedef enum {
|
||||
RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready
|
||||
RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory
|
||||
RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology
|
||||
RDC_DIAG_RVS_TEST, //!< TODO: Replace with real RVS tests
|
||||
RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range
|
||||
RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS
|
||||
} rdc_diag_test_cases_t;
|
||||
@@ -972,12 +983,17 @@ rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_
|
||||
* The RDC_DIAG_LVL_SHORT only takes a few seconds, and
|
||||
* the RDC_DIAG_LVL_LONG may take up to 15 minutes.
|
||||
*
|
||||
* @param[in] config Implementation specific configuration.
|
||||
*
|
||||
* @param[in] config_size Length of the configuration.
|
||||
*
|
||||
* @param[inout] response The detail results of the tests run.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level, rdc_diag_response_t* response);
|
||||
rdc_diag_level_t level, const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response);
|
||||
|
||||
/**
|
||||
* @brief Run one diagnostic test case
|
||||
@@ -990,12 +1006,17 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group
|
||||
*
|
||||
* @param[in] test_case The test case to run.
|
||||
*
|
||||
* @param[in] config Implementation specific configuration.
|
||||
*
|
||||
* @param[in] config_size Length of the configuration.
|
||||
*
|
||||
* @param[inout] result The results of the test.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result);
|
||||
rdc_diag_test_cases_t test_case, const char* config,
|
||||
size_t config_size, rdc_diag_test_result_t* result);
|
||||
|
||||
/**
|
||||
* @brief Get a description of a provided RDC error status
|
||||
|
||||
@@ -25,7 +25,6 @@ THE SOFTWARE.
|
||||
#include <memory>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcDiagnosticLibInterface.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -39,10 +38,12 @@ class RdcDiagnostic {
|
||||
// Run a specific test case
|
||||
virtual rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count, rdc_diag_test_result_t* result) = 0;
|
||||
uint32_t gpu_count, const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) = 0;
|
||||
|
||||
// Run multiple test cases
|
||||
virtual rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0;
|
||||
|
||||
@@ -37,6 +37,7 @@ rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST
|
||||
|
||||
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result);
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags);
|
||||
|
||||
@@ -77,9 +77,11 @@ class RdcHandler {
|
||||
|
||||
// Diagnostic API
|
||||
virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) = 0;
|
||||
|
||||
// Control API
|
||||
|
||||
@@ -42,9 +42,11 @@ class RdcDiagnosticModule : public RdcDiagnostic {
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
|
||||
@@ -81,8 +81,10 @@ class RdcEmbeddedHandler : public RdcHandler {
|
||||
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
|
||||
// Diagnostic API
|
||||
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
// Control API
|
||||
|
||||
@@ -27,9 +27,6 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/RdcModuleMgr.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcRocrLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcDiagnostic.h"
|
||||
#include "rdc_lib/RdcLibraryLoader.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcRVSLib : public RdcDiagnostic {
|
||||
public:
|
||||
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count) override;
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
rdc_status_t rdc_diag_destroy() override;
|
||||
|
||||
RdcRVSLib();
|
||||
|
||||
~RdcRVSLib() override;
|
||||
|
||||
private:
|
||||
RdcLibraryLoader lib_loader_;
|
||||
rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
|
||||
const char* config, size_t config_size, rdc_diag_test_result_t*);
|
||||
rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
|
||||
rdc_status_t (*diag_init_)(uint64_t);
|
||||
rdc_status_t (*diag_destroy_)();
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<RdcRVSLib> RdcRVSLibPtr;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_
|
||||
@@ -56,9 +56,11 @@ class RdcRasLib : public RdcTelemetry, public RdcDiagnostic {
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
|
||||
@@ -39,9 +39,11 @@ class RdcRocrLib : public RdcDiagnostic {
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
@@ -54,7 +56,7 @@ class RdcRocrLib : public RdcDiagnostic {
|
||||
private:
|
||||
RdcLibraryLoader lib_loader_;
|
||||
rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
|
||||
rdc_diag_test_result_t*);
|
||||
const char*, size_t, rdc_diag_test_result_t*);
|
||||
rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
|
||||
rdc_status_t (*diag_init_)(uint64_t);
|
||||
rdc_status_t (*diag_destroy_)();
|
||||
|
||||
@@ -53,9 +53,11 @@ class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic {
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
|
||||
@@ -76,8 +76,10 @@ class RdcStandaloneHandler : public RdcHandler {
|
||||
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
|
||||
// Diagnostic API
|
||||
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) override;
|
||||
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
// Control RdcAPI
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef RDC_MODULES_RDC_RVS_RVSBASE_H_
|
||||
#define RDC_MODULES_RDC_RVS_RVSBASE_H_
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#include "rvs/rvs.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
void session_callback(rvs_session_id_t session_id, const rvs_results_t* results);
|
||||
|
||||
rvs_status_t run_rvs_app(const char* config, size_t config_size);
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // RDC_MODULES_RDC_RVS_RVSBASE_H_
|
||||
@@ -218,12 +218,16 @@ service RdcAPI {
|
||||
// rdc_status_t rdc_diagnostic_run(
|
||||
// rdc_gpu_group_t group_id,
|
||||
// rdc_diag_level_t level,
|
||||
// const char* config,
|
||||
// size_t config_size,
|
||||
// rdc_diag_response_t* response);
|
||||
rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse) {}
|
||||
|
||||
// rdc_status_t rdc_test_case_run(
|
||||
// rdc_gpu_group_t group_id,
|
||||
// rdc_diag_test_cases_t test_case,
|
||||
// const char* config,
|
||||
// size_t config_size,
|
||||
// rdc_diag_test_result_t* result);
|
||||
rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (DiagnosticTestCaseRunResponse) {}
|
||||
}
|
||||
@@ -476,6 +480,8 @@ message RemoveAllJobResponse {
|
||||
message DiagnosticRunRequest {
|
||||
uint32 group_id = 1;
|
||||
uint32 level = 2;
|
||||
string config = 3;
|
||||
uint32 config_size = 4;
|
||||
}
|
||||
|
||||
message DiagnosticDetail {
|
||||
@@ -498,7 +504,8 @@ message DiagnosticTestResult {
|
||||
VRAM_CHECK = 3;
|
||||
SYS_MEM_CHECK = 4;
|
||||
NODE_TOPOLOGY = 5;
|
||||
GPU_PARAMETERS = 6;
|
||||
RVS_TEST = 6;
|
||||
GPU_PARAMETERS = 7;
|
||||
};
|
||||
DiagnosticTestCase test_case = 3;
|
||||
uint32 per_gpu_result_count = 4;
|
||||
@@ -525,12 +532,15 @@ message DiagnosticTestCaseRunRequest {
|
||||
VRAM_CHECK = 3;
|
||||
SYS_MEM_CHECK = 4;
|
||||
NODE_TOPOLOGY = 5;
|
||||
GPU_PARAMETERS = 6;
|
||||
RVS_TEST = 6;
|
||||
GPU_PARAMETERS = 7;
|
||||
};
|
||||
TestCaseType test_case = 2;
|
||||
string config = 3;
|
||||
uint32 config_size = 4;
|
||||
}
|
||||
|
||||
message DiagnosticTestCaseRunResponse {
|
||||
uint32 status = 1;
|
||||
DiagnosticTestResult result = 2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,6 +60,7 @@ set(BOOTSTRAP_LIB "rdc_bootstrap")
|
||||
set(RDC_LIB "rdc")
|
||||
set(RDC_ROCR_LIB "rdc_rocr")
|
||||
set(RDC_ROCP_LIB "rdc_rocp")
|
||||
set(RDC_RVS_LIB "rdc_rvs")
|
||||
set(RDCCLIENT_LIB "rdc_client")
|
||||
|
||||
################# Determine the library version #########################
|
||||
@@ -96,6 +97,9 @@ add_subdirectory(rdc_modules/rdc_rocr)
|
||||
# add librdc_rocp.so to RDC_LIB_MODULES
|
||||
add_subdirectory(rdc_modules/rdc_rocp)
|
||||
|
||||
# add librdc_rvs.so to RDC_LIB_MODULES
|
||||
add_subdirectory(rdc_modules/rdc_rvs)
|
||||
|
||||
if(BUILD_STANDALONE)
|
||||
# add librdc_client.so
|
||||
add_subdirectory(rdc_client)
|
||||
|
||||
@@ -294,23 +294,25 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
|
||||
}
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level, rdc_diag_response_t* response) {
|
||||
rdc_diag_level_t level, const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) {
|
||||
if (!p_rdc_handle) {
|
||||
return RDC_ST_INVALID_HANDLER;
|
||||
}
|
||||
|
||||
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
|
||||
->rdc_diagnostic_run(group_id, level, response);
|
||||
->rdc_diagnostic_run(group_id, level, config, config_size, response);
|
||||
}
|
||||
|
||||
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result) {
|
||||
rdc_diag_test_cases_t test_case, const char* config,
|
||||
size_t config_size, rdc_diag_test_result_t* result) {
|
||||
if (!p_rdc_handle) {
|
||||
return RDC_ST_INVALID_HANDLER;
|
||||
}
|
||||
|
||||
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
|
||||
->rdc_test_case_run(group_id, test_case, result);
|
||||
->rdc_test_case_run(group_id, test_case, config, config_size, result);
|
||||
}
|
||||
|
||||
const char* rdc_status_string(rdc_status_t result) {
|
||||
|
||||
@@ -21,6 +21,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
|
||||
"${SRC_DIR}/RdcRasLib.cc"
|
||||
"${SRC_DIR}/RdcRocpLib.cc"
|
||||
"${SRC_DIR}/RdcRocrLib.cc"
|
||||
"${SRC_DIR}/RdcRVSLib.cc"
|
||||
"${SRC_DIR}/RdcSmiDiagnosticImpl.cc"
|
||||
"${SRC_DIR}/RdcSmiLib.cc"
|
||||
"${SRC_DIR}/RdcTelemetryModule.cc"
|
||||
@@ -53,6 +54,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
|
||||
"${INC_DIR}/impl/RdcRasLib.h"
|
||||
"${INC_DIR}/impl/RdcRocpLib.h"
|
||||
"${INC_DIR}/impl/RdcRocrLib.h"
|
||||
"${INC_DIR}/impl/RdcRVSLib.h"
|
||||
"${INC_DIR}/impl/RdcSmiDiagnosticImpl.h"
|
||||
"${INC_DIR}/impl/RdcSmiLib.h"
|
||||
"${INC_DIR}/impl/RdcTelemetryModule.h"
|
||||
|
||||
@@ -28,6 +28,7 @@ THE SOFTWARE.
|
||||
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/impl/RdcRVSLib.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcRocrLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
@@ -55,7 +56,8 @@ rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query(
|
||||
|
||||
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
uint32_t gpu_count, const char* config,
|
||||
size_t config_size,
|
||||
rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -68,11 +70,13 @@ rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_c
|
||||
strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH);
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, result);
|
||||
return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, config, config_size,
|
||||
result);
|
||||
}
|
||||
|
||||
rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpus,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_level_t level, const char* config,
|
||||
size_t config_size,
|
||||
rdc_diag_response_t* response) {
|
||||
if (response == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -87,11 +91,15 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu
|
||||
rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK);
|
||||
}
|
||||
|
||||
if (level >= RDC_DIAG_LVL_MED) { // Medium run and above
|
||||
rdc_runs.push_back(RDC_DIAG_RVS_TEST);
|
||||
}
|
||||
|
||||
response->results_count = 0;
|
||||
for (unsigned int i = 0; i < rdc_runs.size(); i++) {
|
||||
response->diag_info[i].test_case = rdc_runs[i];
|
||||
rdc_test_case_run(rdc_runs[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count,
|
||||
&(response->diag_info[i]));
|
||||
rdc_test_case_run(rdc_runs[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count, config,
|
||||
config_size, &(response->diag_info[i]));
|
||||
response->results_count++;
|
||||
}
|
||||
|
||||
@@ -116,8 +124,9 @@ rdc_status_t RdcDiagnosticModule::RdcDiagnosticModule::rdc_diag_destroy() {
|
||||
|
||||
RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) {
|
||||
const RdcSmiLibPtr smi_module = std::make_shared<RdcSmiLib>(fetcher);
|
||||
const RdcRasLibPtr ras_module = std::make_shared<RdcRasLib>();
|
||||
const RdcRocrLibPtr rocr_module = std::make_shared<RdcRocrLib>();
|
||||
const RdcRasLibPtr ras_module = std::make_shared<RdcRasLib>();
|
||||
const RdcRVSLibPtr rvs_module = std::make_shared<RdcRVSLib>();
|
||||
if (smi_module) {
|
||||
diagnostic_modules_.push_back(smi_module);
|
||||
}
|
||||
@@ -127,6 +136,9 @@ RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) {
|
||||
if (ras_module) {
|
||||
diagnostic_modules_.push_back(ras_module);
|
||||
}
|
||||
if (rvs_module) {
|
||||
diagnostic_modules_.push_back(rvs_module);
|
||||
}
|
||||
|
||||
auto ite = diagnostic_modules_.begin();
|
||||
for (; ite != diagnostic_modules_.end(); ite++) {
|
||||
|
||||
@@ -350,7 +350,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
|
||||
// Diagnostic API
|
||||
rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_level_t level, const char* config,
|
||||
size_t config_size,
|
||||
rdc_diag_response_t* response) {
|
||||
if (!response) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -362,11 +363,12 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
|
||||
if (status != RDC_ST_OK) return status;
|
||||
|
||||
auto diag = rdc_module_mgr_->get_diagnostic_module();
|
||||
return diag->rdc_diagnostic_run(rdc_group_info, level, response);
|
||||
return diag->rdc_diagnostic_run(rdc_group_info, level, config, config_size, response);
|
||||
}
|
||||
|
||||
rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
|
||||
rdc_diag_test_cases_t test_case,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* result) {
|
||||
if (!result) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -377,8 +379,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
|
||||
if (status != RDC_ST_OK) return status;
|
||||
|
||||
auto diag = rdc_module_mgr_->get_diagnostic_module();
|
||||
return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count,
|
||||
result);
|
||||
return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count, config,
|
||||
config_size, result);
|
||||
}
|
||||
|
||||
// Control API
|
||||
|
||||
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcRVSLib.h"
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Loads librdc_rvs.so and resolves the diagnostic entry points it exports.
//
// Every function pointer starts out null. Construction stops early (leaving
// the remaining pointers null) when the library cannot be loaded, when
// rdc_diag_init cannot be resolved, or when rdc_diag_init(0) reports an
// error. The three remaining symbols are resolved independently of each
// other; a symbol that fails to resolve is reset to null.
RdcRVSLib::RdcRVSLib()
    : test_case_run_(nullptr),
      diag_test_cases_query_(nullptr),
      diag_init_(nullptr),
      diag_destroy_(nullptr) {
  if (lib_loader_.load("librdc_rvs.so") != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "RVS related function will not work.");
    return;
  }

  // The init hook is resolved and invoked before anything else is looked up.
  if (lib_loader_.load_symbol(&diag_init_, "rdc_diag_init") != RDC_ST_OK) {
    diag_init_ = nullptr;
    return;
  }

  const rdc_status_t init_status = diag_init_(0);
  if (init_status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Fail to init librdc_rvs.so:" << rdc_status_string(init_status)
                                                     << ". RVS related function will not work.");
    return;
  }

  if (lib_loader_.load_symbol(&diag_destroy_, "rdc_diag_destroy") != RDC_ST_OK) {
    diag_destroy_ = nullptr;
  }

  if (lib_loader_.load_symbol(&test_case_run_, "rdc_diag_test_case_run") != RDC_ST_OK) {
    test_case_run_ = nullptr;
  }

  if (lib_loader_.load_symbol(&diag_test_cases_query_, "rdc_diag_test_cases_query") !=
      RDC_ST_OK) {
    diag_test_cases_query_ = nullptr;
  }
}
|
||||
|
||||
// Invokes the module's rdc_diag_destroy hook, if it was resolved.
RdcRVSLib::~RdcRVSLib() {
  if (!diag_destroy_) {
    return;
  }
  diag_destroy_();
}
|
||||
|
||||
// Asks the RVS module which test cases it provides.
//
// Returns RDC_ST_BAD_PARAMETER when test_case_count is null and
// RDC_ST_FAIL_LOAD_MODULE when the query symbol was not resolved; otherwise
// returns whatever the module's query function reports.
rdc_status_t RdcRVSLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                                  uint32_t* test_case_count) {
  if (test_case_count == nullptr) return RDC_ST_BAD_PARAMETER;
  if (!diag_test_cases_query_) return RDC_ST_FAIL_LOAD_MODULE;

  const rdc_status_t query_status = diag_test_cases_query_(test_cases, test_case_count);
  RDC_LOG(RDC_DEBUG, "Query " << *test_case_count
                              << " test cases from RVS: " << rdc_status_string(query_status));
  return query_status;
}
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t RdcRVSLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count, const char* config,
|
||||
size_t config_size, rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
if (!test_case_run_) {
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
rdc_status_t status =
|
||||
test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result);
|
||||
RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from RVS: " << rdc_status_string(status)
|
||||
<< " config[" << config_size << "]: " << config);
|
||||
return status;
|
||||
}
|
||||
|
||||
// Level-based diagnostic runs are not implemented for the RVS module: every
// argument is ignored and RDC_ST_NOT_SUPPORTED is returned.
rdc_status_t RdcRVSLib::rdc_diagnostic_run(const rdc_group_info_t& /*gpus*/,
                                           rdc_diag_level_t /*level*/, const char* /*config*/,
                                           size_t /*config_size*/,
                                           rdc_diag_response_t* /*response*/) {
  return RDC_ST_NOT_SUPPORTED;
}
|
||||
|
||||
// Calls the module's rdc_diag_init with the given flags, or reports
// RDC_ST_FAIL_LOAD_MODULE when that symbol was not resolved.
rdc_status_t RdcRVSLib::rdc_diag_init(uint64_t flags) {
  return diag_init_ ? diag_init_(flags) : RDC_ST_FAIL_LOAD_MODULE;
}
|
||||
// Calls the module's rdc_diag_destroy, or reports RDC_ST_FAIL_LOAD_MODULE
// when that symbol was not resolved.
rdc_status_t RdcRVSLib::rdc_diag_destroy() {
  return diag_destroy_ ? diag_destroy_() : RDC_ST_FAIL_LOAD_MODULE;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -151,18 +151,24 @@ rdc_status_t RdcRasLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas
|
||||
// Run a specific test case
|
||||
rdc_status_t RdcRasLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count, rdc_diag_test_result_t* result) {
|
||||
uint32_t gpu_count, const char* config,
|
||||
size_t config_size, rdc_diag_test_result_t* result) {
|
||||
(void)test_case;
|
||||
(void)gpu_index;
|
||||
(void)result;
|
||||
(void)gpu_count;
|
||||
(void)config;
|
||||
(void)config_size;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
rdc_status_t RdcRasLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) {
|
||||
(void)gpus;
|
||||
(void)level;
|
||||
(void)config;
|
||||
(void)config_size;
|
||||
(void)response;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@@ -92,7 +92,8 @@ rdc_status_t RdcRocrLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_ca
|
||||
// Run a specific test case
|
||||
rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count, rdc_diag_test_result_t* result) {
|
||||
uint32_t gpu_count, const char* config,
|
||||
size_t config_size, rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -100,15 +101,19 @@ rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
return RDC_ST_FAIL_LOAD_MODULE;
|
||||
}
|
||||
|
||||
rdc_status_t status = test_case_run_(test_case, gpu_index, gpu_count, result);
|
||||
rdc_status_t status =
|
||||
test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result);
|
||||
RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from Rocr: " << rdc_status_string(status));
|
||||
return status;
|
||||
}
|
||||
|
||||
rdc_status_t RdcRocrLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_response_t* response) {
|
||||
(void)gpus;
|
||||
(void)level;
|
||||
(void)config;
|
||||
(void)config_size;
|
||||
(void)response;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@@ -188,7 +188,8 @@ rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas
|
||||
// Run a specific test case
|
||||
rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count, rdc_diag_test_result_t* result) {
|
||||
uint32_t gpu_count, const char* /*config*/,
|
||||
size_t /*config_size*/, rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -204,8 +205,8 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
}
|
||||
}
|
||||
|
||||
rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t,
|
||||
rdc_diag_response_t*) {
|
||||
rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t, const char*,
|
||||
size_t, rdc_diag_response_t*) {
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
|
||||
@@ -530,7 +530,8 @@ rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
|
||||
// Diagnostic API
|
||||
rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_level_t level, const char* config,
|
||||
size_t config_size,
|
||||
rdc_diag_response_t* response) {
|
||||
if (!response) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -541,6 +542,8 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
|
||||
|
||||
request.set_group_id(group_id);
|
||||
request.set_level(level);
|
||||
request.set_config(config);
|
||||
request.set_config_size(config_size);
|
||||
|
||||
::grpc::Status status = stub_->DiagnosticRun(&context, request, &reply);
|
||||
rdc_status_t err_status = error_handle(status, reply.status());
|
||||
@@ -583,6 +586,7 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
|
||||
|
||||
rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
|
||||
rdc_diag_test_cases_t test_case,
|
||||
const char* config, size_t config_size,
|
||||
rdc_diag_test_result_t* to_result) {
|
||||
if (!to_result) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -592,6 +596,8 @@ rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
|
||||
::grpc::ClientContext context;
|
||||
|
||||
request.set_group_id(group_id);
|
||||
request.set_config(config);
|
||||
request.set_config_size(config_size);
|
||||
request.set_test_case(static_cast<::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case));
|
||||
|
||||
::grpc::Status status = stub_->DiagnosticTestCaseRun(&context, request, &reply);
|
||||
|
||||
@@ -26,7 +26,6 @@ set(RDC_ROCR_LIB_INC_LIST
|
||||
"${INC_DIR}/base_rocr_utils.h"
|
||||
"${INC_DIR}/common.h"
|
||||
"${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
|
||||
"${RDC_LIB_INC_DIR}/RdcDiagnosticLibInterface.h"
|
||||
"${RDC_LIB_INC_DIR}/RdcLogger.h"
|
||||
"${RDC_LIB_INC_DIR}/rdc_common.h")
|
||||
|
||||
|
||||
@@ -151,6 +151,7 @@ static rdc_status_t run_compute_queue_test(uint32_t gpu_index, rdc_diag_test_res
|
||||
|
||||
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
const char* /*config*/, size_t /*config_size*/,
|
||||
rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr || gpu_count == 0) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
@@ -178,7 +179,7 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
break;
|
||||
default:
|
||||
result->status = RDC_DIAG_RESULT_SKIP;
|
||||
strncpy_with_null(result->info, "Not support yet", MAX_DIAG_MSG_LENGTH);
|
||||
strncpy_with_null(result->info, "Not supported yet", MAX_DIAG_MSG_LENGTH);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Cmake RDC Lib-RVS ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")

# Source and header locations for the RVS diagnostic module.
set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(INC_DIR "${PROJECT_SOURCE_DIR}/include/rdc_modules/rdc_rvs")

set(RDC_RVS_LIB_COMPONENT "lib${RDC_RVS_LIB}")
set(RDC_RVS_LIB_SRC_LIST
    "${BOOTSTRAP_LIB_SRC_DIR}/RdcLogger.cc"
    "${SRC_DIR}/RvsBase.cc"
    "${SRC_DIR}/RdcDiagnosticLib.cc"
)
set(RDC_RVS_LIB_INC_LIST
    "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
    "${RDC_LIB_INC_DIR}/RdcDiagnostic.h"
    "${RDC_LIB_INC_DIR}/rdc_common.h"
    "${RDC_LIB_INC_DIR}/RdcLogger.h"
    "${INC_DIR}/RvsBase.h"
)

# The module is only configured when the BUILD_RVS option is enabled.
if(BUILD_RVS)
    message("Build librdc_rvs.so is enabled, make sure RVS is installed.")

    message("RDC_RVS_LIB_INC_LIST=${RDC_RVS_LIB_INC_LIST}")

    # needed to find extra packages
    list(APPEND CMAKE_PREFIX_PATH ${ROCM_DIR})
    find_package(yaml-cpp REQUIRED)
    find_package(hip REQUIRED)
    find_package(hsa-runtime64 REQUIRED)
    find_package(rvs REQUIRED
        HINTS "${ROCM_DIR}/lib/cmake")
    find_package(Threads REQUIRED)

    # find_library() only understands REQUIRED from CMake 3.18 onwards, which
    # is newer than the version this project declares as its minimum, so the
    # not-found case is checked explicitly instead.
    find_library(rvslib
        NAMES rvslib)
    if(NOT rvslib)
        message(FATAL_ERROR "rvslib was not found; install RVS or disable BUILD_RVS.")
    endif()

    ## additional libraries
    set(COMBINED_LIBS rocblas hsakmt hsa-runtime64 hip::amdhip64 yaml-cpp)

    # Publish this module to the parent directory's module list.
    set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_RVS_LIB} PARENT_SCOPE)
    add_library(${RDC_RVS_LIB} SHARED ${RDC_RVS_LIB_SRC_LIST} ${RDC_RVS_LIB_INC_LIST})
    target_link_libraries(${RDC_RVS_LIB} PRIVATE
        ${RDC_LIB}
        ${BOOTSTRAP_LIB}
        ${rvslib}
        Threads::Threads
        ${CMAKE_DL_LIBS}
        ${COMBINED_LIBS})
    target_include_directories(${RDC_RVS_LIB} PRIVATE
        "${PROJECT_SOURCE_DIR}"
        "${PROJECT_SOURCE_DIR}/include"
        "${COMMON_DIR}"
        "${RSMI_INC_DIR}"
        "${ROCM_DIR}/include"
        "${ROCM_DIR}/include/hsa"
        "${ROCM_VALIDATION_SUITE_INCLUDE_DIR}")

    # Set the VERSION and SOVERSION values
    set_property(TARGET ${RDC_RVS_LIB} PROPERTY
        SOVERSION "${VERSION_MAJOR}")
    set_property(TARGET ${RDC_RVS_LIB} PROPERTY
        VERSION "${SO_VERSION_STRING}")

    # If the library is a release, strip the target library.
    # $<TARGET_FILE:...> resolves to the library file actually produced by the
    # link step, so the command does not depend on the working directory or on
    # a hand-assembled file name.
    if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
        add_custom_command(
            TARGET ${RDC_RVS_LIB}
            POST_BUILD
            COMMAND "${CMAKE_STRIP}" "$<TARGET_FILE:${RDC_RVS_LIB}>"
            VERBATIM)
    endif()
endif()
|
||||
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <string.h>
|
||||
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rdc_modules/rdc_rvs/RvsBase.h"
|
||||
|
||||
// TODO: Replace with client-side feedback channel
|
||||
#define RVS_LOG() RDC_LOG(RDC_DEBUG, "!HELLO_FROM_RVS! " << __FILE__ << ":" << __LINE__)
|
||||
|
||||
// Module initialisation hook. The flags argument is ignored; the call is
// traced through RVS_LOG and always reports RDC_ST_OK.
rdc_status_t rdc_diag_init(uint64_t /*flags*/) {
  RVS_LOG();
  return RDC_ST_OK;
}
|
||||
|
||||
// Module teardown hook. Nothing is released here; the call is traced through
// RVS_LOG and always reports RDC_ST_OK.
rdc_status_t rdc_diag_destroy(void) {
  RVS_LOG();
  return RDC_ST_OK;
}
|
||||
|
||||
// Reports the test cases implemented by this module: exactly one,
// RDC_DIAG_RVS_TEST.
//
// test_cases       Output array; element 0 receives RDC_DIAG_RVS_TEST.
// test_case_count  Output; set to 1.
//
// Returns RDC_ST_BAD_PARAMETER when either output pointer is null (nothing is
// written in that case), RDC_ST_OK otherwise.
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                       uint32_t* test_case_count) {
  RVS_LOG();
  // The array parameter decays to a pointer, so it can be null just like the
  // count; both are written below and must be validated first.
  if (test_cases == nullptr || test_case_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  *test_case_count = 1;
  test_cases[0] = RDC_DIAG_RVS_TEST;

  return RDC_ST_OK;
}
|
||||
|
||||
// Runs the RVS test case and fills in *result.
//
// test_case    Must be RDC_DIAG_RVS_TEST; any other value is rejected.
// gpu_index    Currently unused (see TODO below).
// gpu_count    Must be non-zero.
// config       Test configuration forwarded to amd::rdc::run_rvs_app.
// config_size  Size of config, forwarded to amd::rdc::run_rvs_app.
// result       Output; reset, then populated with the test case, an info
//              message and PASS or FAIL depending on the RVS status.
//
// Returns RDC_ST_BAD_PARAMETER for a null result, a zero gpu_count or an
// unsupported test case; otherwise RDC_ST_OK, with the outcome of the run
// carried in result->status.
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
                                    // TODO: use gpu_index
                                    uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
                                    const char* config, size_t config_size,
                                    rdc_diag_test_result_t* result) {
  (void)gpu_index;  // not consumed yet; silences the unused-parameter warning

  RVS_LOG();
  if (result == nullptr || gpu_count == 0) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Only one test case is implemented. Because everything else is rejected
  // here, no further dispatch on test_case is needed below.
  if (test_case != RDC_DIAG_RVS_TEST) {
    return RDC_ST_BAD_PARAMETER;
  }

  // init the return data
  *result = {};
  result->test_case = test_case;
  result->status = RDC_DIAG_RESULT_PASS;
  result->per_gpu_result_count = 0;

  strncpy_with_null(result->info, "Finished running RDC_DIAG_RVS_TEST!", MAX_DIAG_MSG_LENGTH);
  const rvs_status_t rvs_status = amd::rdc::run_rvs_app(config, config_size);

  if (rvs_status != RVS_STATUS_SUCCESS) {
    result->status = RDC_DIAG_RESULT_FAIL;
  }

  return RDC_ST_OK;
}
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_modules/rdc_rvs/RvsBase.h"

#include <string.h>

#include <chrono>
#include <thread>

#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rvs/rvs.h"
|
||||
|
||||
// TODO: Make generic test
|
||||
// TODO: Allow for user to override defaults with a custom string
|
||||
static constexpr size_t MAX_CONFIG_LENGTH = 1024;
|
||||
|
||||
volatile rvs_session_state_t state = RVS_SESSION_STATE_IDLE;
|
||||
|
||||
// Runs one RVS session with a custom action configuration and blocks until
// the session callback reports RVS_SESSION_STATE_COMPLETED.
//
// config       Action configuration text. When null (or config_size is 0)
//              the built-in GST configuration below is used instead.
// config_size  Upper bound on the number of bytes to read from config. The
//              text must be shorter than MAX_CONFIG_LENGTH so that it fits,
//              NUL-terminated, into the local buffer.
//
// Returns RVS_STATUS_INVALID_ARGUMENT when the configuration does not fit,
// the failing status of the first RVS call that does not succeed, or the
// status of rvs_session_destroy once the session has completed.
rvs_status_t amd::rdc::run_rvs_app(const char* config, const size_t config_size) {
  // Fallback configuration used when the caller supplies none.
  static const char gst_config[] =
      "{actions: [{name: gpustress-9000-sgemm-false, device: all, "
      "device_index: '0', module: gst, parallel: false, count: 1, duration: "
      "10000, copy_matrix: false, target_stress: 9000, matrix_size_a: 8640, "
      "matrix_size_b: 8640, matrix_size_c: 8640, ops_type: sgemm, lda: 8640, "
      "ldb: 8640, ldc: 8640}]}";
  static_assert(sizeof(gst_config) <= MAX_CONFIG_LENGTH,
                "default GST config must fit into the active config buffer");

  char active_config[MAX_CONFIG_LENGTH];

  if ((config == nullptr) || (config_size == 0)) {
    RDC_LOG(RDC_INFO, "given config is NULL! Using predefined gst_config");
    memcpy(active_config, gst_config, sizeof(gst_config));  // includes the NUL
  } else {
    // Never read past config_size bytes, and stop at an embedded terminator.
    const size_t config_len = strnlen(config, config_size);
    if (config_len >= MAX_CONFIG_LENGTH) {
      RDC_LOG(RDC_ERROR, "given config size is too large! Expected at most "
                             << (MAX_CONFIG_LENGTH - 1) << ", got " << config_len << " instead.");
      return RVS_STATUS_INVALID_ARGUMENT;
    }
    RDC_LOG(RDC_DEBUG, "given config is correct");
    // Copy the payload and terminate it explicitly, so the result does not
    // depend on whether a copy helper reserves room for the terminator.
    memcpy(active_config, config, config_len);
    active_config[config_len] = '\0';
  }

  rvs_status_t status = rvs_initialize();
  if (status == RVS_STATUS_FAILED) {
    RDC_LOG(RDC_ERROR, "rvs initialization failed");
    return status;
  }

  // Reset the shared state before the callback can start updating it.
  state = RVS_SESSION_STATE_IDLE;

  rvs_session_id_t session_id;
  status = rvs_session_create(&session_id, amd::rdc::session_callback);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session create failed with status: " << status);
    return status;
  }

  /* Using custom configuration in string format */
  rvs_session_property_t session_property = {RVS_SESSION_TYPE_DEFAULT_CONF, {{RVS_MODULE_GST}}};
  session_property.type = RVS_SESSION_TYPE_CUSTOM_ACTION;
  session_property.custom_action.config = active_config;

  status = rvs_session_set_property(session_id, &session_property);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session set property failed with status: " << status);
    rvs_session_destroy(session_id);
    return status;
  }

  status = rvs_session_execute(session_id);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session execute failed with status: " << status);
    rvs_session_destroy(session_id);
    return status;
  }

  // Wait for session_callback to publish the completed state. Sleeping
  // between polls keeps this thread from spinning at full CPU load.
  // NOTE(review): `state` is a plain volatile global written by the callback;
  // if the callback runs on another thread an atomic would be the correct
  // primitive — confirm against the RVS threading model.
  while (state != RVS_SESSION_STATE_COMPLETED) {
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }

  status = rvs_session_destroy(session_id);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session destroy failed with status: " << status);
  }

  return status;
}
|
||||
|
||||
// Session callback handed to rvs_session_create. Dumps the reported state,
// status and output log to stdout and publishes the state through the
// file-level `state` variable that run_rvs_app polls.
void amd::rdc::session_callback(rvs_session_id_t session_id, const rvs_results_t* results) {
  // NOTE: This is a placeholder!
  // TODO: Use GRPC to send message back to client periodically
  const auto print_separator = [] {
    printf(
        "/*******************************************************************/"
        "\n");
  };

  print_separator();
  printf("session id -> %d state -> %d\n", session_id, results->state);
  printf("session id -> %d status -> %d\n", session_id, results->status);
  printf("session id -> %d output -> %s\n", session_id, results->output_log);
  print_separator();

  // Publish the new state first, then echo the stored value.
  state = results->state;
  printf("state -> %d\n", state);
}
|
||||
@@ -51,6 +51,7 @@ class RdciSubSystem {
|
||||
std::string ip_port_;
|
||||
|
||||
bool use_auth_;
|
||||
std::string config_test_;
|
||||
std::string root_ca_;
|
||||
std::string client_cert_;
|
||||
std::string client_key_;
|
||||
|
||||
@@ -54,6 +54,7 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
|
||||
{"help", optional_argument, nullptr, 'h'},
|
||||
{"unauth", optional_argument, nullptr, 'u'},
|
||||
{"config-test", optional_argument, nullptr, 'c'},
|
||||
{"run-level", required_argument, nullptr, 'r'},
|
||||
{"group-id", required_argument, nullptr, 'g'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
@@ -62,7 +63,7 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
int option_index = 0;
|
||||
int opt = 0;
|
||||
|
||||
while ((opt = getopt_long(argc, argv, "hug:r:", long_options, &option_index)) != -1) {
|
||||
while ((opt = getopt_long(argc, argv, "hug:r:c:", long_options, &option_index)) != -1) {
|
||||
switch (opt) {
|
||||
case HOST_OPTIONS:
|
||||
ip_port_ = optarg;
|
||||
@@ -73,6 +74,11 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
case 'u':
|
||||
use_auth_ = false;
|
||||
break;
|
||||
case 'c':
|
||||
config_test_ = optarg;
|
||||
printf("config_test_ = %s\n", config_test_.c_str());
|
||||
printf("config_test_.length = %zu\n", config_test_.length());
|
||||
break;
|
||||
case 'g':
|
||||
if (!IsNumber(optarg)) {
|
||||
show_help();
|
||||
@@ -93,7 +99,6 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
|
||||
}
|
||||
}
|
||||
|
||||
if (!group_id_set) {
|
||||
show_help();
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id");
|
||||
@@ -108,12 +113,13 @@ void RdciDiagSubSystem::show_help() const {
|
||||
// ***** 20 Chars ****
|
||||
std::cout << " diag -- Used to run diagnostic for GPUs.\n\n";
|
||||
std::cout << "Usage\n";
|
||||
std::cout << " rdci diag [--host <IP/FQDN>:port] [-u] -g <groupId>"
|
||||
std::cout << " rdci diag [--host <IP/FQDN>:port] [-u] [-t] -g <groupId>"
|
||||
<< " -r <runLevel>\n";
|
||||
std::cout << "\nFlags:\n";
|
||||
show_common_usage();
|
||||
std::cout << " -g --group-id The GPU group to diagnose"
|
||||
<< " on the specified host.\n";
|
||||
std::cout << " -c --config-test Set custom test config (RVS)\n";
|
||||
std::cout << " -r --run-level level Integer representing test"
|
||||
<< " run levels [default = 1].\n"
|
||||
<< " level 1: Tests take a "
|
||||
@@ -130,6 +136,7 @@ std::string RdciDiagSubSystem::get_test_name(rdc_diag_test_cases_t test_case) co
|
||||
{RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
|
||||
{RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
|
||||
{RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
|
||||
{RDC_DIAG_RVS_TEST, "Pre-defined config RVS check"},
|
||||
{RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"},
|
||||
{RDC_DIAG_TEST_LAST, "Unknown"}};
|
||||
|
||||
@@ -148,7 +155,8 @@ void RdciDiagSubSystem::process() {
|
||||
|
||||
rdc_status_t result;
|
||||
rdc_diag_response_t response;
|
||||
result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, &response);
|
||||
result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, config_test_.c_str(),
|
||||
config_test_.length(), &response);
|
||||
|
||||
if (result != RDC_ST_OK) {
|
||||
std::string error_msg = rdc_status_string(result);
|
||||
|
||||
@@ -556,9 +556,10 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
|
||||
}
|
||||
|
||||
rdc_diag_response_t diag_response;
|
||||
rdc_status_t result =
|
||||
rdc_diagnostic_run(rdc_handle_, request->group_id(),
|
||||
static_cast<rdc_diag_level_t>(request->level()), &diag_response);
|
||||
rdc_status_t result = rdc_diagnostic_run(
|
||||
rdc_handle_, request->group_id(), static_cast<rdc_diag_level_t>(request->level()),
|
||||
const_cast<char*>(request->config().c_str()), static_cast<size_t>(request->config().length()),
|
||||
&diag_response);
|
||||
|
||||
reply->set_status(result);
|
||||
if (result != RDC_ST_OK) {
|
||||
@@ -607,9 +608,10 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
|
||||
}
|
||||
|
||||
rdc_diag_test_result_t test_result;
|
||||
rdc_status_t result =
|
||||
rdc_test_case_run(rdc_handle_, request->group_id(),
|
||||
static_cast<rdc_diag_test_cases_t>(request->test_case()), &test_result);
|
||||
rdc_status_t result = rdc_test_case_run(
|
||||
rdc_handle_, request->group_id(), static_cast<rdc_diag_test_cases_t>(request->test_case()),
|
||||
const_cast<char*>(request->config().c_str()), static_cast<size_t>(request->config().length()),
|
||||
&test_result);
|
||||
|
||||
reply->set_status(result);
|
||||
if (result != RDC_ST_OK) {
|
||||
|
||||
Посилання в новій задачі
Заблокувати користувача