RVS: Finish initial RVS integration

NOTE: RVS Build is disabled by default due to CI build issues.

Change-Id: I1593f0fe22075a9f86f54afa3ac151e109f1f7bd
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: eaa1862a80]
Dieser Commit ist enthalten in:
Galantsev, Dmitrii
2023-06-07 19:46:35 -05:00
Ursprung ea624cbb7c
Commit 38c60ff90b
37 geänderte Dateien mit 729 neuen und 56 gelöschten Zeilen
+6 -4
Datei anzeigen
@@ -4,10 +4,6 @@
# top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file and no stray whitespaces
[*]
end_of_line = lf
# Matches multiple files with brace expansion notation
# Set default charset
[*.{c,cc,cpp,h,hh,hpp}]
@@ -18,3 +14,9 @@ indent_size = 2
[*.py]
indent_style = space
indent_size = 4
[*.proto]
charset = utf-8
indent_style = space
indent_size = 2
+16 -5
Datei anzeigen
@@ -26,6 +26,9 @@ cmake_minimum_required(VERSION 3.15)
set(RDC "rdc" CACHE INTERNAL "")
set(RDC_PACKAGE ${RDC} CACHE STRING "")
# Default libdir to "lib", this skips GNUInstallDirs from trying to take a guess if it's unset:
set(CMAKE_INSTALL_LIBDIR "lib" CACHE STRING "Library install directory")
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/" CACHE INTERNAL "Default module path.")
# Include common cmake modules
include(utils)
@@ -53,16 +56,23 @@ option(BUILD_ROCRTEST "Build targets for librdc_rocr.so" ON)
# which requires the Rocm profiler.
option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" OFF)
# When cmake -DBUILD_RVS=off, it will not build the librdc_rvs.so
# which requires the RocmValidationSuite
option(BUILD_RVS "Build targets for librdc_rvs.so" OFF)
# When cmake -DBUILD_TESTS=off, it will not build RDC tests.
option(BUILD_TESTS "Build test suite" OFF)
# Enable shared libraries for gtest
option(BUILD_SHARED_LIBS "Build shared library (.so) or not." ON)
# Enable address sanitizer
option(ADDRESS_SANITIZER "Enable address sanitizer" OFF)
# File reorganization enable/disable
option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" OFF)
option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" ON)
option(CMAKE_VERBOSE_MAKEFILE "Enable verbose output" OFF)
option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON)
@@ -77,7 +87,8 @@ include(GNUInstallDirs)
# ROCM_DIR should be passed in via command line; it will be used
# in sub-projects. Override with -DROCM_DIR=
set(ROCM_DIR "/opt/rocm" CACHE STRING "ROCm directory.")
set(ROCM_DIR "/opt/rocm" CACHE PATH "ROCm directory.")
set(ROCM_PATH "${ROCM_DIR}" CACHE PATH "ROCm directory.")
set(COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/common")
@@ -89,8 +100,7 @@ set(RDC_SHARE_INSTALL_PREFIX "share/${RDC}" CACHE INTERNAL "Tests and Example in
set(CMAKE_INSTALL_PREFIX ${ROCM_DIR} CACHE STRING "Default installation directory.")
# add package search paths
set(CMAKE_PREFIX_PATH
${CMAKE_PREFIX_PATH}
list(APPEND CMAKE_PREFIX_PATH
${GRPC_ROOT}
/usr/local
)
@@ -213,7 +223,8 @@ if(BUILD_STANDALONE)
--cpp_out=${PROTOB_OUT_DIR} ${file}
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
RESULT_VARIABLE PROTOB_RESULT
OUTPUT_VARIABLE PROTOB_OUT_VAR)
OUTPUT_VARIABLE PROTOB_OUT_VAR
COMMAND_ERROR_IS_FATAL ANY)
message("protoc command returned: ${PROTOB_RESULT}")
message("GRPC_PLUGIN=${GRPC_PLUGIN})")
@@ -0,0 +1,38 @@
# This module provides a rvs::rvs package
# You can specify the ROCM directory by setting ROCM_DIR
#
# Variables set by this module:
#   rvs_LIBRARY      - full path of the rvs library that was found
#   rvs_INCLUDE_DIR  - directory containing rvs.h
#   rvs_FOUND        - true when both of the above were located
# Imported target created on success:
#   rvs::rvs

set(NAME rvs)

# Fall back to the default ROCm location when the caller did not provide one.
if(NOT DEFINED ROCM_DIR)
    set(ROCM_DIR "/opt/rocm")
endif()

# REGISTRY_VIEW is intentionally not used here: the keyword only exists in
# CMake >= 3.24 and older releases would treat it as an extra search path.
find_library(
    ${NAME}_LIBRARY
    NAMES ${NAME} ${NAME}64
    HINTS "${ROCM_DIR}"
    PATH_SUFFIXES lib)

# Only search for the headers when the caller has not already supplied
# ${NAME}_INCLUDE_DIR. DEFINED takes a bare variable name; wrapping the name
# in parentheses makes it test the result of a sub-expression instead.
if(NOT DEFINED ${NAME}_INCLUDE_DIR)
    find_path(
        ${NAME}_INCLUDE_DIR
        NAMES ${NAME}.h
        HINTS "${ROCM_DIR}/include"
        PATH_SUFFIXES ${NAME} ${NAME}/inc)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(${NAME}
    FOUND_VAR ${NAME}_FOUND
    REQUIRED_VARS
        ${NAME}_LIBRARY
        ${NAME}_INCLUDE_DIR)

# Keep the raw search results out of the default cmake-gui / ccmake view.
mark_as_advanced(${NAME}_LIBRARY ${NAME}_INCLUDE_DIR)

if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME})
    add_library(${NAME}::${NAME} UNKNOWN IMPORTED)
    # PC_${NAME}_CFLAGS_OTHER is not set anywhere in this module, so the
    # compile options are empty unless the including project defines it.
    set_target_properties(${NAME}::${NAME} PROPERTIES
        IMPORTED_LOCATION "${${NAME}_LIBRARY}"
        INTERFACE_COMPILE_OPTIONS "${PC_${NAME}_CFLAGS_OTHER}"
        INTERFACE_INCLUDE_DIRECTORIES "${${NAME}_INCLUDE_DIR}")
endif()
@@ -60,11 +60,13 @@ function(create_library_symlink)
set(LIB_RDC_ROCR "librdc_rocr.so")
set(LIB_RDC_ROCP "librdc_rocp.so")
set(LIB_RDC_RAS "librdc_ras.so")
set(LIB_RDC_RVS "librdc_rvs.so")
set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so")
set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" )
set(library_files "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" )
set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" )
set(library_files "${library_files}" "${LIB_RDC_RAS}")
set(library_files "${library_files}" "${LIB_RDC_RVS}")
foreach(file_name ${library_files})
add_custom_target(link_${file_name} ALL
+4 -2
Datei anzeigen
@@ -36,6 +36,7 @@ static std::string get_test_name(rdc_diag_test_cases_t test_case) {
{RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
{RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
{RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
{RDC_DIAG_RVS_TEST, "RVS check"},
{RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"},
{RDC_DIAG_TEST_LAST, "Unknown"}};
@@ -102,7 +103,7 @@ int main(int, char**) {
// (2) start to run short diagnostic.
rdc_diag_response_t response;
result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, &response);
result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, nullptr, 0, &response);
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. Return: " << rdc_status_string(result);
@@ -135,7 +136,8 @@ int main(int, char**) {
// (5) run one test case
std::cout << " ============== Run individual diagnostic test ===========\n";
rdc_diag_test_result_t test_result;
result = rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, &test_result);
result =
rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, nullptr, 0, &test_result);
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: "
+24 -3
Datei anzeigen
@@ -28,10 +28,20 @@ extern "C" {
#endif // __cplusplus
#ifdef __cplusplus
// cstddef include causes issues on older GCC
// use stddef.h instead
#if __GNUC__ < 9
#include <stddef.h>
#else
#include <cstddef>
#endif // __GNUC__
#include <cstdint>
#else
#include <stddef.h>
#include <stdint.h>
#endif
#endif // __cplusplus
/** \file rdc.h
* Main header file for the ROCm RDC library.
@@ -434,6 +444,7 @@ typedef enum {
RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready
RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory
RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology
RDC_DIAG_RVS_TEST, //!< TODO: Replace with real RVS tests
RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range
RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS
} rdc_diag_test_cases_t;
@@ -972,12 +983,17 @@ rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_
* The RDC_DIAG_LVL_SHORT only takes a few seconds, and
* the RDC_DIAG_LVL_LONG may take up to 15 minutes.
*
* @param[in] config Implementation specific configuration.
*
* @param[in] config_size Length of the configuration.
*
* @param[inout] response The detail results of the tests run.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_level_t level, rdc_diag_response_t* response);
rdc_diag_level_t level, const char* config, size_t config_size,
rdc_diag_response_t* response);
/**
* @brief Run one diagnostic test case
@@ -990,12 +1006,17 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group
*
* @param[in] test_case The test case to run.
*
* @param[in] config Implementation specific configuration.
*
* @param[in] config_size Length of the configuration.
*
* @param[inout] result The results of the test.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result);
rdc_diag_test_cases_t test_case, const char* config,
size_t config_size, rdc_diag_test_result_t* result);
/**
* @brief Get a description of a provided RDC error status
@@ -25,7 +25,6 @@ THE SOFTWARE.
#include <memory>
#include "rdc/rdc.h"
#include "rdc_lib/RdcDiagnosticLibInterface.h"
namespace amd {
namespace rdc {
@@ -39,10 +38,12 @@ class RdcDiagnostic {
// Run a specific test case
virtual rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, rdc_diag_test_result_t* result) = 0;
uint32_t gpu_count, const char* config, size_t config_size,
rdc_diag_test_result_t* result) = 0;
// Run multiple test cases
virtual rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) = 0;
virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0;
@@ -37,6 +37,7 @@ rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result);
rdc_status_t rdc_diag_init(uint64_t flags);
@@ -77,9 +77,11 @@ class RdcHandler {
// Diagnostic API
virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) = 0;
virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) = 0;
// Control API
@@ -42,9 +42,11 @@ class RdcDiagnosticModule : public RdcDiagnostic {
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
@@ -81,8 +81,10 @@ class RdcEmbeddedHandler : public RdcHandler {
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
// Control API
@@ -27,9 +27,6 @@ THE SOFTWARE.
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
namespace amd {
namespace rdc {
@@ -0,0 +1,71 @@
/*
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_
#define INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_
#include <memory>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/RdcLibraryLoader.h"
namespace amd {
namespace rdc {
// RdcDiagnostic implementation whose work is delegated to C entry points held
// as function pointers next to an RdcLibraryLoader.
// NOTE(review): presumably the pointers are resolved through lib_loader_ by
// the constructor in the matching .cc file - confirm there.
class RdcRVSLib : public RdcDiagnostic {
 public:
  RdcRVSLib();
  ~RdcRVSLib() override;

  // Module lifecycle.
  rdc_status_t rdc_diag_init(uint64_t flags) override;
  rdc_status_t rdc_diag_destroy() override;

  // Report the test cases handled by this module.
  rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                         uint32_t* test_case_count) override;

  // Execute one test case on the given GPUs with an optional configuration.
  rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
                                 uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
                                 const char* config, size_t config_size,
                                 rdc_diag_test_result_t* result) override;

  // Execute a set of test cases selected by diagnostic level.
  rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
                                  const char* config, size_t config_size,
                                  rdc_diag_response_t* response) override;

 private:
  // Loader object; declared before the function pointers below.
  RdcLibraryLoader lib_loader_;

  // Entry points with the same shapes as the public methods above.
  rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
                                 const char*, size_t, rdc_diag_test_result_t*);
  rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
  rdc_status_t (*diag_init_)(uint64_t);
  rdc_status_t (*diag_destroy_)();
};
typedef std::shared_ptr<RdcRVSLib> RdcRVSLibPtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCRVSLIB_H_
@@ -56,9 +56,11 @@ class RdcRasLib : public RdcTelemetry, public RdcDiagnostic {
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
@@ -39,9 +39,11 @@ class RdcRocrLib : public RdcDiagnostic {
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
@@ -54,7 +56,7 @@ class RdcRocrLib : public RdcDiagnostic {
private:
RdcLibraryLoader lib_loader_;
rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
rdc_diag_test_result_t*);
const char*, size_t, rdc_diag_test_result_t*);
rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
rdc_status_t (*diag_init_)(uint64_t);
rdc_status_t (*diag_destroy_)();
@@ -53,9 +53,11 @@ class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic {
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
@@ -76,8 +76,10 @@ class RdcStandaloneHandler : public RdcHandler {
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
// Control RdcAPI
@@ -0,0 +1,38 @@
/*
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDC_MODULES_RDC_RVS_RVSBASE_H_
#define RDC_MODULES_RDC_RVS_RVSBASE_H_
#include <cstddef>
#include "rvs/rvs.h"
namespace amd {
namespace rdc {
// Callback that receives an RVS session id together with a read-only pointer
// to a results structure for that session.
// NOTE(review): presumably registered with RVS and invoked when a session
// reports results - confirm against the implementation file.
void session_callback(rvs_session_id_t session_id, const rvs_results_t* results);
// Takes a configuration buffer and its length and returns an RVS status code.
// NOTE(review): presumably starts an RVS session driven by `config` - confirm
// against the implementation file, including whether `config` may be null.
rvs_status_t run_rvs_app(const char* config, size_t config_size);
} // namespace rdc
} // namespace amd
#endif // RDC_MODULES_RDC_RVS_RVSBASE_H_
+13 -3
Datei anzeigen
@@ -218,12 +218,16 @@ service RdcAPI {
// rdc_status_t rdc_diagnostic_run(
// rdc_gpu_group_t group_id,
// rdc_diag_level_t level,
// const char* config,
// size_t config_size,
// rdc_diag_response_t* response);
rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse) {}
// rdc_status_t rdc_test_case_run(
// rdc_gpu_group_t group_id,
// rdc_diag_test_cases_t test_case,
// const char* config,
// size_t config_size,
// rdc_diag_test_result_t* result);
rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (DiagnosticTestCaseRunResponse) {}
}
@@ -476,6 +480,8 @@ message RemoveAllJobResponse {
message DiagnosticRunRequest {
uint32 group_id = 1;
uint32 level = 2;
string config = 3;
uint32 config_size = 4;
}
message DiagnosticDetail {
@@ -498,7 +504,8 @@ message DiagnosticTestResult {
VRAM_CHECK = 3;
SYS_MEM_CHECK = 4;
NODE_TOPOLOGY = 5;
GPU_PARAMETERS = 6;
RVS_TEST = 6;
GPU_PARAMETERS = 7;
};
DiagnosticTestCase test_case = 3;
uint32 per_gpu_result_count = 4;
@@ -525,12 +532,15 @@ message DiagnosticTestCaseRunRequest {
VRAM_CHECK = 3;
SYS_MEM_CHECK = 4;
NODE_TOPOLOGY = 5;
GPU_PARAMETERS = 6;
RVS_TEST = 6;
GPU_PARAMETERS = 7;
};
TestCaseType test_case = 2;
string config = 3;
uint32 config_size = 4;
}
message DiagnosticTestCaseRunResponse {
uint32 status = 1;
DiagnosticTestResult result = 2;
}
}
+4
Datei anzeigen
@@ -60,6 +60,7 @@ set(BOOTSTRAP_LIB "rdc_bootstrap")
set(RDC_LIB "rdc")
set(RDC_ROCR_LIB "rdc_rocr")
set(RDC_ROCP_LIB "rdc_rocp")
set(RDC_RVS_LIB "rdc_rvs")
set(RDCCLIENT_LIB "rdc_client")
################# Determine the library version #########################
@@ -96,6 +97,9 @@ add_subdirectory(rdc_modules/rdc_rocr)
# add librdc_rocp.so to RDC_LIB_MODULES
add_subdirectory(rdc_modules/rdc_rocp)
# add librdc_rvs.so to RDC_LIB_MODULES
add_subdirectory(rdc_modules/rdc_rvs)
if(BUILD_STANDALONE)
# add librdc_client.so
add_subdirectory(rdc_client)
@@ -294,23 +294,25 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
}
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_level_t level, rdc_diag_response_t* response) {
rdc_diag_level_t level, const char* config, size_t config_size,
rdc_diag_response_t* response) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_diagnostic_run(group_id, level, response);
->rdc_diagnostic_run(group_id, level, config, config_size, response);
}
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result) {
rdc_diag_test_cases_t test_case, const char* config,
size_t config_size, rdc_diag_test_result_t* result) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_test_case_run(group_id, test_case, result);
->rdc_test_case_run(group_id, test_case, config, config_size, result);
}
const char* rdc_status_string(rdc_status_t result) {
@@ -21,6 +21,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
"${SRC_DIR}/RdcRasLib.cc"
"${SRC_DIR}/RdcRocpLib.cc"
"${SRC_DIR}/RdcRocrLib.cc"
"${SRC_DIR}/RdcRVSLib.cc"
"${SRC_DIR}/RdcSmiDiagnosticImpl.cc"
"${SRC_DIR}/RdcSmiLib.cc"
"${SRC_DIR}/RdcTelemetryModule.cc"
@@ -53,6 +54,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
"${INC_DIR}/impl/RdcRasLib.h"
"${INC_DIR}/impl/RdcRocpLib.h"
"${INC_DIR}/impl/RdcRocrLib.h"
"${INC_DIR}/impl/RdcRVSLib.h"
"${INC_DIR}/impl/RdcSmiDiagnosticImpl.h"
"${INC_DIR}/impl/RdcSmiLib.h"
"${INC_DIR}/impl/RdcTelemetryModule.h"
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/impl/RdcRVSLib.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
@@ -55,7 +56,8 @@ rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query(
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
uint32_t gpu_count, const char* config,
size_t config_size,
rdc_diag_test_result_t* result) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
@@ -68,11 +70,13 @@ rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_c
strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH);
return RDC_ST_NOT_SUPPORTED;
}
return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, result);
return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, config, config_size,
result);
}
rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpus,
rdc_diag_level_t level,
rdc_diag_level_t level, const char* config,
size_t config_size,
rdc_diag_response_t* response) {
if (response == nullptr) {
return RDC_ST_BAD_PARAMETER;
@@ -87,11 +91,15 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu
rdc_runs.push_back(RDC_DIAG_SYS_MEM_CHECK);
}
if (level >= RDC_DIAG_LVL_MED) { // Medium run and above
rdc_runs.push_back(RDC_DIAG_RVS_TEST);
}
response->results_count = 0;
for (unsigned int i = 0; i < rdc_runs.size(); i++) {
response->diag_info[i].test_case = rdc_runs[i];
rdc_test_case_run(rdc_runs[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count,
&(response->diag_info[i]));
rdc_test_case_run(rdc_runs[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count, config,
config_size, &(response->diag_info[i]));
response->results_count++;
}
@@ -116,8 +124,9 @@ rdc_status_t RdcDiagnosticModule::RdcDiagnosticModule::rdc_diag_destroy() {
RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) {
const RdcSmiLibPtr smi_module = std::make_shared<RdcSmiLib>(fetcher);
const RdcRasLibPtr ras_module = std::make_shared<RdcRasLib>();
const RdcRocrLibPtr rocr_module = std::make_shared<RdcRocrLib>();
const RdcRasLibPtr ras_module = std::make_shared<RdcRasLib>();
const RdcRVSLibPtr rvs_module = std::make_shared<RdcRVSLib>();
if (smi_module) {
diagnostic_modules_.push_back(smi_module);
}
@@ -127,6 +136,9 @@ RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) {
if (ras_module) {
diagnostic_modules_.push_back(ras_module);
}
if (rvs_module) {
diagnostic_modules_.push_back(rvs_module);
}
auto ite = diagnostic_modules_.begin();
for (; ite != diagnostic_modules_.end(); ite++) {
@@ -350,7 +350,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
// Diagnostic API
rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_level_t level, const char* config,
size_t config_size,
rdc_diag_response_t* response) {
if (!response) {
return RDC_ST_BAD_PARAMETER;
@@ -362,11 +363,12 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
if (status != RDC_ST_OK) return status;
auto diag = rdc_module_mgr_->get_diagnostic_module();
return diag->rdc_diagnostic_run(rdc_group_info, level, response);
return diag->rdc_diagnostic_run(rdc_group_info, level, config, config_size, response);
}
rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) {
if (!result) {
return RDC_ST_BAD_PARAMETER;
@@ -377,8 +379,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
if (status != RDC_ST_OK) return status;
auto diag = rdc_module_mgr_->get_diagnostic_module();
return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count,
result);
return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count, config,
config_size, result);
}
// Control API
@@ -0,0 +1,138 @@
/*
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcRVSLib.h"
#include <functional>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Loads librdc_rvs.so, initialises it and then resolves the remaining entry
// points. Every function pointer starts out null; any step that fails leaves
// the not-yet-resolved pointers null so the wrappers can report the failure.
RdcRVSLib::RdcRVSLib()
    : test_case_run_(nullptr),
      diag_test_cases_query_(nullptr),
      diag_init_(nullptr),
      diag_destroy_(nullptr) {
  // Without the shared object nothing else can be resolved.
  if (lib_loader_.load("librdc_rvs.so") != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "RVS related function will not work.");
    return;
  }

  // The init entry point is mandatory: give up if it is missing.
  if (lib_loader_.load_symbol(&diag_init_, "rdc_diag_init") != RDC_ST_OK) {
    diag_init_ = nullptr;
    return;
  }

  // Initialise the module before looking up anything else.
  const rdc_status_t init_rc = diag_init_(0);
  if (init_rc != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Fail to init librdc_rvs.so:" << rdc_status_string(init_rc)
                                                     << ". RVS related function will not work.");
    return;
  }

  // The remaining symbols are resolved independently; a failed lookup only
  // resets its own pointer.
  if (lib_loader_.load_symbol(&diag_destroy_, "rdc_diag_destroy") != RDC_ST_OK) {
    diag_destroy_ = nullptr;
  }
  if (lib_loader_.load_symbol(&test_case_run_, "rdc_diag_test_case_run") != RDC_ST_OK) {
    test_case_run_ = nullptr;
  }
  if (lib_loader_.load_symbol(&diag_test_cases_query_, "rdc_diag_test_cases_query") !=
      RDC_ST_OK) {
    diag_test_cases_query_ = nullptr;
  }
}
// Invokes the module's destroy entry point when it was resolved; its return
// value is ignored.
RdcRVSLib::~RdcRVSLib() {
  if (diag_destroy_ != nullptr) {
    diag_destroy_();
  }
}
// Asks the loaded module which test cases it provides.
//
// Returns RDC_ST_BAD_PARAMETER when test_case_count is null,
// RDC_ST_FAIL_LOAD_MODULE when the query entry point was not resolved, and
// otherwise whatever the entry point returns.
rdc_status_t RdcRVSLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                                  uint32_t* test_case_count) {
  if (!test_case_count) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (diag_test_cases_query_ == nullptr) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  const rdc_status_t query_rc = diag_test_cases_query_(test_cases, test_case_count);
  RDC_LOG(RDC_DEBUG,
          "Query " << *test_case_count << " test cases from RVS: " << rdc_status_string(query_rc));
  return query_rc;
}
// Run a specific test case.
//
// Forwards the request to the test-case entry point resolved from the loaded
// module and logs the outcome at debug level.
//
// Returns RDC_ST_BAD_PARAMETER when result is null, RDC_ST_FAIL_LOAD_MODULE
// when the entry point was not resolved, and otherwise the status returned by
// the entry point. config is passed through untouched and may be null.
rdc_status_t RdcRVSLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
                                          uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
                                          uint32_t gpu_count, const char* config,
                                          size_t config_size, rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (!test_case_run_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  rdc_status_t status =
      test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result);

  // Callers may pass a null config; inserting a null const char* into an
  // output stream is undefined behaviour, so substitute a placeholder.
  // NOTE(review): a non-null config is still logged as a C string - confirm
  // callers always NUL-terminate it.
  const char* config_for_log = (config != nullptr) ? config : "(null)";
  RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from RVS: " << rdc_status_string(status)
                            << " config[" << config_size << "]: " << config_for_log);
  return status;
}
// Multi-test diagnostic runs are not provided by this class: every argument
// is ignored and RDC_ST_NOT_SUPPORTED is returned unconditionally.
rdc_status_t RdcRVSLib::rdc_diagnostic_run(const rdc_group_info_t& /*gpus*/,
                                           rdc_diag_level_t /*level*/, const char* /*config*/,
                                           size_t /*config_size*/,
                                           rdc_diag_response_t* /*response*/) {
  return RDC_ST_NOT_SUPPORTED;
}
// Calls the module's init entry point with the given flags, or reports
// RDC_ST_FAIL_LOAD_MODULE when that entry point was never resolved.
rdc_status_t RdcRVSLib::rdc_diag_init(uint64_t flags) {
  if (diag_init_ != nullptr) {
    return diag_init_(flags);
  }
  return RDC_ST_FAIL_LOAD_MODULE;
}
// Calls the module's destroy entry point, or reports RDC_ST_FAIL_LOAD_MODULE
// when that entry point was never resolved.
rdc_status_t RdcRVSLib::rdc_diag_destroy() {
  if (diag_destroy_ != nullptr) {
    return diag_destroy_();
  }
  return RDC_ST_FAIL_LOAD_MODULE;
}
} // namespace rdc
} // namespace amd
+7 -1
Datei anzeigen
@@ -151,18 +151,24 @@ rdc_status_t RdcRasLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas
// Run a specific test case
rdc_status_t RdcRasLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, rdc_diag_test_result_t* result) {
uint32_t gpu_count, const char* config,
size_t config_size, rdc_diag_test_result_t* result) {
(void)test_case;
(void)gpu_index;
(void)result;
(void)gpu_count;
(void)config;
(void)config_size;
return RDC_ST_NOT_SUPPORTED;
}
rdc_status_t RdcRasLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) {
(void)gpus;
(void)level;
(void)config;
(void)config_size;
(void)response;
return RDC_ST_NOT_SUPPORTED;
}
@@ -92,7 +92,8 @@ rdc_status_t RdcRocrLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_ca
// Run a specific test case
rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, rdc_diag_test_result_t* result) {
uint32_t gpu_count, const char* config,
size_t config_size, rdc_diag_test_result_t* result) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -100,15 +101,19 @@ rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
return RDC_ST_FAIL_LOAD_MODULE;
}
rdc_status_t status = test_case_run_(test_case, gpu_index, gpu_count, result);
rdc_status_t status =
test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result);
RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from Rocr: " << rdc_status_string(status));
return status;
}
rdc_status_t RdcRocrLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) {
(void)gpus;
(void)level;
(void)config;
(void)config_size;
(void)response;
return RDC_ST_NOT_SUPPORTED;
}
+4 -3
Datei anzeigen
@@ -188,7 +188,8 @@ rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas
// Run a specific test case
rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, rdc_diag_test_result_t* result) {
uint32_t gpu_count, const char* /*config*/,
size_t /*config_size*/, rdc_diag_test_result_t* result) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -204,8 +205,8 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
}
}
rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t,
rdc_diag_response_t*) {
rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t, const char*,
size_t, rdc_diag_response_t*) {
return RDC_ST_NOT_SUPPORTED;
}
@@ -530,7 +530,8 @@ rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
// Diagnostic API
rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_level_t level, const char* config,
size_t config_size,
rdc_diag_response_t* response) {
if (!response) {
return RDC_ST_BAD_PARAMETER;
@@ -541,6 +542,8 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
request.set_group_id(group_id);
request.set_level(level);
request.set_config(config);
request.set_config_size(config_size);
::grpc::Status status = stub_->DiagnosticRun(&context, request, &reply);
rdc_status_t err_status = error_handle(status, reply.status());
@@ -583,6 +586,7 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* to_result) {
if (!to_result) {
return RDC_ST_BAD_PARAMETER;
@@ -592,6 +596,8 @@ rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
::grpc::ClientContext context;
request.set_group_id(group_id);
request.set_config(config);
request.set_config_size(config_size);
request.set_test_case(static_cast<::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case));
::grpc::Status status = stub_->DiagnosticTestCaseRun(&context, request, &reply);
@@ -26,7 +26,6 @@ set(RDC_ROCR_LIB_INC_LIST
"${INC_DIR}/base_rocr_utils.h"
"${INC_DIR}/common.h"
"${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
"${RDC_LIB_INC_DIR}/RdcDiagnosticLibInterface.h"
"${RDC_LIB_INC_DIR}/RdcLogger.h"
"${RDC_LIB_INC_DIR}/rdc_common.h")
@@ -151,6 +151,7 @@ static rdc_status_t run_compute_queue_test(uint32_t gpu_index, rdc_diag_test_res
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* /*config*/, size_t /*config_size*/,
rdc_diag_test_result_t* result) {
if (result == nullptr || gpu_count == 0) {
return RDC_ST_BAD_PARAMETER;
@@ -178,7 +179,7 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
break;
default:
result->status = RDC_DIAG_RESULT_SKIP;
strncpy_with_null(result->info, "Not support yet", MAX_DIAG_MSG_LENGTH);
strncpy_with_null(result->info, "Not supported yet", MAX_DIAG_MSG_LENGTH);
}
}
@@ -0,0 +1,64 @@
# Build script for the RDC RVS diagnostic module (librdc_rvs.so).
# The target is only created when BUILD_RVS is enabled; the variables below are
# set unconditionally.
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Cmake RDC Lib-RVS ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")

set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(INC_DIR "${PROJECT_SOURCE_DIR}/include/rdc_modules/rdc_rvs")
set(RDC_RVS_LIB_COMPONENT "lib${RDC_RVS_LIB}")

# Sources compiled into the module.
set(RDC_RVS_LIB_SRC_LIST
    "${BOOTSTRAP_LIB_SRC_DIR}/RdcLogger.cc"
    "${SRC_DIR}/RvsBase.cc"
    "${SRC_DIR}/RdcDiagnosticLib.cc"
)

# Headers listed alongside the sources when the target is created.
set(RDC_RVS_LIB_INC_LIST
    "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
    "${RDC_LIB_INC_DIR}/RdcDiagnostic.h"
    "${RDC_LIB_INC_DIR}/rdc_common.h"
    "${RDC_LIB_INC_DIR}/RdcLogger.h"
    "${INC_DIR}/RvsBase.h"
)

if(BUILD_RVS)
    message("Build librdc_rvs.so is enabled, make sure RVS is installed.")
    message("RDC_RVS_LIB_INC_LIST=${RDC_RVS_LIB_INC_LIST}")

    # needed to find extra packages
    list(APPEND CMAKE_PREFIX_PATH ${ROCM_DIR})
    find_package(yaml-cpp REQUIRED)
    find_package(hip REQUIRED)
    find_package(hsa-runtime64 REQUIRED)
    find_package(rvs REQUIRED
        HINTS ${ROCM_DIR}/lib/cmake)

    # find_library() only understands the REQUIRED keyword from CMake 3.18 on,
    # while the project's declared minimum is 3.15. Check the result explicitly
    # so a missing rvslib stops the configure step on every supported version.
    find_library(rvslib
        NAMES rvslib)
    if(NOT rvslib)
        message(FATAL_ERROR "rvslib was not found; install RVS or configure with -DBUILD_RVS=OFF")
    endif()

    ## additional libraries
    set(COMBINED_LIBS rocblas hsakmt hsa-runtime64 hip::amdhip64 yaml-cpp)

    # Publish this module to the parent scope's module list.
    set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_RVS_LIB} PARENT_SCOPE)

    add_library(${RDC_RVS_LIB} SHARED ${RDC_RVS_LIB_SRC_LIST} ${RDC_RVS_LIB_INC_LIST})
    target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvslib} pthread dl ${COMBINED_LIBS})
    target_include_directories(${RDC_RVS_LIB} PRIVATE
        "${PROJECT_SOURCE_DIR}"
        "${PROJECT_SOURCE_DIR}/include"
        "${COMMON_DIR}"
        "${RSMI_INC_DIR}"
        "${ROCM_DIR}/include"
        "${ROCM_DIR}/include/hsa"
        "${ROCM_VALIDATION_SUITE_INCLUDE_DIR}")

    # Set the VERSION and SOVERSION values
    set_property(TARGET ${RDC_RVS_LIB} PROPERTY
        SOVERSION "${VERSION_MAJOR}")
    set_property(TARGET ${RDC_RVS_LIB} PROPERTY
        VERSION "${SO_VERSION_STRING}")

    # If the library is a release, strip the target library
    if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
        add_custom_command(
            TARGET ${RDC_RVS_LIB}
            POST_BUILD COMMAND ${CMAKE_STRIP} ${RDC_RVS_LIB_COMPONENT}.so)
    endif()
endif()
@@ -0,0 +1,94 @@
/*
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <string.h>
#include <memory>
#include <stdexcept>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rvs/RvsBase.h"
// TODO: Replace with client-side feedback channel
// Trace helper: emits one RDC_DEBUG log line tagged "!HELLO_FROM_RVS!" that
// carries the file and line of the call site.
#define RVS_LOG() RDC_LOG(RDC_DEBUG, "!HELLO_FROM_RVS! " << __FILE__ << ":" << __LINE__)
// Module initialisation hook. The single uint64_t argument is unnamed and
// ignored; the function only logs a trace line and reports success.
rdc_status_t rdc_diag_init(uint64_t) {
RVS_LOG();
return RDC_ST_OK;
}
// Module teardown hook. Releases nothing here: it only logs a trace line and
// reports success.
rdc_status_t rdc_diag_destroy() {
RVS_LOG();
return RDC_ST_OK;
}
// Report which diagnostic test cases this module implements.
//
// test_cases      [out] receives the supported test cases; only the first
//                 slot is written.
// test_case_count [out] receives the number of entries written (always 1).
//
// Returns RDC_ST_BAD_PARAMETER when either output pointer is null, otherwise
// RDC_ST_OK.
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                       uint32_t* test_case_count) {
  RVS_LOG();

  // An array parameter is really a pointer, so it can be null just like the
  // count; reject both before writing through either of them.
  if (test_cases == nullptr || test_case_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  test_cases[0] = RDC_DIAG_RVS_TEST;
  *test_case_count = 1;
  return RDC_ST_OK;
}
// Run one diagnostic test case through RVS.
//
// test_case   must be RDC_DIAG_RVS_TEST, the only case this module implements.
// gpu_index   currently ignored (see TODO below).
// gpu_count   must be non-zero.
// config      optional RVS configuration text, forwarded to run_rvs_app.
// config_size size of config, forwarded to run_rvs_app.
// result      [out] filled with the test case, a PASS/FAIL status and a short
//             info message.
//
// Returns RDC_ST_BAD_PARAMETER for a null result, a zero gpu_count or an
// unsupported test case. Otherwise returns RDC_ST_OK, including when the RVS
// run itself fails; that failure is reported through result->status.
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
                                    // TODO: use gpu_index
                                    uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
                                    const char* config, size_t config_size,
                                    rdc_diag_test_result_t* result) {
  RVS_LOG();
  (void)gpu_index;  // not consumed yet; silences the unused-parameter warning

  if (result == nullptr || gpu_count == 0) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Only one test case exists, so anything else is rejected up front. This
  // makes a later "unsupported test" branch unnecessary.
  if (test_case != RDC_DIAG_RVS_TEST) {
    return RDC_ST_BAD_PARAMETER;
  }

  // init the return data
  *result = {};
  result->test_case = test_case;
  result->per_gpu_result_count = 0;

  // Run first, then describe the outcome, so the info text never claims a
  // clean finish for a run that failed.
  const rvs_status_t rvs_status = amd::rdc::run_rvs_app(config, config_size);
  if (rvs_status == RVS_STATUS_SUCCESS) {
    result->status = RDC_DIAG_RESULT_PASS;
    strncpy_with_null(result->info, "Finished running RDC_DIAG_RVS_TEST!", MAX_DIAG_MSG_LENGTH);
  } else {
    result->status = RDC_DIAG_RESULT_FAIL;
    strncpy_with_null(result->info, "RDC_DIAG_RVS_TEST failed, see the RDC log for details",
                      MAX_DIAG_MSG_LENGTH);
  }

  return RDC_ST_OK;
}
@@ -0,0 +1,118 @@
/*
Copyright (c) 2023 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rvs/RvsBase.h"
#include <string.h>
#include <chrono>
#include <thread>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rvs/rvs.h"
// TODO: Make generic test
// TODO: Allow for user to override defaults with a custom string
// Size in bytes of the fixed configuration buffers used by run_rvs_app.
static constexpr size_t MAX_CONFIG_LENGTH = 1024;
// Most recent session state published by session_callback. run_rvs_app resets
// it to IDLE before starting a session and polls it until it reads COMPLETED.
// NOTE(review): volatile does not synchronise threads; if RVS invokes the
// callback from another thread this presumably needs std::atomic - confirm.
volatile rvs_session_state_t state = RVS_SESSION_STATE_IDLE;
// Run a single RVS session and block until RVS reports it as completed.
//
// config      optional configuration text. When null (or config_size is 0)
//             the built-in gst_config is used instead.
// config_size number of bytes available at config. It must be smaller than
//             MAX_CONFIG_LENGTH so the copy plus its terminator fits in the
//             local buffer.
//
// Returns RVS_STATUS_INVALID_ARGUMENT for an oversized config, the failing
// status of whichever RVS call went wrong, or the status of the final
// rvs_session_destroy call.
rvs_status_t amd::rdc::run_rvs_app(const char* config, const size_t config_size) {
  // How long to sleep between polls of the shared session state.
  constexpr auto kPollInterval = std::chrono::milliseconds(100);

  char active_config[MAX_CONFIG_LENGTH];
  rvs_session_property_t session_property = {RVS_SESSION_TYPE_DEFAULT_CONF, {{RVS_MODULE_GST}}};
  rvs_session_id_t session_id;
  rvs_status_t status;

  // Memory-test configuration kept for reference; nothing selects it yet.
  const char mem_config[MAX_CONFIG_LENGTH] =
      "{actions: [{name: action_1, device: all, module: mem, parallel: true, "
      "count: 1, wait: 100, mapped_memory: false, mem_blocks: 128, "
      "num_passes: 500, thrds_per_blk: 64, stress: true, num_iter: 50000, "
      "exclude: '5 6 7 8 9 10 11'}]}";
  (void)mem_config;

  const char gst_config[MAX_CONFIG_LENGTH] =
      "{actions: [{name: gpustress-9000-sgemm-false, device: all, "
      "device_index: '0', module: gst, parallel: false, count: 1, duration: "
      "10000, copy_matrix: false, target_stress: 9000, matrix_size_a: 8640, "
      "matrix_size_b: 8640, matrix_size_c: 8640, ops_type: sgemm, lda: 8640, "
      "ldb: 8640, ldc: 8640}]}";

  if ((config == nullptr) || (config_size == 0)) {
    RDC_LOG(RDC_INFO, "given config is NULL! Using predefined gst_config");
    strncpy_with_null(active_config, gst_config, MAX_CONFIG_LENGTH);
  } else if (config_size >= MAX_CONFIG_LENGTH) {
    // One byte of the buffer is reserved for the terminating NUL.
    RDC_LOG(RDC_ERROR, "given config size is too large! Expected at most "
                           << (MAX_CONFIG_LENGTH - 1) << ", got " << config_size << " instead.");
    return RVS_STATUS_INVALID_ARGUMENT;
  } else {
    RDC_LOG(RDC_DEBUG, "given config is correct");
    // config_size describes the SOURCE, whereas strncpy_with_null is called
    // with the destination capacity everywhere else, so copy explicitly:
    // take at most config_size bytes, stop at an embedded NUL, and terminate.
    const size_t config_length = strnlen(config, config_size);
    memcpy(active_config, config, config_length);
    active_config[config_length] = '\0';
  }

  status = rvs_initialize();
  if (status == RVS_STATUS_FAILED) {
    RDC_LOG(RDC_ERROR, "rvs initialization failed");
    return status;
  }

  /*******************************/
  state = RVS_SESSION_STATE_IDLE;

  /* Using custom gst configuration in string format */
  status = rvs_session_create(&session_id, amd::rdc::session_callback);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session create failed with status: " << status);
    return status;
  }

  session_property.type = RVS_SESSION_TYPE_CUSTOM_ACTION;
  session_property.custom_action.config = active_config;
  status = rvs_session_set_property(session_id, &session_property);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session set property failed with status: " << status);
    rvs_session_destroy(session_id);
    return status;
  }

  status = rvs_session_execute(session_id);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session execute failed with status: " << status);
    rvs_session_destroy(session_id);
    return status;
  }

  // session_callback publishes progress through `state`. Sleep between polls
  // rather than spinning so a long-running test does not pin a CPU core.
  while (state != RVS_SESSION_STATE_COMPLETED) {
    std::this_thread::sleep_for(kPollInterval);
  }

  status = rvs_session_destroy(session_id);
  if (status != RVS_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR, "RVS session destroy failed with status: " << status);
  }
  return status;
}
// Callback registered with rvs_session_create. Prints the reported state,
// status and output log to stdout, then stores the reported state in the
// file-level `state` variable that run_rvs_app polls.
//
// session_id id of the session the results belong to.
// results    results reported by RVS; a null pointer is reported and ignored.
void amd::rdc::session_callback(rvs_session_id_t session_id, const rvs_results_t* results) {
  // NOTE: This is a placeholder!
  // TODO: Use GRPC to send message back to client periodically
  if (results == nullptr) {
    // Nothing to read; leave the shared state unchanged.
    printf("session id -> %d delivered no results\n", session_id);
    return;
  }

  printf(
      "/*******************************************************************/"
      "\n");
  printf("session id -> %d state -> %d\n", session_id, results->state);
  printf("session id -> %d status -> %d\n", session_id, results->status);
  // Passing a null pointer to %s is undefined behaviour, so substitute an
  // empty string when no log text was provided.
  printf("session id -> %d output -> %s\n", session_id,
         results->output_log != nullptr ? results->output_log : "");
  printf(
      "/*******************************************************************/"
      "\n");

  state = results->state;
  printf("state -> %d\n", state);
}
@@ -51,6 +51,7 @@ class RdciSubSystem {
std::string ip_port_;
bool use_auth_;
std::string config_test_;
std::string root_ca_;
std::string client_cert_;
std::string client_key_;
+12 -4
Datei anzeigen
@@ -54,6 +54,7 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
{"help", optional_argument, nullptr, 'h'},
{"unauth", optional_argument, nullptr, 'u'},
{"config-test", optional_argument, nullptr, 'c'},
{"run-level", required_argument, nullptr, 'r'},
{"group-id", required_argument, nullptr, 'g'},
{nullptr, 0, nullptr, 0}};
@@ -62,7 +63,7 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
int option_index = 0;
int opt = 0;
while ((opt = getopt_long(argc, argv, "hug:r:", long_options, &option_index)) != -1) {
while ((opt = getopt_long(argc, argv, "hug:r:c:", long_options, &option_index)) != -1) {
switch (opt) {
case HOST_OPTIONS:
ip_port_ = optarg;
@@ -73,6 +74,11 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
case 'u':
use_auth_ = false;
break;
case 'c':
config_test_ = optarg;
printf("config_test_ = %s\n", config_test_.c_str());
printf("config_test_.length = %zu\n", config_test_.length());
break;
case 'g':
if (!IsNumber(optarg)) {
show_help();
@@ -93,7 +99,6 @@ void RdciDiagSubSystem::parse_cmd_opts(int argc, char** argv) {
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
}
}
if (!group_id_set) {
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id");
@@ -108,12 +113,13 @@ void RdciDiagSubSystem::show_help() const {
// ***** 20 Chars ****
std::cout << " diag -- Used to run diagnostic for GPUs.\n\n";
std::cout << "Usage\n";
std::cout << " rdci diag [--host <IP/FQDN>:port] [-u] -g <groupId>"
std::cout << " rdci diag [--host <IP/FQDN>:port] [-u] [-t] -g <groupId>"
<< " -r <runLevel>\n";
std::cout << "\nFlags:\n";
show_common_usage();
std::cout << " -g --group-id The GPU group to diagnose"
<< " on the specified host.\n";
std::cout << " -c --config-test Set custom test config (RVS)\n";
std::cout << " -r --run-level level Integer representing test"
<< " run levels [default = 1].\n"
<< " level 1: Tests take a "
@@ -130,6 +136,7 @@ std::string RdciDiagSubSystem::get_test_name(rdc_diag_test_cases_t test_case) co
{RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
{RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
{RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
{RDC_DIAG_RVS_TEST, "Pre-defined config RVS check"},
{RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"},
{RDC_DIAG_TEST_LAST, "Unknown"}};
@@ -148,7 +155,8 @@ void RdciDiagSubSystem::process() {
rdc_status_t result;
rdc_diag_response_t response;
result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, &response);
result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, config_test_.c_str(),
config_test_.length(), &response);
if (result != RDC_ST_OK) {
std::string error_msg = rdc_status_string(result);
+8 -6
Datei anzeigen
@@ -556,9 +556,10 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
}
rdc_diag_response_t diag_response;
rdc_status_t result =
rdc_diagnostic_run(rdc_handle_, request->group_id(),
static_cast<rdc_diag_level_t>(request->level()), &diag_response);
rdc_status_t result = rdc_diagnostic_run(
rdc_handle_, request->group_id(), static_cast<rdc_diag_level_t>(request->level()),
const_cast<char*>(request->config().c_str()), static_cast<size_t>(request->config().length()),
&diag_response);
reply->set_status(result);
if (result != RDC_ST_OK) {
@@ -607,9 +608,10 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
}
rdc_diag_test_result_t test_result;
rdc_status_t result =
rdc_test_case_run(rdc_handle_, request->group_id(),
static_cast<rdc_diag_test_cases_t>(request->test_case()), &test_result);
rdc_status_t result = rdc_test_case_run(
rdc_handle_, request->group_id(), static_cast<rdc_diag_test_cases_t>(request->test_case()),
const_cast<char*>(request->config().c_str()), static_cast<size_t>(request->config().length()),
&test_result);
reply->set_status(result);
if (result != RDC_ST_OK) {