The Diagnostic API interface
The API interface defines how the caller will use the API. An
example also shows how the API can be used.
It also defines the RdcDiagnostic module, which can load the
library dynamically and then dispatch the diagnostic tests to run.
Change-Id: I1e041aab86f7e19338860f5ba65262977f4ea9cb
[ROCm/rdc commit: eab3625d65]
Этот коммит содержится в:
@@ -86,6 +86,16 @@ add_executable(${FIELDVALUE_EXAMPLE_EXE} "${FIELDVALUE_EXAMPLE_SRC_LIST}")
|
||||
|
||||
target_link_libraries(${FIELDVALUE_EXAMPLE_EXE} pthread dl rdc_bootstrap)
|
||||
|
||||
# Diagnostic example: a single-source executable named "diagnostic".
set(DIAGNOSTIC_EXAMPLE_EXE "diagnostic")
set(DIAGNOSTIC_EXAMPLE_SRC_LIST "${SRC_DIR}/diagnostic_example.cc")
message("DIAGNOSTIC_EXAMPLE_SRC_LIST=${DIAGNOSTIC_EXAMPLE_SRC_LIST}")

# The link directory has to be registered before the target is created.
link_directories(${LIB_BOOSTRAP_DIR})

add_executable(${DIAGNOSTIC_EXAMPLE_EXE} "${DIAGNOSTIC_EXAMPLE_SRC_LIST}")
target_link_libraries(${DIAGNOSTIC_EXAMPLE_EXE} pthread dl rdc_bootstrap)
|
||||
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
message(" Finished Cmake Example ")
|
||||
|
||||
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
/**
 * @brief Map a diagnostic test case to the label printed by this example.
 *
 * @param[in] test_case The test case identifier reported in a result.
 *
 * @return The human readable label, or "Unknown Test" when the test case
 * has no entry in the table.
 */
static std::string get_test_name(rdc_diag_test_cases_t test_case) {
  // Built once and shared by every call instead of being rebuilt each time.
  // RDC_DIAG_TEST_LAST is intentionally not listed: it has the same value as
  // RDC_DIAG_GPU_PARAMETERS, so a separate entry would be a duplicate key
  // that std::map silently discards.
  static const std::map<rdc_diag_test_cases_t, std::string> test_desc = {
    {RDC_DIAG_COMPUTE_PROCESS, "No compute process"},
    {RDC_DIAG_SDMA_QUEUE, "SDMA Queue ready"},
    {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
    {RDC_DIAG_VRAM_CHECK, "VRAM check"},
    {RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
    {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
    {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}
  };

  const auto test_name = test_desc.find(test_case);
  if (test_name == test_desc.end()) {
    return "Unknown Test";
  }
  return test_name->second;
}
|
||||
|
||||
int main(int, char **) {
|
||||
rdc_status_t result;
|
||||
rdc_handle_t rdc_handle;
|
||||
bool standalone = false;
|
||||
char hostIpAddress[] = {"127.0.0.1:50051"};
|
||||
char group_name[] = {"diag_group"};
|
||||
|
||||
// Select the embedded mode and standalone mode dynamically.
|
||||
std::cout << "Start rdci in: \n";
|
||||
std::cout << "0 - Embedded mode \n";
|
||||
std::cout << "1 - Standalone mode \n";
|
||||
while (!(std::cin >> standalone)) {
|
||||
std::cout << "Invalid input.\n";
|
||||
std::cin.clear();
|
||||
std::cin.ignore();
|
||||
}
|
||||
std::cout << std::endl;
|
||||
std::cout << (standalone?
|
||||
"Standalone mode selected.\n":"Embedded mode selected.\n");
|
||||
|
||||
// Init the rdc
|
||||
result = rdc_init(0);
|
||||
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error initializing RDC. Return: " <<
|
||||
rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
} else {
|
||||
std::cout << "RDC Initialized.\n";
|
||||
}
|
||||
|
||||
if (standalone) { // standalone
|
||||
result = rdc_connect(hostIpAddress, &rdc_handle,
|
||||
nullptr, nullptr, nullptr);
|
||||
if ( result != RDC_ST_OK ) {
|
||||
std::cout << "Error connecting to remote rdcd. Return: "
|
||||
<< rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
} else { // embedded
|
||||
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error starting embedded RDC engine. Return: "
|
||||
<< rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
// Now we can use the same API for both standalone and embedded
|
||||
// (1) create group for all GPUs
|
||||
rdc_gpu_group_t group_id;
|
||||
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_DEFAULT,
|
||||
group_name, &group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error creating group. Return: "
|
||||
<< rdc_status_string(result);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// (2) start to run short diagnostic.
|
||||
rdc_diag_response_t response;
|
||||
result = rdc_diagnostic_run(rdc_handle, group_id,
|
||||
RDC_DIAG_LVL_SHORT, &response);
|
||||
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. Return: "
|
||||
<< rdc_status_string(result);
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// (3) Check diagnostic results
|
||||
for (uint32_t i=0 ; i < response.results_count; i++) {
|
||||
const rdc_diag_test_result_t& test_result =
|
||||
response.diag_info[i];
|
||||
std::cout << std::setw(22) << std::left
|
||||
<< get_test_name(test_result.test_case) + ":"
|
||||
<< rdc_diagnostic_result_string(test_result.status) << "\n";
|
||||
}
|
||||
|
||||
// (4) diagnostic detail information
|
||||
std::cout <<" =============== Diagnostic Details ==================\n";
|
||||
for (uint32_t i=0 ; i < response.results_count; i++) {
|
||||
const rdc_diag_test_result_t& test_result =
|
||||
response.diag_info[i];
|
||||
if (test_result.info[0] != '\0') {
|
||||
std::cout << std::setw(22) << std::left
|
||||
<< get_test_name(test_result.test_case) + ":"
|
||||
<< test_result.info << "\n";
|
||||
}
|
||||
for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) {
|
||||
const rdc_diag_per_gpu_result_t& gpu_result
|
||||
= test_result.gpu_results[j];
|
||||
if (strlen(gpu_result.gpu_result.msg) > 0) {
|
||||
std::cout << " GPU " << gpu_result.gpu_index
|
||||
<< " " << gpu_result.gpu_result.msg << "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
// Cleanup consists of shutting down RDC.
|
||||
cleanup:
|
||||
std::cout << "Cleaning up.\n";
|
||||
if (standalone)
|
||||
rdc_disconnect(rdc_handle);
|
||||
else
|
||||
rdc_stop_embedded(rdc_handle);
|
||||
rdc_shutdown();
|
||||
return result;
|
||||
}
|
||||
@@ -394,6 +394,91 @@ typedef struct {
|
||||
uint64_t stop_time; //!< job stop time
|
||||
} rdc_job_group_info_t;
|
||||
|
||||
/**
|
||||
* @brief type of diagnostic level
|
||||
*/
|
||||
typedef enum {
  RDC_DIAG_LVL_INVALID = 0,  //!< Not a valid level
  RDC_DIAG_LVL_SHORT = 1,    //!< Runs for a few seconds
  RDC_DIAG_LVL_MED = 2,      //!< Runs for less than 2 minutes
  RDC_DIAG_LVL_LONG = 3      //!< Runs for up to 15 minutes
} rdc_diag_level_t;
|
||||
|
||||
/**
|
||||
* @brief type of diagnostic result
|
||||
*/
|
||||
typedef enum {
  RDC_DIAG_RESULT_PASS = 0,  //!< The diagnostic test passed
  RDC_DIAG_RESULT_SKIP = 1,  //!< The diagnostic test was skipped
  RDC_DIAG_RESULT_WARN = 2,  //!< The diagnostic test reported warnings
  RDC_DIAG_RESULT_FAIL = 3   //!< The diagnostic test failed
} rdc_diag_result_t;
|
||||
|
||||
/**
|
||||
* @brief The test cases to run
|
||||
*/
|
||||
typedef enum {
  //! Marker for the first test case; equal to the first real test case
  RDC_DIAG_TEST_FIRST = 0,
  //! The "no compute process" check
  RDC_DIAG_COMPUTE_PROCESS = RDC_DIAG_TEST_FIRST,
  RDC_DIAG_SDMA_QUEUE,      //!< The SDMA Queue is ready
  RDC_DIAG_COMPUTE_QUEUE,   //!< The Compute Queue is ready
  RDC_DIAG_VRAM_CHECK,      //!< Check VRAM
  RDC_DIAG_SYS_MEM_CHECK,   //!< Check System memory
  RDC_DIAG_NODE_TOPOLOGY,   //!< Report node topology
  RDC_DIAG_GPU_PARAMETERS,  //!< GPU parameters in range
  //! Marker for the last test case; equal to the last real test case
  RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS
} rdc_diag_test_cases_t;
|
||||
|
||||
/**
|
||||
* @brief The maximum test cases to run
|
||||
*/
|
||||
// A macro rather than a `const` object: in C a const-qualified variable is
// not an integer constant expression, so it could not size the arrays
// declared with it, and defining an object in a header would create one
// definition per C translation unit. The cast keeps the uint32_t type.
#define MAX_TEST_CASES \
  ((uint32_t)(RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1))  // NOLINT
|
||||
|
||||
/**
|
||||
* @brief The maximum length of the diagnostic messages
|
||||
*/
|
||||
#define MAX_DIAG_MSG_LENGTH 4096
|
||||
|
||||
/**
|
||||
* @brief details of the diagnostic errors
|
||||
*/
|
||||
typedef struct {
|
||||
char msg[MAX_DIAG_MSG_LENGTH]; //!< The test result details
|
||||
uint32_t code; //!< The low level error code
|
||||
} rdc_diag_detail_t;
|
||||
|
||||
/**
|
||||
* @brief details of the per gpu diagnostic results
|
||||
*/
|
||||
typedef struct {
|
||||
uint32_t gpu_index; //!< The GPU index
|
||||
rdc_diag_detail_t gpu_result; //!< The detail results
|
||||
} rdc_diag_per_gpu_result_t;
|
||||
|
||||
/**
|
||||
* @brief The diagnostic results for all GPUs
|
||||
*/
|
||||
typedef struct {
|
||||
rdc_diag_result_t status; //!< The diagnostic result
|
||||
rdc_diag_detail_t details; //!< The summary details
|
||||
rdc_diag_test_cases_t test_case; //!< The test case to run
|
||||
|
||||
uint32_t per_gpu_result_count; //!< How many gpu_results
|
||||
//!< Result details
|
||||
rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES];
|
||||
|
||||
char info[MAX_DIAG_MSG_LENGTH]; //!< Detail information
|
||||
} rdc_diag_test_result_t;
|
||||
|
||||
/**
|
||||
* @brief The diagnostic responses for test cases
|
||||
*/
|
||||
typedef struct {
|
||||
uint32_t results_count;
|
||||
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
|
||||
} rdc_diag_response_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize ROCm RDC.
|
||||
*
|
||||
@@ -872,6 +957,29 @@ rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle,
|
||||
rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle,
|
||||
rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id);
|
||||
|
||||
/**
|
||||
* @brief Run the diagnostic test cases
|
||||
*
|
||||
* @details Run the diagnostic test cases at different levels.
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[in] level The level decides how long the test will run.
|
||||
* The RDC_DIAG_LVL_SHORT only take a few seconds, and the
|
||||
* RDC_DIAG_LVL_LONG may take up to 15 minutes.
|
||||
*
|
||||
* @param[in,out] response The detailed results of the tests run.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_diagnostic_run(
|
||||
rdc_handle_t p_rdc_handle,
|
||||
rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response);
|
||||
|
||||
/**
|
||||
* @brief Get a description of a provided RDC error status
|
||||
*
|
||||
@@ -905,6 +1013,17 @@ const char* field_id_string(rdc_field_t field_id);
|
||||
*/
|
||||
rdc_field_t get_field_id_from_name(const char* name);
|
||||
|
||||
/**
|
||||
* @brief Get a description of a diagnostic result.
|
||||
*
|
||||
* @details return the string in human readable format.
|
||||
*
|
||||
* @param[in] result The RDC diagnostic result.
|
||||
*
|
||||
* @return The string describing the RDC diagnostic result.
|
||||
*/
|
||||
const char* rdc_diagnostic_result_string(rdc_diag_result_t result);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_
|
||||
#define INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_
|
||||
|
||||
#include <memory>
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcDiagnosticLibInterface.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcDiagnostic {
|
||||
public:
|
||||
// get support test cases
|
||||
virtual rdc_status_t rdc_diag_test_cases_query(
|
||||
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count) = 0;
|
||||
|
||||
// Run a specific test case
|
||||
virtual rdc_status_t rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) = 0;
|
||||
|
||||
// Run multiple test cases
|
||||
virtual rdc_status_t rdc_diagnostic_run(
|
||||
const rdc_group_info_t& gpus,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0;
|
||||
virtual rdc_status_t rdc_diag_destroy() = 0;
|
||||
|
||||
virtual ~RdcDiagnostic() {}
|
||||
};
|
||||
typedef std::shared_ptr<RdcDiagnostic> RdcDiagnosticPtr;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_
|
||||
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_
|
||||
#define INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_
|
||||
|
||||
// The diagnostic interface for libraries, for example, RAS.
|
||||
#include <rdc/rdc.h>
|
||||
|
||||
|
||||
extern "C" {
|
||||
|
||||
// The library will implement below function
|
||||
|
||||
// Which test cases are supported in the library
|
||||
rdc_status_t rdc_diag_test_cases_query(
|
||||
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count);
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result);
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags);
|
||||
|
||||
rdc_status_t rdc_diag_destroy();
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_
|
||||
@@ -80,6 +80,12 @@ class RdcHandler {
|
||||
virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
rdc_field_grp_t field_group_id) = 0;
|
||||
|
||||
// Diagnostic API
|
||||
virtual rdc_status_t rdc_diagnostic_run(
|
||||
rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response) = 0;
|
||||
|
||||
// Control API
|
||||
virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0;
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/RdcDiagnostic.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -33,6 +34,7 @@ namespace rdc {
|
||||
class RdcModuleMgr {
|
||||
public:
|
||||
virtual RdcTelemetryPtr get_telemetry_module() = 0;
|
||||
virtual RdcDiagnosticPtr get_diagnostic_module() = 0;
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<RdcModuleMgr> RdcModuleMgrPtr;
|
||||
|
||||
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_
|
||||
|
||||
#include <map>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "rdc_lib/RdcDiagnostic.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcDiagnosticModule : public RdcDiagnostic {
|
||||
public:
|
||||
rdc_status_t rdc_diag_test_cases_query(
|
||||
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count) override;
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(
|
||||
const rdc_group_info_t& gpus,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
rdc_status_t rdc_diag_destroy() override;
|
||||
|
||||
explicit RdcDiagnosticModule(const RdcSmiLibPtr& smi_lib,
|
||||
const RdcRasLibPtr& ras_module);
|
||||
|
||||
private:
|
||||
//< Helper function to dispatch fields to module
|
||||
void get_fields_for_module(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
std::map<RdcDiagnosticPtr, std::vector<rdc_gpu_field_t>>
|
||||
& fields_in_module,
|
||||
std::vector<rdc_gpu_field_value_t>& unsupport_fields); // NOLINT
|
||||
std::list<RdcDiagnosticPtr> diagnostic_modules_;
|
||||
std::map<rdc_diag_test_cases_t, RdcDiagnosticPtr> testcases_to_module_;
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<RdcDiagnosticModule> RdcDiagnosticModulePtr;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_
|
||||
@@ -86,6 +86,11 @@ class RdcEmbeddedHandler: public RdcHandler {
|
||||
uint64_t *next_since_time_stamp, rdc_field_value* value) override;
|
||||
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
rdc_field_grp_t field_group_id) override;
|
||||
// Diagnostic API
|
||||
rdc_status_t rdc_diagnostic_run(
|
||||
rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response) override;
|
||||
// Control API
|
||||
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -34,14 +35,16 @@ namespace rdc {
|
||||
class RdcModuleMgrImpl: public RdcModuleMgr {
|
||||
public:
|
||||
RdcTelemetryPtr get_telemetry_module() override;
|
||||
RdcDiagnosticPtr get_diagnostic_module() override;
|
||||
explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher);
|
||||
private:
|
||||
// Function module
|
||||
RdcTelemetryPtr rdc_telemetry_module_;
|
||||
RdcDiagnosticPtr rdc_diagnostic_module_;
|
||||
|
||||
// Domain module
|
||||
RdcRasLibPtr ras_lib_;
|
||||
|
||||
RdcSmiLibPtr smi_lib_;
|
||||
RdcMetricFetcherPtr fetcher_;
|
||||
};
|
||||
|
||||
|
||||
@@ -30,11 +30,12 @@ THE SOFTWARE.
|
||||
#include <string>
|
||||
#include "rdc_lib/RdcLibraryLoader.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/RdcDiagnostic.h"
|
||||
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
class RdcRasLib: public RdcTelemetry {
|
||||
class RdcRasLib: public RdcTelemetry, public RdcDiagnostic {
|
||||
public:
|
||||
// get support field ids
|
||||
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
|
||||
@@ -51,6 +52,25 @@ class RdcRasLib: public RdcTelemetry {
|
||||
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) override;
|
||||
|
||||
rdc_status_t rdc_diag_test_cases_query(
|
||||
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count) override;
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(
|
||||
const rdc_group_info_t& gpus,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
rdc_status_t rdc_diag_destroy() override;
|
||||
|
||||
explicit RdcRasLib(const char* lib_name);
|
||||
|
||||
~RdcRasLib();
|
||||
|
||||
@@ -26,11 +26,12 @@ THE SOFTWARE.
|
||||
#include <memory>
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/RdcDiagnostic.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcSmiLib : public RdcTelemetry {
|
||||
class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic {
|
||||
public:
|
||||
// get support field ids
|
||||
rdc_status_t rdc_telemetry_fields_query(
|
||||
@@ -46,12 +47,33 @@ class RdcSmiLib : public RdcTelemetry {
|
||||
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) override;
|
||||
|
||||
rdc_status_t rdc_diag_test_cases_query(
|
||||
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
|
||||
uint32_t* test_case_count) override;
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) override;
|
||||
|
||||
rdc_status_t rdc_diagnostic_run(
|
||||
const rdc_group_info_t& gpus,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
rdc_status_t rdc_diag_init(uint64_t flags) override;
|
||||
rdc_status_t rdc_diag_destroy() override;
|
||||
|
||||
explicit RdcSmiLib(const RdcMetricFetcherPtr& mf);
|
||||
|
||||
private:
|
||||
RdcMetricFetcherPtr metric_fetcher_;
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<RdcSmiLib> RdcSmiLibPtr;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -80,6 +80,11 @@ class RdcStandaloneHandler: public RdcHandler {
|
||||
uint64_t *next_since_time_stamp, rdc_field_value* value) override;
|
||||
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
rdc_field_grp_t field_group_id) override;
|
||||
// Diagnostic API
|
||||
rdc_status_t rdc_diagnostic_run(
|
||||
rdc_gpu_group_t group_id,
|
||||
rdc_diag_level_t level,
|
||||
rdc_diag_response_t* response) override;
|
||||
|
||||
// Control RdcAPI
|
||||
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
|
||||
|
||||
@@ -28,6 +28,7 @@ THE SOFTWARE.
|
||||
#include <memory>
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
|
||||
namespace amd {
|
||||
@@ -48,7 +49,7 @@ class RdcTelemetryModule : public RdcTelemetry {
|
||||
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count);
|
||||
|
||||
RdcTelemetryModule(const RdcMetricFetcherPtr& fetcher,
|
||||
RdcTelemetryModule(const RdcSmiLibPtr& smi_lib,
|
||||
const RdcRasLibPtr& ras_module);
|
||||
|
||||
private:
|
||||
|
||||
@@ -135,6 +135,7 @@ add_library(${BOOTSTRAP_LIB} SHARED ${BOOTSTRAP_LIB_SRC_LIST} ${BOOTSTRAP_LIB_IN
|
||||
target_link_libraries(${BOOTSTRAP_LIB} pthread dl)
|
||||
target_include_directories(${BOOTSTRAP_LIB} PRIVATE
|
||||
"${RSMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include"
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${COMMON_DIR}"
|
||||
@@ -159,6 +160,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcWatchTableImpl.c
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcRasLib.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiLib.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcTelemetryModule.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcDiagnosticModule.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc")
|
||||
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc")
|
||||
@@ -182,6 +184,9 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcMod
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcModuleMgr.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcTelemetry.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcTelemetryModule.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnostic.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnosticLibInterface.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcDiagnosticModule.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotification.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h")
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h")
|
||||
@@ -190,7 +195,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${COMMON_DIR}/rdc_capabilities.h")
|
||||
|
||||
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
|
||||
|
||||
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64")
|
||||
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64" "${ROCM_DIR}/lib")
|
||||
add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST})
|
||||
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap)
|
||||
target_include_directories(${RDC_LIB} PRIVATE
|
||||
|
||||
@@ -322,6 +322,18 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
|
||||
rdc_group_field_destroy(rdc_field_group_id);
|
||||
}
|
||||
|
||||
// C API entry point: forwards the diagnostic request to the handler object
// behind the opaque handle. A null handle yields RDC_ST_INVALID_HANDLER.
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle,
                rdc_gpu_group_t group_id,
                rdc_diag_level_t level,
                rdc_diag_response_t* response) {
    if (p_rdc_handle) {
        auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
        return handler->rdc_diagnostic_run(group_id, level, response);
    }

    return RDC_ST_INVALID_HANDLER;
}
|
||||
|
||||
const char* rdc_status_string(rdc_status_t result) {
|
||||
switch (result) {
|
||||
case RDC_ST_OK:
|
||||
@@ -361,6 +373,21 @@ const char* rdc_status_string(rdc_status_t result) {
|
||||
}
|
||||
}
|
||||
|
||||
// Translate a diagnostic result into a short human readable word.
// Values outside the known results map to "Unknown".
const char* rdc_diagnostic_result_string(rdc_diag_result_t result) {
    if (result == RDC_DIAG_RESULT_PASS) {
        return "Pass";
    }
    if (result == RDC_DIAG_RESULT_SKIP) {
        return "Skip";
    }
    if (result == RDC_DIAG_RESULT_WARN) {
        return "Warn";
    }
    if (result == RDC_DIAG_RESULT_FAIL) {
        return "Fail";
    }
    return "Unknown";
}
|
||||
|
||||
const char* field_id_string(rdc_field_t field_id) {
|
||||
amd::rdc::fld_id2name_map_t &field_id_to_descript =
|
||||
amd::rdc::get_field_id_description_from_id();
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcDiagnosticModule.h"
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Collects the test cases reported by every registered diagnostic module.
//
// test_cases       Caller-provided array with room for MAX_TEST_CASES entries.
// test_case_count  Receives the number of entries written to test_cases.
//
// Returns RDC_ST_BAD_PARAMETER when either pointer is null and RDC_ST_OK
// otherwise. A module whose query does not return RDC_ST_OK contributes no
// entries. At most MAX_TEST_CASES entries are written in total; anything
// beyond that is dropped.
rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query(
    rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
    uint32_t* test_case_count) {
  if (test_cases == nullptr || test_case_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const uint32_t capacity = static_cast<uint32_t>(MAX_TEST_CASES);
  *test_case_count = 0;

  for (const auto& diag_module : diagnostic_modules_) {
    // A module may fill a whole MAX_TEST_CASES array, so it gets its own
    // staging buffer instead of a pointer into the middle of the caller's
    // array, which could be written past its end.
    rdc_diag_test_cases_t module_cases[MAX_TEST_CASES];
    uint32_t module_count = 0;
    const rdc_status_t status =
        diag_module->rdc_diag_test_cases_query(module_cases, &module_count);
    if (status != RDC_ST_OK) {
      continue;
    }

    // Never trust the reported count beyond the staging buffer's size.
    if (module_count > capacity) {
      module_count = capacity;
    }

    for (uint32_t i = 0; i < module_count && *test_case_count < capacity;
         ++i) {
      test_cases[*test_case_count] = module_cases[i];
      ++(*test_case_count);
    }
  }

  return RDC_ST_OK;
}
|
||||
|
||||
// Runs one diagnostic test case by forwarding it to the module registered
// for it in testcases_to_module_.
//
// Returns RDC_ST_BAD_PARAMETER when result is null. When no module is
// registered for test_case, the result is marked RDC_DIAG_RESULT_SKIP with
// the message "Not implemented" and RDC_ST_NOT_SUPPORTED is returned.
// Otherwise the owning module's status is returned unchanged.
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(
    rdc_diag_test_cases_t test_case,
    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
    uint32_t gpu_count,
    rdc_diag_test_result_t* result) {
  if (!result) {
    return RDC_ST_BAD_PARAMETER;
  }

  const auto owner = testcases_to_module_.find(test_case);
  if (owner != testcases_to_module_.end()) {
    return owner->second->rdc_test_case_run(test_case, gpu_index, gpu_count,
                                            result);
  }

  // No module claimed this test case: report it as skipped.
  result->status = RDC_DIAG_RESULT_SKIP;
  strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH);
  return RDC_ST_NOT_SUPPORTED;
}
|
||||
|
||||
// Runs the set of test cases selected by the requested diagnostic level on
// the GPUs of a group and stores one result per test case in the response.
//
// gpus      Group whose entity_ids/count are passed to every test case.
// level     RDC_DIAG_LVL_SHORT and above selects the seven cases listed
//           below; lower levels select nothing.
// response  Receives the results; results_count is the number of entries.
//
// Returns RDC_ST_BAD_PARAMETER when response is null, RDC_ST_OK otherwise.
// The status of the individual test cases is not propagated.
rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(
    const rdc_group_info_t& gpus,
    rdc_diag_level_t level,
    rdc_diag_response_t* response) {
  if (!response) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::vector<rdc_diag_test_cases_t> selected;
  if (level >= RDC_DIAG_LVL_SHORT) {  // Short run and above
    selected = {
        RDC_DIAG_COMPUTE_PROCESS,
        RDC_DIAG_NODE_TOPOLOGY,
        RDC_DIAG_GPU_PARAMETERS,
        RDC_DIAG_COMPUTE_QUEUE,
        RDC_DIAG_SDMA_QUEUE,
        RDC_DIAG_VRAM_CHECK,
        RDC_DIAG_SYS_MEM_CHECK,
    };
  }

  // rdc_test_case_run takes a non-const index array, hence the cast.
  uint32_t* const gpu_ids = const_cast<uint32_t*>(gpus.entity_ids);

  response->results_count = 0;
  for (size_t slot = 0; slot < selected.size(); ++slot) {
    auto& entry = response->diag_info[slot];
    entry.test_case = selected[slot];
    rdc_test_case_run(selected[slot], gpu_ids, gpus.count, &entry);
    response->results_count++;
  }

  return RDC_ST_OK;
}
|
||||
|
||||
// Forwards the init request, with the same flag, to every registered
// diagnostic module. The status returned by each module is not inspected;
// this call always reports RDC_ST_OK.
rdc_status_t RdcDiagnosticModule::rdc_diag_init(uint64_t flag) {
  for (const auto& diag_module : diagnostic_modules_) {
    diag_module->rdc_diag_init(flag);
  }
  return RDC_ST_OK;
}
|
||||
|
||||
// Forwards the destroy request to every registered diagnostic module. The
// status returned by each module is not inspected; this call always reports
// RDC_ST_OK.
rdc_status_t RdcDiagnosticModule::rdc_diag_destroy() {
  for (const auto& diag_module : diagnostic_modules_) {
    diag_module->rdc_diag_destroy();
  }
  return RDC_ST_OK;
}
|
||||
|
||||
// Registers the non-null libraries as diagnostic modules and records, for
// every test case a module reports, which module provides it.
//
// Modules are queried in registration order (SMI first, then RAS). Because
// insert() keeps an existing entry, the first module to report a test case
// stays its provider. A module whose query does not return RDC_ST_OK
// registers no test cases.
RdcDiagnosticModule::RdcDiagnosticModule(const RdcSmiLibPtr& smi_lib,
                                         const RdcRasLibPtr& ras_module) {
  if (smi_lib) {
    diagnostic_modules_.push_back(smi_lib);
  }
  if (ras_module) {
    diagnostic_modules_.push_back(ras_module);
  }

  for (const auto& provider : diagnostic_modules_) {
    rdc_diag_test_cases_t supported[MAX_TEST_CASES];
    uint32_t supported_count = 0;
    if (provider->rdc_diag_test_cases_query(supported, &supported_count) !=
        RDC_ST_OK) {
      continue;
    }
    for (uint32_t k = 0; k < supported_count; ++k) {
      testcases_to_module_.insert({supported[k], provider});
    }
  }
}
|
||||
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -381,6 +381,22 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
return watch_table_->rdc_field_unwatch(group_id, field_group_id);
|
||||
}
|
||||
|
||||
// Diagnostic API
|
||||
// Runs diagnostics at the given level on every GPU of a group.
//
// The group is resolved with rdc_group_gpu_get_info(); if that fails, its
// status is returned and nothing is run. Otherwise the call is delegated to
// the diagnostic module obtained from the module manager, and that module's
// status is returned.
rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(
    rdc_gpu_group_t group_id,
    rdc_diag_level_t level,
    rdc_diag_response_t* response) {
  // Get GPU group information
  rdc_group_info_t group_info;
  const rdc_status_t lookup = rdc_group_gpu_get_info(group_id, &group_info);
  if (lookup != RDC_ST_OK) {
    return lookup;
  }

  return rdc_module_mgr_->get_diagnostic_module()->rdc_diagnostic_run(
      group_info, level, response);
}
|
||||
|
||||
// Control API
|
||||
rdc_status_t RdcEmbeddedHandler::rdc_field_update_all(
|
||||
uint32_t wait_for_update) {
|
||||
|
||||
@@ -21,13 +21,15 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
|
||||
#include "rdc_lib/impl/RdcTelemetryModule.h"
|
||||
#include "rdc_lib/impl/RdcDiagnosticModule.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Creates the module manager and builds the SMI library wrapper from the
// given metric fetcher.
RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher):
    smi_lib_(std::make_shared<RdcSmiLib>(fetcher)) {
  // smi_lib_ always needs to be loaded, so it is created right here.
}
|
||||
|
||||
|
||||
@@ -42,12 +44,30 @@ RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
|
||||
}
|
||||
|
||||
if (!rdc_telemetry_module_) {
|
||||
rdc_telemetry_module_.reset(new RdcTelemetryModule(fetcher_, ras_lib_));
|
||||
rdc_telemetry_module_.reset(new RdcTelemetryModule(smi_lib_, ras_lib_));
|
||||
}
|
||||
|
||||
return rdc_telemetry_module_;
|
||||
}
|
||||
|
||||
// Returns the diagnostic module, creating it on first use.
//
// On creation, ras_lib_ is first loaded from "librdc_ras.so" if it has not
// been created yet; the diagnostic module is then built from smi_lib_ and
// ras_lib_. Later calls return the same module.
RdcDiagnosticPtr RdcModuleMgrImpl::get_diagnostic_module() {
  if (!rdc_diagnostic_module_) {
    // Delay load
    if (!ras_lib_) {
      ras_lib_.reset(new RdcRasLib("librdc_ras.so"));
    }
    rdc_diagnostic_module_.reset(
        new RdcDiagnosticModule(smi_lib_, ras_lib_));
  }
  return rdc_diagnostic_module_;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -149,6 +149,46 @@ rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
// Test-case query for the RAS library wrapper. Both arguments are ignored
// and RDC_ST_NOT_SUPPORTED is returned unconditionally.
rdc_status_t RdcRasLib::rdc_diag_test_cases_query(
    rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
    uint32_t* test_case_count) {
  static_cast<void>(test_cases);
  static_cast<void>(test_case_count);
  return RDC_ST_NOT_SUPPORTED;
}
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t RdcRasLib::rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
(void)test_case;
|
||||
(void)gpu_index;
|
||||
(void)result;
|
||||
(void)gpu_count;
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// Group-level diagnostic run for the RAS library wrapper. All arguments are
// ignored and RDC_ST_NOT_SUPPORTED is returned unconditionally.
rdc_status_t RdcRasLib::rdc_diagnostic_run(
    const rdc_group_info_t& gpus,
    rdc_diag_level_t level,
    rdc_diag_response_t* response) {
  static_cast<void>(gpus);
  static_cast<void>(level);
  static_cast<void>(response);
  return RDC_ST_NOT_SUPPORTED;
}
|
||||
|
||||
// Diagnostic initialization for the RAS library wrapper. The flags are
// ignored and RDC_ST_NOT_SUPPORTED is returned unconditionally.
rdc_status_t RdcRasLib::rdc_diag_init(uint64_t flags) {
  static_cast<void>(flags);
  return RDC_ST_NOT_SUPPORTED;
}
|
||||
// Diagnostic teardown for the RAS library wrapper. Does nothing and returns
// RDC_ST_NOT_SUPPORTED unconditionally.
rdc_status_t RdcRasLib::rdc_diag_destroy() {
  const rdc_status_t status = RDC_ST_NOT_SUPPORTED;
  return status;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -24,6 +24,8 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
|
||||
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
@@ -161,6 +163,43 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
// Test-case query for the SMI library wrapper.
//
// Returns RDC_ST_BAD_PARAMETER when test_case_count is null and
// RDC_ST_NOT_SUPPORTED otherwise. Neither test_cases nor *test_case_count
// is written.
rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(
    rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
    uint32_t* test_case_count) {
  return (test_case_count == nullptr) ? RDC_ST_BAD_PARAMETER
                                      : RDC_ST_NOT_SUPPORTED;
}
|
||||
|
||||
// Run a specific test case
|
||||
rdc_status_t RdcSmiLib::rdc_test_case_run(
|
||||
rdc_diag_test_cases_t test_case,
|
||||
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// Group-level diagnostic run for the SMI library wrapper. The (unnamed)
// arguments are ignored and RDC_ST_NOT_SUPPORTED is returned
// unconditionally.
rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&,
                                           rdc_diag_level_t,
                                           rdc_diag_response_t*) {
  const rdc_status_t status = RDC_ST_NOT_SUPPORTED;
  return status;
}
|
||||
|
||||
// Diagnostic initialization for the SMI library wrapper. The flags argument
// is ignored, nothing is done, and RDC_ST_OK is returned.
rdc_status_t RdcSmiLib::rdc_diag_init(uint64_t) {
  const rdc_status_t status = RDC_ST_OK;
  return status;
}
|
||||
|
||||
// Diagnostic teardown for the SMI library wrapper. Nothing is done and
// RDC_ST_OK is returned.
rdc_status_t RdcSmiLib::rdc_diag_destroy() {
  const rdc_status_t status = RDC_ST_OK;
  return status;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -93,10 +93,9 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(
|
||||
}
|
||||
|
||||
RdcTelemetryModule::RdcTelemetryModule(
|
||||
const RdcMetricFetcherPtr& fetcher,
|
||||
const RdcSmiLibPtr& smi_lib,
|
||||
const RdcRasLibPtr& ras_module) {
|
||||
auto smi_telemetry_module = std::make_shared<RdcSmiLib>(fetcher);
|
||||
telemetry_modules_.push_back(smi_telemetry_module);
|
||||
telemetry_modules_.push_back(smi_lib);
|
||||
if (ras_module) {
|
||||
telemetry_modules_.push_back(ras_module);
|
||||
}
|
||||
|
||||
@@ -554,6 +554,17 @@ rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
return error_handle(status, reply.status());
|
||||
}
|
||||
|
||||
// Diagnostic API
|
||||
// Diagnostic run for the standalone handler. All arguments are ignored and
// RDC_ST_NOT_SUPPORTED is returned unconditionally.
rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(
    rdc_gpu_group_t group_id,
    rdc_diag_level_t level,
    rdc_diag_response_t* response) {
  static_cast<void>(group_id);
  static_cast<void>(level);
  static_cast<void>(response);
  return RDC_ST_NOT_SUPPORTED;
}
|
||||
|
||||
|
||||
// Control RdcAPI
|
||||
rdc_status_t RdcStandaloneHandler::rdc_field_update_all(
|
||||
|
||||
Ссылка в новой задаче
Block a user