Change-Id: Ia3d6ac80f4832f1bfb63573c543659abd5f84341
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: 9c77312c51]
Этот коммит содержится в:
Galantsev, Dmitrii
2024-10-08 19:17:35 -05:00
коммит произвёл Galantsev, Dmitrii
родитель b0035605ee
Коммит 73c79fcd83
24 изменённых файлов: 284 добавлений и 131 удалений
+1 -1
Просмотреть файл
@@ -137,7 +137,7 @@ int main(int, char**) {
std::cout << " ============== Run individual diagnostic test ===========\n";
rdc_diag_test_result_t test_result;
result =
rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, nullptr, 0, &test_result);
rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, nullptr, 0, &test_result, nullptr);
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: "
+13 -2
Просмотреть файл
@@ -556,6 +556,12 @@ typedef struct {
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
} rdc_diag_response_t;
// Signature of a diagnostic log callback. The call sites in this change invoke
// it as callback(cookie, message), where message points to the character data
// of a std::string (a NUL-terminated C string).
typedef void (*rdc_callback_t)(void*, void*);
/**
 * @brief Callback and opaque cookie passed to the diagnostic run APIs.
 *
 * In the visible call sites the callback is only invoked when the struct
 * pointer, the callback and the cookie are all non-null.
 */
typedef struct {
rdc_callback_t callback; //!< Callback sends logs for running diagnostics
void* cookie; //!< Cookie is used to identify different callbacks and supply them with data
} rdc_diag_callback_t;
/**
* @brief The policy type to support
*/
@@ -1087,11 +1093,13 @@ rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_
*
* @param[inout] response The detail results of the tests run.
*
* @param[in] callback Optional callback that receives log messages while the diagnostics run; may be nullptr
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_level_t level, const char* config, size_t config_size,
rdc_diag_response_t* response);
rdc_diag_response_t* response, rdc_diag_callback_t* callback);
/**
* @brief Run one diagnostic test case
@@ -1110,11 +1118,14 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group
*
* @param[inout] result The results of the test.
*
* @param[in] callback Optional callback that receives log messages while the test case runs; may be nullptr
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case, const char* config,
size_t config_size, rdc_diag_test_result_t* result);
size_t config_size, rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback);
/**
* @brief Get a description of a provided RDC error status
+2 -2
Просмотреть файл
@@ -39,12 +39,12 @@ class RdcDiagnostic {
virtual rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, const char* config, size_t config_size,
rdc_diag_test_result_t* result) = 0;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) = 0;
// Run multiple test cases
virtual rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) = 0;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) = 0;
virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0;
virtual rdc_status_t rdc_diag_destroy() = 0;
+1 -1
Просмотреть файл
@@ -38,7 +38,7 @@ rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result);
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback);
rdc_status_t rdc_diag_init(uint64_t flags);
+2 -2
Просмотреть файл
@@ -80,11 +80,11 @@ class RdcHandler {
// Diagnostic API
virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) = 0;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) = 0;
virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) = 0;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) = 0;
// Control API
virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0;
+3 -2
Просмотреть файл
@@ -27,6 +27,7 @@ THE SOFTWARE.
#include <memory>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
@@ -42,11 +43,11 @@ class RdcDiagnosticModule : public RdcDiagnostic {
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
+2 -2
Просмотреть файл
@@ -84,10 +84,10 @@ class RdcEmbeddedHandler final : public RdcHandler {
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
// Control API
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
+4 -3
Просмотреть файл
@@ -40,11 +40,11 @@ class RdcRVSLib : public RdcDiagnostic {
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
@@ -55,7 +55,8 @@ class RdcRVSLib : public RdcDiagnostic {
private:
RdcLibraryLoader lib_loader_;
rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
const char* config, size_t config_size, rdc_diag_test_result_t*);
const char* config, size_t config_size, rdc_diag_test_result_t*,
rdc_diag_callback_t*);
rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
rdc_status_t (*diag_init_)(uint64_t);
rdc_status_t (*diag_destroy_)();
+3 -3
Просмотреть файл
@@ -40,11 +40,11 @@ class RdcRocrLib : public RdcDiagnostic {
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
@@ -56,7 +56,7 @@ class RdcRocrLib : public RdcDiagnostic {
private:
RdcLibraryLoader lib_loader_;
rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
const char*, size_t, rdc_diag_test_result_t*);
const char*, size_t, rdc_diag_test_result_t*, rdc_diag_callback_t*);
rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
rdc_status_t (*diag_init_)(uint64_t);
rdc_status_t (*diag_destroy_)();
+2 -2
Просмотреть файл
@@ -53,11 +53,11 @@ class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic {
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
+3 -2
Просмотреть файл
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include <thread>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc.h"
#include "rdc_lib/RdcHandler.h"
namespace amd {
@@ -80,10 +81,10 @@ class RdcStandaloneHandler : public RdcHandler {
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) override;
rdc_diag_response_t* response, rdc_diag_callback_t* callback) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) override;
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override;
// Control RdcAPI
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
+8 -4
Просмотреть файл
@@ -141,16 +141,18 @@ service RdcAPI {
// rdc_diag_level_t level,
// const char* config,
// size_t config_size,
// rdc_diag_response_t* response);
rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse) {}
// rdc_diag_response_t* response,
// rdc_diag_callback_t* callback);
rpc DiagnosticRun(DiagnosticRunRequest) returns (stream DiagnosticRunResponse) {}
// rdc_status_t rdc_test_case_run(
// rdc_gpu_group_t group_id,
// rdc_diag_test_cases_t test_case,
// const char* config,
// size_t config_size,
// rdc_diag_test_result_t* result);
rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (DiagnosticTestCaseRunResponse) {}
// rdc_diag_test_result_t* result,
// rdc_diag_callback_t* callback);
rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (stream DiagnosticTestCaseRunResponse) {}
//Just an RPC method not used as an API
rpc GetMixedComponentVersion(GetMixedComponentVersionRequest) returns (GetMixedComponentVersionResponse) {}
@@ -488,6 +490,7 @@ message DiagnosticResponse {
// One message of the DiagnosticRun response stream. The server sends
// log-only messages while the run is in progress and one final message
// carrying the status and the results.
message DiagnosticRunResponse {
uint32 status = 1; // rdc_status_t returned by rdc_diagnostic_run on the server
DiagnosticResponse response = 2; // test results; set by the server in the final message
optional string log = 3; // log line produced by the diagnostic callback
}
message DiagnosticTestCaseRunRequest {
@@ -510,6 +513,7 @@ message DiagnosticTestCaseRunRequest {
// One message of the DiagnosticTestCaseRun response stream. The server sends
// log-only messages while the test is in progress and one final message
// carrying the status and the result.
message DiagnosticTestCaseRunResponse {
uint32 status = 1; // rdc_status_t returned by rdc_test_case_run on the server
DiagnosticTestResult result = 2; // test result; set by the server in the final message
optional string log = 3; // log line produced by the diagnostic callback
}
message GetMixedComponentVersionRequest {
+5 -5
Просмотреть файл
@@ -305,24 +305,24 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
// Public C entry point for a diagnostic run. Returns RDC_ST_INVALID_HANDLER
// when p_rdc_handle is null; otherwise treats the handle as an
// amd::rdc::RdcHandler and forwards every argument (including the callback)
// to its rdc_diagnostic_run().
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_level_t level, const char* config, size_t config_size,
rdc_diag_response_t* response) {
rdc_diag_response_t* response, rdc_diag_callback_t* callback) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_diagnostic_run(group_id, level, config, config_size, response);
->rdc_diagnostic_run(group_id, level, config, config_size, response, callback);
}
// Public C entry point for running a single diagnostic test case. Returns
// RDC_ST_INVALID_HANDLER when p_rdc_handle is null; otherwise treats the
// handle as an amd::rdc::RdcHandler and forwards every argument (including
// the callback) to its rdc_test_case_run().
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case, const char* config,
size_t config_size, rdc_diag_test_result_t* result) {
size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
if (!p_rdc_handle) {
return RDC_ST_INVALID_HANDLER;
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_test_case_run(group_id, test_case, config, config_size, result);
->rdc_test_case_run(group_id, test_case, config, config_size, result, callback);
}
rdc_status_t get_mixed_component_version(rdc_handle_t p_rdc_handle, mixed_component_t component, mixed_component_version_t* p_mixed_compv) {
@@ -454,4 +454,4 @@ rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t gr
}
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)
->rdc_policy_unregister(group_id);
}
}
+26 -9
Просмотреть файл
@@ -25,6 +25,7 @@ THE SOFTWARE.
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
@@ -48,11 +49,9 @@ rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query(
return RDC_ST_OK;
}
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, const char* config,
size_t config_size,
rdc_diag_test_result_t* result) {
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(
rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -64,14 +63,17 @@ rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_c
strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH);
return RDC_ST_NOT_SUPPORTED;
}
return ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, config, config_size,
result);
rdc_status_t status = ite->second->rdc_test_case_run(test_case, gpu_index, gpu_count, config,
config_size, result, callback);
return status;
}
rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpus,
rdc_diag_level_t level, const char* config,
size_t config_size,
rdc_diag_response_t* response) {
rdc_diag_response_t* response,
rdc_diag_callback_t* callback) {
if (response == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -89,14 +91,29 @@ rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpu
rdc_runs.push_back(RDC_DIAG_RVS_TEST);
}
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
std::string log = "DiagnosticRun started";
callback->callback(callback->cookie, log.data());
}
response->results_count = 0;
for (unsigned int i = 0; i < rdc_runs.size(); i++) {
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
  // Report 1-based progress so the final test reads "Test N / N" instead of
  // "Test N-1 / N".
  std::string log = "Test " + std::to_string(i + 1) + " / " + std::to_string(rdc_runs.size());
  callback->callback(callback->cookie, log.data());
}
response->diag_info[i].test_case = rdc_runs[i];
// NOTE: rdc_test_case_run reuses the diagnostic_run callback
rdc_test_case_run(rdc_runs[i], const_cast<uint32_t*>(gpus.entity_ids), gpus.count, config,
config_size, &(response->diag_info[i]));
config_size, &(response->diag_info[i]), callback);
response->results_count++;
}
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
std::string log = "DiagnosticRun finished";
callback->callback(callback->cookie, log.data());
}
return RDC_ST_OK;
}
+5 -4
Просмотреть файл
@@ -25,6 +25,7 @@ THE SOFTWARE.
#include "amd_smi/amdsmi.h"
#include "common/rdc_fields_supported.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcNotification.h"
@@ -380,7 +381,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_diag_level_t level, const char* config,
size_t config_size,
rdc_diag_response_t* response) {
rdc_diag_response_t* response, rdc_diag_callback_t* callback) {
if (!response) {
return RDC_ST_BAD_PARAMETER;
}
@@ -391,13 +392,13 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
if (status != RDC_ST_OK) return status;
auto diag = rdc_module_mgr_->get_diagnostic_module();
return diag->rdc_diagnostic_run(rdc_group_info, level, config, config_size, response);
return diag->rdc_diagnostic_run(rdc_group_info, level, config, config_size, response, callback);
}
rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) {
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
if (!result) {
return RDC_ST_BAD_PARAMETER;
}
@@ -408,7 +409,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
auto diag = rdc_module_mgr_->get_diagnostic_module();
return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, rdc_group_info.count, config,
config_size, result);
config_size, result, callback);
}
// Control API
+5 -3
Просмотреть файл
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcRVSLib.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
namespace amd {
@@ -90,7 +91,7 @@ rdc_status_t RdcRVSLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas
rdc_status_t RdcRVSLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, const char* config,
size_t config_size, rdc_diag_test_result_t* result) {
size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -99,7 +100,7 @@ rdc_status_t RdcRVSLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
}
rdc_status_t status =
test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result);
test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result, callback);
RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from RVS: " << rdc_status_string(status)
<< " config[" << config_size << "]: " << config);
return status;
@@ -107,12 +108,13 @@ rdc_status_t RdcRVSLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
// The RVS backend does not implement the multi-test diagnostic run: every
// argument, including the callback, is ignored and RDC_ST_NOT_SUPPORTED is
// returned.
rdc_status_t RdcRVSLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) {
rdc_diag_response_t* response, rdc_diag_callback_t* callback) {
// The (void) casts mark each parameter as intentionally unused.
(void)gpus;
(void)level;
(void)config;
(void)config_size;
(void)response;
(void)callback;
return RDC_ST_NOT_SUPPORTED;
}
+5 -3
Просмотреть файл
@@ -23,6 +23,7 @@ THE SOFTWARE.
#include <functional>
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
@@ -94,7 +95,7 @@ rdc_status_t RdcRocrLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_ca
rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, const char* config,
size_t config_size, rdc_diag_test_result_t* result) {
size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -103,19 +104,20 @@ rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
}
rdc_status_t status =
test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result);
test_case_run_(test_case, gpu_index, gpu_count, config, config_size, result, callback);
RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from Rocr: " << rdc_status_string(status));
return status;
}
// The ROCr backend does not implement the multi-test diagnostic run: every
// argument, including the callback, is ignored and RDC_ST_NOT_SUPPORTED is
// returned.
rdc_status_t RdcRocrLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
const char* config, size_t config_size,
rdc_diag_response_t* response) {
rdc_diag_response_t* response, rdc_diag_callback_t* callback) {
// The (void) casts mark each parameter as intentionally unused.
(void)gpus;
(void)level;
(void)config;
(void)config_size;
(void)response;
(void)callback;
return RDC_ST_NOT_SUPPORTED;
}
+4 -2
Просмотреть файл
@@ -24,6 +24,7 @@ THE SOFTWARE.
#include <stdlib.h>
#include <strings.h>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
namespace amd {
@@ -203,7 +204,8 @@ rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cas
rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, const char* /*config*/,
size_t /*config_size*/, rdc_diag_test_result_t* result) {
size_t /*config_size*/, rdc_diag_test_result_t* result,
rdc_diag_callback_t* callback) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -220,7 +222,7 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
}
// The SMI backend does not implement the multi-test diagnostic run: all
// parameters are left unnamed and RDC_ST_NOT_SUPPORTED is returned.
rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t&, rdc_diag_level_t, const char*,
size_t, rdc_diag_response_t*) {
size_t, rdc_diag_response_t*, rdc_diag_callback_t*) {
return RDC_ST_NOT_SUPPORTED;
}
+93 -54
Просмотреть файл
@@ -26,6 +26,9 @@ THE SOFTWARE.
#include <future>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc.pb.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
amd::rdc::RdcHandler* make_handler(const char* ip_and_port, const char* root_ca,
const char* client_cert, const char* client_key) {
@@ -553,7 +556,8 @@ rdc_status_t RdcStandaloneHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_diag_level_t level, const char* config,
size_t config_size,
rdc_diag_response_t* response) {
rdc_diag_response_t* response,
rdc_diag_callback_t* /*callback*/) {
if (!response) {
return RDC_ST_BAD_PARAMETER;
}
@@ -566,40 +570,62 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
request.set_config(config);
request.set_config_size(config_size);
::grpc::Status status = stub_->DiagnosticRun(&context, request, &reply);
rdc_status_t err_status = error_handle(status, reply.status());
if (err_status != RDC_ST_OK) return err_status;
auto res = reply.response();
response->results_count = res.results_count();
auto reader = stub_->DiagnosticRun(&context, request);
// for the duration of the DiagnosticRun (multiple tests) - we're stuck in this loop
//
// there are 2 optional reply fields:
// * log - reports messages back during the diagnostic run
// * response - delivered when the diagnostic run completes
while (reader->Read(&reply)) {
if (reply.has_log()) {
// TODO: Remove cout?
RDC_LOG(RDC_INFO, "LOG: " << reply.log());
std::cout << "LOG: " << reply.log() << std::endl;
continue;
}
if (reply.has_response()) {
RDC_LOG(RDC_DEBUG, "HAS RESPONSE!");
auto res = reply.response();
response->results_count = res.results_count();
if (res.diag_info_size() > static_cast<int>(MAX_TEST_CASES)) {
return RDC_ST_BAD_PARAMETER;
if (res.diag_info_size() > static_cast<int>(MAX_TEST_CASES)) {
return RDC_ST_BAD_PARAMETER;
}
for (int i = 0; i < res.diag_info_size(); i++) {
const ::rdc::DiagnosticTestResult& result = res.diag_info(i);
rdc_diag_test_result_t& to_result = response->diag_info[i];
to_result.status = static_cast<rdc_diag_result_t>(result.status());
// Set details
to_result.details.code = result.details().code();
strncpy_with_null(to_result.details.msg, result.details().msg().c_str(),
MAX_DIAG_MSG_LENGTH);
to_result.test_case = static_cast<rdc_diag_test_cases_t>(result.test_case());
to_result.per_gpu_result_count = result.per_gpu_result_count();
// Set Result details
if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) {
return RDC_ST_BAD_PARAMETER;
}
for (int j = 0; j < result.gpu_results_size(); j++) {
auto per_gpu_result = result.gpu_results(j);
rdc_diag_per_gpu_result_t& to_per_gpu = to_result.gpu_results[j];
to_per_gpu.gpu_index = per_gpu_result.gpu_index();
to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code();
strncpy_with_null(to_per_gpu.gpu_result.msg, per_gpu_result.gpu_result().msg().c_str(),
MAX_DIAG_MSG_LENGTH);
}
strncpy_with_null(to_result.info, result.info().c_str(), MAX_DIAG_MSG_LENGTH);
}
}
}
for (int i = 0; i < res.diag_info_size(); i++) {
const ::rdc::DiagnosticTestResult& result = res.diag_info(i);
rdc_diag_test_result_t& to_result = response->diag_info[i];
to_result.status = static_cast<rdc_diag_result_t>(result.status());
// Set details
to_result.details.code = result.details().code();
strncpy_with_null(to_result.details.msg, result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH);
to_result.test_case = static_cast<rdc_diag_test_cases_t>(result.test_case());
to_result.per_gpu_result_count = result.per_gpu_result_count();
// Set Result details
if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) {
return RDC_ST_BAD_PARAMETER;
}
for (int j = 0; j < result.gpu_results_size(); j++) {
auto per_gpu_result = result.gpu_results(j);
rdc_diag_per_gpu_result_t& to_per_gpu = to_result.gpu_results[j];
to_per_gpu.gpu_index = per_gpu_result.gpu_index();
to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code();
strncpy_with_null(to_per_gpu.gpu_result.msg, per_gpu_result.gpu_result().msg().c_str(),
MAX_DIAG_MSG_LENGTH);
}
strncpy_with_null(to_result.info, result.info().c_str(), MAX_DIAG_MSG_LENGTH);
// Close the response stream and collect the final gRPC status.
// NOTE(review): a failed status is only logged here and the enclosing function
// still returns RDC_ST_OK afterwards; the status field of the reply is not
// inspected in the visible streaming code either. Confirm whether transport
// and server errors should be propagated to the caller.
auto status = reader->Finish();
if (status.ok()) {
RDC_LOG(RDC_DEBUG, "reader status: success!");
} else {
RDC_LOG(RDC_ERROR, "reader status: failure!");
}
return RDC_ST_OK;
@@ -608,7 +634,8 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
const char* config, size_t config_size,
rdc_diag_test_result_t* to_result) {
rdc_diag_test_result_t* to_result,
rdc_diag_callback_t* /*callback*/) {
if (!to_result) {
return RDC_ST_BAD_PARAMETER;
}
@@ -621,33 +648,45 @@ rdc_status_t RdcStandaloneHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
request.set_config_size(config_size);
request.set_test_case(static_cast<::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case));
::grpc::Status status = stub_->DiagnosticTestCaseRun(&context, request, &reply);
rdc_status_t err_status = error_handle(status, reply.status());
if (err_status != RDC_ST_OK) return err_status;
auto result = reply.result();
auto reader = stub_->DiagnosticTestCaseRun(&context, request);
while (reader->Read(&reply)) {
// The server streams log-only messages (the optional `log` field) before the
// final result. Handle them as logs, as rdc_diagnostic_run does, instead of
// reporting each one as a missing result.
if (reply.has_log()) {
  RDC_LOG(RDC_INFO, "LOG: " << reply.log());
  continue;
}
// Anything else without a result carries nothing to copy into to_result.
if (!reply.has_result()) {
  RDC_LOG(RDC_ERROR, "NO TEST_RUN RESULT!");
  continue;
}
to_result->status = static_cast<rdc_diag_result_t>(result.status());
auto result = reply.result();
// Set details
to_result->details.code = result.details().code();
strncpy_with_null(to_result->details.msg, result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH);
to_result->status = static_cast<rdc_diag_result_t>(result.status());
to_result->test_case = static_cast<rdc_diag_test_cases_t>(result.test_case());
to_result->per_gpu_result_count = result.per_gpu_result_count();
// Set details
to_result->details.code = result.details().code();
strncpy_with_null(to_result->details.msg, result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH);
// Set Result details
if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) {
return RDC_ST_BAD_PARAMETER;
to_result->test_case = static_cast<rdc_diag_test_cases_t>(result.test_case());
to_result->per_gpu_result_count = result.per_gpu_result_count();
// Set Result details
if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) {
return RDC_ST_BAD_PARAMETER;
}
for (int j = 0; j < result.gpu_results_size(); j++) {
auto per_gpu_result = result.gpu_results(j);
rdc_diag_per_gpu_result_t& to_per_gpu = to_result->gpu_results[j];
to_per_gpu.gpu_index = per_gpu_result.gpu_index();
to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code();
strncpy_with_null(to_per_gpu.gpu_result.msg, per_gpu_result.gpu_result().msg().c_str(),
MAX_DIAG_MSG_LENGTH);
}
strncpy_with_null(to_result->info, result.info().c_str(), MAX_DIAG_MSG_LENGTH);
}
for (int j = 0; j < result.gpu_results_size(); j++) {
auto per_gpu_result = result.gpu_results(j);
rdc_diag_per_gpu_result_t& to_per_gpu = to_result->gpu_results[j];
to_per_gpu.gpu_index = per_gpu_result.gpu_index();
to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code();
strncpy_with_null(to_per_gpu.gpu_result.msg, per_gpu_result.gpu_result().msg().c_str(),
MAX_DIAG_MSG_LENGTH);
// Close the response stream and collect the final gRPC status.
// NOTE(review): a failed status is only logged here and the enclosing function
// still returns RDC_ST_OK afterwards. Confirm whether transport and server
// errors should be propagated to the caller.
auto status = reader->Finish();
if (status.ok()) {
RDC_LOG(RDC_DEBUG, "reader status: success!");
} else {
RDC_LOG(RDC_ERROR, "reader status: failure!");
}
strncpy_with_null(to_result->info, result.info().c_str(), MAX_DIAG_MSG_LENGTH);
return RDC_ST_OK;
}
+6 -5
Просмотреть файл
@@ -20,10 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <string.h>
#include <memory>
#include <stdexcept>
#include <string>
#include "rdc_lib/RdcDiagnosticLibInterface.h"
#include "rdc_lib/rdc_common.h"
@@ -152,7 +149,7 @@ static rdc_status_t run_compute_queue_test(uint32_t gpu_index, rdc_diag_test_res
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* /*config*/, size_t /*config_size*/,
rdc_diag_test_result_t* result) {
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
if (result == nullptr || gpu_count == 0) {
return RDC_ST_BAD_PARAMETER;
}
@@ -170,6 +167,10 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
// Run test for each GPU. It will continue even
// if one GPU test is fail.
for (uint32_t i = 0; i < gpu_count; i++) {
if (callback != nullptr && callback->callback != nullptr && callback->cookie != nullptr) {
  // Name the GPU actually under test (gpu_index[i]), not its position in the
  // list; the two differ whenever the group is not 0..N-1.
  std::string str = "ROCR test on GPU " + std::to_string(gpu_index[i]);
  callback->callback(callback->cookie, str.data());
}
switch (test_case) {
case RDC_DIAG_SYS_MEM_CHECK:
run_memory_test(gpu_index[i], result);
+1 -1
Просмотреть файл
@@ -57,7 +57,7 @@ rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
// TODO: use gpu_index
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
const char* config, size_t config_size,
rdc_diag_test_result_t* result) {
rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) {
rvs_status_t rvs_status = RVS_STATUS_SUCCESS;
RVS_LOG();
if (result == nullptr || gpu_count == 0) {
+2 -1
Просмотреть файл
@@ -155,8 +155,9 @@ void RdciDiagSubSystem::process() {
rdc_status_t result;
rdc_diag_response_t response;
//rdc_diag_callback_t callback;
result = rdc_diagnostic_run(rdc_handle_, group_id_, run_level_, config_test_.c_str(),
config_test_.length(), &response);
config_test_.length(), &response, nullptr);
if (result != RDC_ST_OK) {
std::string error_msg = rdc_status_string(result);
+8 -6
Просмотреть файл
@@ -24,6 +24,8 @@ THE SOFTWARE.
#include <thread>
#include <grpcpp/server_context.h>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc.h"
@@ -121,13 +123,13 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
::grpc::Status RemoveAllJob(::grpc::ServerContext* context, const ::rdc::Empty* request,
::rdc::RemoveAllJobResponse* reply) override;
::grpc::Status DiagnosticRun(::grpc::ServerContext* context,
const ::rdc::DiagnosticRunRequest* request,
::rdc::DiagnosticRunResponse* reply) override;
::grpc::Status DiagnosticRun(
::grpc::ServerContext* context, const ::rdc::DiagnosticRunRequest* request,
::grpc::ServerWriter< ::rdc::DiagnosticRunResponse>* writer) override;
::grpc::Status DiagnosticTestCaseRun(::grpc::ServerContext* context,
const ::rdc::DiagnosticTestCaseRunRequest* request,
::rdc::DiagnosticTestCaseRunResponse* reply) override;
::grpc::Status DiagnosticTestCaseRun(
::grpc::ServerContext* context, const ::rdc::DiagnosticTestCaseRunRequest* request,
::grpc::ServerWriter< ::rdc::DiagnosticTestCaseRunResponse>* writer) override;
::grpc::Status GetMixedComponentVersion(::grpc::ServerContext* context,
const ::rdc::GetMixedComponentVersionRequest* request,
+80 -12
Просмотреть файл
@@ -23,6 +23,7 @@ THE SOFTWARE.
#include <assert.h>
#include <grpcpp/grpcpp.h>
#include <grpcpp/impl/call_op_set.h>
#include <csignal>
#include <future>
@@ -31,6 +32,7 @@ THE SOFTWARE.
#include <string>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc.pb.h"
#include "rdc/rdc.h"
#include "rdc/rdc_private.h"
#include "rdc/rdc_server_main.h"
@@ -577,26 +579,55 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
return ::grpc::Status::OK;
}
::grpc::Status RdcAPIServiceImpl::DiagnosticRun(::grpc::ServerContext* context,
const ::rdc::DiagnosticRunRequest* request,
::rdc::DiagnosticRunResponse* reply) {
::grpc::Status RdcAPIServiceImpl::DiagnosticRun(
::grpc::ServerContext* context, const ::rdc::DiagnosticRunRequest* request,
::grpc::ServerWriter<::rdc::DiagnosticRunResponse>* writer) {
(void)(context);
if (!reply || !request) {
if (!writer || !request) {
return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
}
auto cb_lambda = [](void* w, void* m) -> void {
if (w == nullptr) {
RDC_LOG(RDC_ERROR, "BAD WRITER");
return;
}
if (m == nullptr) {
RDC_LOG(RDC_ERROR, "BAD MESSAGE");
return;
}
auto writer = static_cast<::grpc::ServerWriter<::rdc::DiagnosticRunResponse>*>(w);
char* message = static_cast<char*>(m);
::rdc::DiagnosticRunResponse reply;
reply.set_log(std::string(message));
if (!writer->Write(reply)) {
RDC_LOG(RDC_ERROR, "Failed to write log message");
}
};
rdc_callback_t cb = cb_lambda;
rdc_diag_callback_t callback = {
cb,
writer,
};
rdc_diag_response_t diag_response;
rdc_status_t result = rdc_diagnostic_run(
rdc_handle_, request->group_id(), static_cast<rdc_diag_level_t>(request->level()),
const_cast<char*>(request->config().c_str()), static_cast<size_t>(request->config().length()),
&diag_response);
&diag_response, &callback);
reply->set_status(result);
::rdc::DiagnosticRunResponse reply;
reply.set_status(result);
if (result != RDC_ST_OK) {
  // This RPC is server-streaming, so the failing status set on `reply` only
  // reaches the client if the reply is written to the stream before returning.
  if (!writer->Write(reply)) {
    return ::grpc::Status::CANCELLED;
  }
  return ::grpc::Status::OK;
}
::rdc::DiagnosticResponse* to_response = reply->mutable_response();
::rdc::DiagnosticResponse* to_response = reply.mutable_response();
to_response->set_results_count(diag_response.results_count);
for (uint32_t i = 0; i < diag_response.results_count; i++) {
@@ -626,28 +657,61 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
to_diag_info->set_info(test_result.info);
}
if (!writer->Write(reply)) {
return ::grpc::Status::CANCELLED;
}
return ::grpc::Status::OK;
}
::grpc::Status RdcAPIServiceImpl::DiagnosticTestCaseRun(
::grpc::ServerContext* context, const ::rdc::DiagnosticTestCaseRunRequest* request,
::rdc::DiagnosticTestCaseRunResponse* reply) {
::grpc::ServerWriter<::rdc::DiagnosticTestCaseRunResponse>* writer) {
(void)(context);
if (!reply || !request) {
if (!writer || !request) {
return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
}
auto cb_lambda = [](void* w, void* m) -> void {
if (w == nullptr) {
RDC_LOG(RDC_ERROR, "BAD WRITER");
return;
}
if (m == nullptr) {
RDC_LOG(RDC_ERROR, "BAD MESSAGE");
return;
}
auto writer = static_cast<::grpc::ServerWriter<::rdc::DiagnosticTestCaseRunResponse>*>(w);
char* message = static_cast<char*>(m);
::rdc::DiagnosticTestCaseRunResponse reply;
reply.set_log(std::string(message));
if (!writer->Write(reply)) {
RDC_LOG(RDC_ERROR, "Failed to write log message");
}
};
rdc_callback_t cb = cb_lambda;
rdc_diag_callback_t callback = {
cb,
writer,
};
rdc_diag_test_result_t test_result;
rdc_status_t result = rdc_test_case_run(
rdc_handle_, request->group_id(), static_cast<rdc_diag_test_cases_t>(request->test_case()),
const_cast<char*>(request->config().c_str()), static_cast<size_t>(request->config().length()),
&test_result);
&test_result, &callback);
reply->set_status(result);
::rdc::DiagnosticTestCaseRunResponse reply;
reply.set_status(result);
if (result != RDC_ST_OK) {
  // This RPC is server-streaming, so the failing status set on `reply` only
  // reaches the client if the reply is written to the stream before returning.
  if (!writer->Write(reply)) {
    return ::grpc::Status::CANCELLED;
  }
  return ::grpc::Status::OK;
}
::rdc::DiagnosticTestResult* to_diag_info = reply->mutable_result();
::rdc::DiagnosticTestResult* to_diag_info = reply.mutable_result();
to_diag_info->set_status(test_result.status);
// details
@@ -670,6 +734,10 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
}
to_diag_info->set_info(test_result.info);
if (!writer->Write(reply)) {
return ::grpc::Status::CANCELLED;
}
return ::grpc::Status::OK;
}