Add the RdcSmiDiagnostic module

Provides an RdcSmiDiagnostic module, which calls rocm_smi_lib.

It supports the following diagnostics: get the GPU topology, check GPU
parameters, and check the processes running on the GPUs.

The gRPC client-side and server-side diagnostic functions are added.

The diag module is added to rdci.

Change-Id: I10a0cf3c20556a61373ab686f82cae75acaa40dd
This commit is contained in:
Bill(Shuzhou) Liu
2021-06-21 15:35:06 -04:00
والد 5a4bf97327
کامیت 76ccf58008
22فایلهای تغییر یافته به همراه1330 افزوده شده و 13 حذف شده
@@ -145,6 +145,23 @@ int main(int, char **) {
}
}
}
// (5) run one test case
std::cout <<" ============== Run individual diagnostic test ===========\n";
rdc_diag_test_result_t test_result;
result = rdc_test_case_run(rdc_handle, group_id,
RDC_DIAG_COMPUTE_PROCESS, &test_result);
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << std::setw(22) << std::left
<< get_test_name(RDC_DIAG_COMPUTE_PROCESS) + ":"
<< rdc_diagnostic_result_string(test_result.status) << "\n";
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
+21
مشاهده پرونده
@@ -980,6 +980,27 @@ rdc_status_t rdc_diagnostic_run(
rdc_diag_level_t level,
rdc_diag_response_t* response);
/**
* @brief Run one diagnostic test case
*
* @details Run a specific diagnostic test case on the GPUs that belong to
* the given GPU group and report the outcome through @p result.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] test_case The test case to run.
*
* @param[inout] result The results of the test. Must not be null.
*
* @retval ::RDC_ST_OK is returned upon successful call.
* @retval ::RDC_ST_INVALID_HANDLER is returned when @p p_rdc_handle is null.
* @retval ::RDC_ST_BAD_PARAMETER is returned by the handler implementations
* when @p result is null.
*/
rdc_status_t rdc_test_case_run(
rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result);
/**
* @brief Get a description of a provided RDC error status
*
@@ -36,7 +36,8 @@ rdc_status_t rdc_diag_test_cases_query(
uint32_t* test_case_count);
// Run a specific test case
rdc_status_t rdc_test_case_run(
rdc_status_t rdc_diag_test_case_run(
rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
@@ -86,6 +86,11 @@ class RdcHandler {
rdc_diag_level_t level,
rdc_diag_response_t* response) = 0;
// Run a single diagnostic test case for the GPU group group_id and store
// the outcome in result. This is the handler-side counterpart of the C API
// function of the same name, which forwards its arguments here.
virtual rdc_status_t rdc_test_case_run(
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) = 0;
// Control API
virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0;
@@ -91,6 +91,11 @@ class RdcEmbeddedHandler: public RdcHandler {
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
// Embedded-mode implementation: resolves the group to its GPU list and
// hands the test case to the diagnostic module.
rdc_status_t rdc_test_case_run(
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) override;
// Control API
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
@@ -0,0 +1,70 @@
/*
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_
#include <memory>
#include <string>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
// Diagnostic test cases implemented on top of rocm_smi_lib.
//
// Every public check takes the GPU indexes of the group under test, the
// number of valid entries in that array, and the result structure to fill.
class RdcSmiDiagnosticImpl {
 public:
  RdcSmiDiagnosticImpl();

  // Reports the compute processes known to rocm_smi and marks the test as
  // failed when one of them runs on a GPU listed in gpu_index.
  rdc_status_t check_rsmi_process_info(
      uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
      uint32_t gpu_count,
      rdc_diag_test_result_t* result);

  // Reports the link weight between every pair of distinct GPUs in
  // gpu_index.
  rdc_status_t check_rsmi_topo_info(
      uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
      uint32_t gpu_count,
      rdc_diag_test_result_t* result);

  // Checks temperature and voltage readings of each GPU in gpu_index
  // against the limits reported by rocm_smi.
  rdc_status_t check_rsmi_param_info(
      uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
      uint32_t gpu_count,
      rdc_diag_test_result_t* result);

 private:
  // Compares the current temperature of one sensor with its limits.
  // Text is appended to msg (overall summary) and per_gpu_msg (details of
  // violated limits); the worst level found is returned.
  rdc_diag_result_t check_temperature_level(
      uint32_t gpu_index,
      rsmi_temperature_type_t type,
      char msg[MAX_DIAG_MSG_LENGTH],
      char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);

  // Human-readable name of a temperature sensor type.
  std::string get_temperature_string(rsmi_temperature_type_t type) const;

  // Compares the current voltage of one sensor with its limits; same
  // message conventions as check_temperature_level.
  rdc_diag_result_t check_voltage_level(
      uint32_t gpu_index,
      rsmi_voltage_type_t type,
      char msg[MAX_DIAG_MSG_LENGTH],
      char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);

  // Human-readable name of a voltage sensor type.
  std::string get_voltage_string(rsmi_voltage_type_t type) const;
};
typedef std::shared_ptr<RdcSmiDiagnosticImpl> RdcSmiDiagnosticPtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_
@@ -27,6 +27,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/impl/RdcSmiDiagnosticImpl.h"
namespace amd {
namespace rdc {
@@ -71,6 +72,7 @@ class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic {
private:
RdcMetricFetcherPtr metric_fetcher_;
bool bulk_fetch_enabled_;
RdcSmiDiagnosticPtr smi_diag_;
};
typedef std::shared_ptr<RdcSmiLib> RdcSmiLibPtr;
@@ -85,6 +85,10 @@ class RdcStandaloneHandler: public RdcHandler {
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
// Standalone-mode implementation: forwards the request to the RDC server
// through the DiagnosticTestCaseRun gRPC call.
rdc_status_t rdc_test_case_run(
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) override;
// Control RdcAPI
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
+74
مشاهده پرونده
@@ -214,6 +214,18 @@ service RdcAPI {
// rdc_status_t rdc_job_remove_all()
rpc RemoveAllJob(Empty) returns (RemoveAllJobResponse) {}
// rdc_status_t rdc_diagnostic_run(
// rdc_gpu_group_t group_id,
// rdc_diag_level_t level,
// rdc_diag_response_t* response);
rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse) {}
// rdc_status_t rdc_test_case_run(
// rdc_gpu_group_t group_id,
// rdc_diag_test_cases_t test_case,
// rdc_diag_test_result_t* result);
rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (DiagnosticTestCaseRunResponse) {}
}
message Empty {
@@ -459,4 +471,66 @@ message RemoveJobResponse {
message RemoveAllJobResponse {
uint32 status = 1;
}
// Request of DiagnosticRun: the GPU group to test and the diagnostic level,
// matching the group_id and level arguments of rdc_diagnostic_run().
message DiagnosticRunRequest {
uint32 group_id = 1;
uint32 level = 2;
}
// A message/code pair describing one diagnostic outcome.
message DiagnosticDetail {
string msg = 1;
uint32 code = 2;
}
// The outcome of a test case for a single GPU.
message DiagnosticPerGpuResult {
uint32 gpu_index = 1;
DiagnosticDetail gpu_result = 2;
}
// The outcome of one test case; the client copies these fields into
// rdc_diag_test_result_t.
message DiagnosticTestResult {
// Overall result level; the client casts it to rdc_diag_result_t.
uint32 status = 1;
DiagnosticDetail details = 2;
// The client casts this value to rdc_diag_test_cases_t, so the numbering
// is expected to match that enum.
enum DiagnosticTestCase {
COMPUTE_PROCESS = 0;
SDMA_QUEUE = 1;
COMPUTE_QUEUE = 2;
VRAM_CHECK = 3;
SYS_MEM_CHECK = 4;
NODE_TOPOLOGY = 5;
GPU_PARAMETERS = 6;
};
DiagnosticTestCase test_case = 3;
uint32 per_gpu_result_count = 4;
repeated DiagnosticPerGpuResult gpu_results = 5;
// Free-form summary text of the test.
string info = 6;
}
// The collected results of a diagnostic run.
message DiagnosticResponse {
uint32 results_count = 1;
repeated DiagnosticTestResult diag_info = 2;
}
// Reply of DiagnosticRun: the call status plus the collected results.
message DiagnosticRunResponse {
uint32 status = 1;
DiagnosticResponse response = 2;
}
// Request of DiagnosticTestCaseRun: the GPU group and the single test case
// to execute.
message DiagnosticTestCaseRunRequest {
uint32 group_id = 1;
// The client fills this by casting a rdc_diag_test_cases_t value.
enum TestCaseType {
COMPUTE_PROCESS = 0;
SDMA_QUEUE = 1;
COMPUTE_QUEUE = 2;
VRAM_CHECK = 3;
SYS_MEM_CHECK = 4;
NODE_TOPOLOGY = 5;
GPU_PARAMETERS = 6;
};
TestCaseType test_case = 2;
}
// Reply of DiagnosticTestCaseRun: the call status plus the test result.
message DiagnosticTestCaseRunResponse {
uint32 status = 1;
DiagnosticTestResult result = 2;
}
+2
مشاهده پرونده
@@ -161,6 +161,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcRasLib.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiLib.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcTelemetryModule.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcDiagnosticModule.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiDiagnosticImpl.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc")
set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc")
@@ -187,6 +188,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcTel
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnostic.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnosticLibInterface.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcDiagnosticModule.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcSmiDiagnosticImpl.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotification.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h")
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h")
@@ -334,6 +334,18 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle,
rdc_diagnostic_run(group_id, level, response);
}
// C API entry point: runs one diagnostic test case by forwarding the
// request to the handler object behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle,
                               rdc_gpu_group_t group_id,
                               rdc_diag_test_cases_t test_case,
                               rdc_diag_test_result_t* result) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_test_case_run(group_id, test_case, result);
}
const char* rdc_status_string(rdc_status_t result) {
switch (result) {
case RDC_ST_OK:
@@ -67,7 +67,7 @@ namespace amd {
namespace rdc {
// TODO(bill_liu): make it configurable
const uint32_t METIC_UPDATE_FREQUENCY = 100; // 100ms by default
const uint32_t METIC_UPDATE_FREQUENCY = 1000; // 1000 microseconds by default
RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode):
group_settings_(new RdcGroupSettingsImpl())
@@ -386,6 +386,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response) {
if (!response) {
return RDC_ST_BAD_PARAMETER;
}
// Get GPU group information
rdc_group_info_t rdc_group_info;
@@ -397,6 +400,24 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(
return diag->rdc_diagnostic_run(rdc_group_info, level, response);
}
// Runs a single diagnostic test case on the GPUs of group_id.
// Returns RDC_ST_BAD_PARAMETER when result is null, the group lookup error
// when the group cannot be resolved, otherwise whatever the diagnostic
// module returns.
rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(
    rdc_gpu_group_t group_id,
    rdc_diag_test_cases_t test_case,
    rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Resolve the group id into the list of GPUs it contains.
  rdc_group_info_t group_info;
  const rdc_status_t lookup_status =
      rdc_group_gpu_get_info(group_id, &group_info);
  if (lookup_status != RDC_ST_OK) {
    return lookup_status;
  }

  auto diag_module = rdc_module_mgr_->get_diagnostic_module();
  return diag_module->rdc_test_case_run(
      test_case, group_info.entity_ids, group_info.count, result);
}
// Control API
rdc_status_t RdcEmbeddedHandler::rdc_field_update_all(
uint32_t wait_for_update) {
@@ -39,6 +39,7 @@ RdcMetricsUpdaterImpl::RdcMetricsUpdaterImpl(
// Make the listen time for notifications a relatively long time.
// There's no point in starting/stopping it constantly.
static const uint32_t kRdcFieldListenNotifTime_mS = 10000;
static const uint32_t kRdcEventCheck_ms = 1000;
void RdcMetricsUpdaterImpl::start() {
if (started_) {
@@ -49,7 +50,7 @@ void RdcMetricsUpdaterImpl::start() {
while (started_) {
watch_table_->rdc_field_listen_notif(kRdcFieldListenNotifTime_mS);
std::this_thread::sleep_for(
std::chrono::microseconds(_check_frequency));
std::chrono::milliseconds(kRdcEventCheck_ms));
}
});
updater_ = std::async(std::launch::async, [this](){
@@ -0,0 +1,575 @@
/*
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcSmiDiagnosticImpl.h"
#include <vector>
#include <map>
#include <string>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RsmiUtils.h"
namespace amd {
namespace rdc {
// Nothing to set up: the checks only issue rocm_smi calls on demand.
RdcSmiDiagnosticImpl::RdcSmiDiagnosticImpl() = default;
// Compute-process diagnostic.
//
// Lists the compute processes reported by rocm_smi, maps them to the GPUs
// they run on, and fills result:
//   - PASS when no process exists or none runs on a GPU in gpu_index,
//   - WARN when processes exist but none could be mapped to a GPU,
//   - FAIL when a process runs on one of the GPUs in gpu_index.
// One per-GPU entry is written for every GPU that has a process.
// Returns RDC_ST_BAD_PARAMETER for a null result, the translated rocm_smi
// error when the process list cannot be read, otherwise RDC_ST_OK.
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(
    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
    uint32_t gpu_count,
    rdc_diag_test_result_t* result) {
  if (!result) {
    return RDC_ST_BAD_PARAMETER;
  }

  *result = {};
  result->test_case = RDC_DIAG_COMPUTE_PROCESS;
  result->status = RDC_DIAG_RESULT_SKIP;
  result->per_gpu_result_count = 0;

  // Query with a null buffer first to learn how many entries to allocate.
  uint32_t proc_count = 0;
  rsmi_status_t rc = rsmi_compute_process_info_get(nullptr, &proc_count);
  if (rc != RSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_ERROR,
        "Fail to get process information: " << rc);
    strncpy_with_null(result->info,
        "Fail to retreive process information from rocm_smi_lib",
        MAX_DIAG_MSG_LENGTH);
    return Rsmi2RdcError(rc);
  }

  if (proc_count == 0) {
    result->status = RDC_DIAG_RESULT_PASS;
    result->per_gpu_result_count = 0;
    strncpy_with_null(result->info,
        "Do not have any compute process running on any devices",
        MAX_DIAG_MSG_LENGTH);
    return RDC_ST_OK;
  }

  std::string summary = std::to_string(proc_count)
      + " compute process is using devices.";

  // Fetch the per-process records.
  std::vector<rsmi_process_info_t> proc_list(proc_count);
  rc = rsmi_compute_process_info_get(proc_list.data(), &proc_count);
  if (rc != RSMI_STATUS_SUCCESS) {
    RDC_LOG(RDC_INFO,
        "Fail to get process detail information: " << rc);
    strncpy_with_null(result->info, summary.c_str(), MAX_DIAG_MSG_LENGTH);
    return Rsmi2RdcError(rc);
  }

  // GPU index -> ids of the processes running on that GPU.
  std::map<uint32_t, std::vector<uint32_t>> gpu_to_pids;
  // proc_count was updated by the call above, so iterate by index rather
  // than over the whole vector.
  for (uint32_t p = 0; p < proc_count; ++p) {
    const rsmi_process_info_t& proc = proc_list[p];

    summary += " Process: " + std::to_string(proc.process_id);
    summary += ", pasid: " + std::to_string(proc.pasid);
    summary += ", vram_usage: " + std::to_string(proc.vram_usage);
    summary += ", sdma_usage: " + std::to_string(proc.sdma_usage);
    summary += ", cu_occupancy: " + std::to_string(proc.cu_occupancy);
    summary += ".";

    // How many devices does this process use?
    uint32_t dev_count = 0;
    rc = rsmi_compute_process_gpus_get(proc.process_id, nullptr, &dev_count);
    if (rc != RSMI_STATUS_SUCCESS || dev_count == 0) {
      RDC_LOG(RDC_INFO,
          "Fail to get process GPUs detail information: " << rc);
      continue;
    }

    // Which devices are they?
    std::vector<uint32_t> dev_list(dev_count);
    rc = rsmi_compute_process_gpus_get(proc.process_id,
                                       dev_list.data(), &dev_count);
    if (rc != RSMI_STATUS_SUCCESS) {
      RDC_LOG(RDC_INFO,
          "Fail to get process GPUs detail information: " << rc);
      continue;
    }

    for (uint32_t d = 0; d < dev_count; ++d) {
      gpu_to_pids[dev_list[d]].push_back(proc.process_id);
    }
  }

  if (gpu_to_pids.empty()) {
    result->status = RDC_DIAG_RESULT_WARN;
    summary += " Cannot detect the processes running in which devices.";
  } else {
    result->status = RDC_DIAG_RESULT_PASS;  // unless a group GPU is busy
  }

  // A process on any GPU of the group under test fails the test.
  for (uint32_t g = 0; g < gpu_count; ++g) {
    if (gpu_to_pids.count(gpu_index[g]) != 0) {
      result->status = RDC_DIAG_RESULT_FAIL;
      break;
    }
  }

  strncpy_with_null(result->info, summary.c_str(), MAX_DIAG_MSG_LENGTH);

  // One per-GPU entry for every GPU that runs at least one process.
  for (const auto& entry : gpu_to_pids) {
    auto& slot = result->gpu_results[result->per_gpu_result_count];
    slot.gpu_index = entry.first;
    slot.gpu_result.code = 0;

    std::string pid_text = "Running process:";
    for (const uint32_t pid : entry.second) {
      pid_text += " " + std::to_string(pid);
    }
    strncpy_with_null(slot.gpu_result.msg,
        pid_text.c_str(), MAX_DIAG_MSG_LENGTH);

    result->per_gpu_result_count++;
    if (result->per_gpu_result_count >= RDC_MAX_NUM_DEVICES) {
      RDC_LOG(RDC_ERROR, "Found more GPUs than " << RDC_MAX_NUM_DEVICES);
      break;
    }
  }

  return RDC_ST_OK;
}
// Maps a rocm_smi temperature sensor type to the name used in diagnostic
// messages; any type without a dedicated name yields "Unknown".
std::string RdcSmiDiagnosticImpl::get_temperature_string(
    rsmi_temperature_type_t type) const {
  const char* label = "Unknown";
  if (type == RSMI_TEMP_TYPE_EDGE) {
    label = "Edge";
  } else if (type == RSMI_TEMP_TYPE_JUNCTION) {
    label = "Junction";
  } else if (type == RSMI_TEMP_TYPE_MEMORY) {
    label = "Memory";
  }
  return label;
}
// Maps a rocm_smi voltage sensor type to the name used in diagnostic
// messages; any type other than VDDGFX yields "Unknown".
std::string RdcSmiDiagnosticImpl::get_voltage_string(
    rsmi_voltage_type_t type) const {
  if (type == RSMI_VOLT_TYPE_VDDGFX) {
    return "Vddgfx voltage";
  }
  return "Unknown";
}
// Node-topology diagnostic.
//
// Queries the rocm_smi link weight for every ordered pair of distinct GPUs
// in gpu_index and writes the collected weights to result->info.
//
// Returns RDC_ST_BAD_PARAMETER for a null result. On the first failing
// weight query the result is marked FAIL, the rocm_smi error code is stored
// in result->details.code, and RDC_ST_MSI_ERROR is returned. Otherwise the
// result is marked PASS and RDC_ST_OK is returned.
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(
    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
    uint32_t gpu_count,
    rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  *result = {};
  result->test_case = RDC_DIAG_NODE_TOPOLOGY;
  result->status = RDC_DIAG_RESULT_SKIP;
  result->per_gpu_result_count = 0;

  // Note: the link-type-to-name table the original built here was never
  // read, so it has been dropped instead of being rebuilt on every call.
  std::string info;
  for (uint32_t i = 0; i < gpu_count; i++) {
    for (uint32_t j = 0; j < gpu_count; j++) {
      // A GPU has no link to itself.
      if (gpu_index[i] == gpu_index[j]) continue;

      uint64_t weight = 0;
      const rsmi_status_t err = rsmi_topo_get_link_weight(
          gpu_index[i], gpu_index[j], &weight);
      if (err != RSMI_STATUS_SUCCESS) {
        result->status = RDC_DIAG_RESULT_FAIL;
        result->details.code = err;
        std::string err_info = "rsmi_topo_get_link_weight(";
        err_info += std::to_string(gpu_index[i]) + ",";
        err_info += std::to_string(gpu_index[j]) + ", &weight)";
        err_info += " fail";
        strncpy_with_null(result->details.msg, err_info.c_str(),
            MAX_DIAG_MSG_LENGTH);
        strncpy_with_null(result->info, err_info.c_str(),
            MAX_DIAG_MSG_LENGTH);
        return RDC_ST_MSI_ERROR;
      }

      info += std::to_string(gpu_index[i]) + "=>";
      info += std::to_string(gpu_index[j]) + " weight:";
      info += std::to_string(weight) + " ";
    }
  }

  // An empty summary means there was no pair of distinct GPUs to query.
  if (!info.empty()) {
    strncpy_with_null(result->info, info.c_str(),
        MAX_DIAG_MSG_LENGTH);
  } else {
    strncpy_with_null(result->info, "No link detected.",
        MAX_DIAG_MSG_LENGTH);
  }

  result->status = RDC_DIAG_RESULT_PASS;
  return RDC_ST_OK;
}
// GPU-parameter diagnostic.
//
// For every GPU in gpu_index, checks each temperature and voltage sensor
// type against its limits. result->info receives the overall summary,
// result->gpu_results[i] receives the details for gpu_index[i], and
// result->status ends up as the highest level reported by any check.
//
// Returns RDC_ST_BAD_PARAMETER when result is null or gpu_count exceeds
// RDC_MAX_NUM_DEVICES, otherwise RDC_ST_OK.
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(
    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
    uint32_t gpu_count,
    rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  *result = {};
  result->test_case = RDC_DIAG_GPU_PARAMETERS;

  // One per-GPU slot is written for each input GPU, so the count must not
  // exceed the declared capacity of the input array.
  if (gpu_count > RDC_MAX_NUM_DEVICES) {
    return RDC_ST_BAD_PARAMETER;
  }

  result->status = RDC_DIAG_RESULT_PASS;

  for (uint32_t i = 0; i < gpu_count; i++) {
    // Each GPU writes to its own slot; the original stored every gpu_index
    // in slot 0.
    auto& per_gpu = result->gpu_results[i];
    per_gpu.gpu_index = gpu_index[i];

    // Temperature. The range is walked inclusively so the last sensor type
    // is checked as well. A type the device does not expose is harmless:
    // check_temperature_level() returns PASS when the current reading
    // cannot be fetched.
    for (uint32_t t = RSMI_TEMP_TYPE_FIRST; t <= RSMI_TEMP_TYPE_LAST; t++) {
      const rdc_diag_result_t status = check_temperature_level(
          gpu_index[i], static_cast<rsmi_temperature_type_t>(t),
          result->info, per_gpu.gpu_result.msg);
      // Keep the higher error level.
      if (status > result->status) {
        result->status = status;
      }
    }

    // Voltage, walked inclusively for the same reason.
    for (uint32_t v = RSMI_VOLT_TYPE_FIRST; v <= RSMI_VOLT_TYPE_LAST; v++) {
      const rdc_diag_result_t status = check_voltage_level(
          gpu_index[i], static_cast<rsmi_voltage_type_t>(v),
          result->info, per_gpu.gpu_result.msg);
      // Keep the higher error level.
      if (status > result->status) {
        result->status = status;
      }
    }

    result->per_gpu_result_count++;
  }

  return RDC_ST_OK;
}
// Compares the current temperature of one sensor with the limits reported
// by rocm_smi.
//
// @param gpu_index    GPU to query.
// @param type         Temperature sensor type.
// @param msg          Overall summary buffer; text is appended to its
//                     current content.
// @param per_gpu_msg  Per-GPU detail buffer; a sentence is appended for
//                     every limit that is violated.
// @return RDC_DIAG_RESULT_PASS when the current temperature cannot be read
//         or no limit is violated, RDC_DIAG_RESULT_WARN when the max/min
//         limit is violated, RDC_DIAG_RESULT_FAIL when the critical,
//         emergency or critical-min limit is violated.
//
// A limit that rocm_smi cannot report is skipped. The original tested the
// critical, emergency and critical-min limits only when the read had
// FAILED (comparing against 0), and compared the emergency check against
// the critical limit; both are corrected here.
rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
    uint32_t gpu_index, rsmi_temperature_type_t type
    , char msg[MAX_DIAG_MSG_LENGTH]
    , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
  rdc_diag_result_t result = RDC_DIAG_RESULT_PASS;
  rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT;
  rsmi_status_t err = RSMI_STATUS_SUCCESS;
  int64_t current_temp = 0;
  // Continue from whatever earlier checks already wrote.
  std::string info = msg;
  std::string per_gpu_info = per_gpu_msg;

  err = rsmi_dev_temp_metric_get(gpu_index,
                  type, met, &current_temp);
  // Without a current reading there is nothing to compare.
  if (err != RSMI_STATUS_SUCCESS) return result;

  // Max temperature
  met = RSMI_TEMP_MAX;
  int64_t max_temp = 0;
  err = rsmi_dev_temp_metric_get(gpu_index,
                  type, met, &max_temp);
  if (err == RSMI_STATUS_SUCCESS) {
    if (current_temp >= max_temp) {
      result = RDC_DIAG_RESULT_WARN;
      per_gpu_info += "Max ";
      per_gpu_info += get_temperature_string(type);
      per_gpu_info += " temperature ";
      per_gpu_info += std::to_string(max_temp);
      per_gpu_info += " greater than current temperature ";
      per_gpu_info += std::to_string(current_temp) +". ";
      info += get_temperature_string(type) + ": ";
      info += "GPU " + std::to_string(gpu_index);
      info += " max ";
      info += get_temperature_string(type);
      info += " temperature exceeds. ";
    } else {
      info += "GPU " + std::to_string(gpu_index);
      info += " max ";
      info += get_temperature_string(type);
      info += " temperature in range. ";
    }
  }

  // Min temperature
  met = RSMI_TEMP_MIN;
  int64_t min_temp = 0;
  err = rsmi_dev_temp_metric_get(gpu_index,
                  type, met, &min_temp);
  if (err == RSMI_STATUS_SUCCESS) {
    if (current_temp <= min_temp) {
      result = RDC_DIAG_RESULT_WARN;
      per_gpu_info += "Min ";
      per_gpu_info += get_temperature_string(type);
      per_gpu_info += " temperature ";
      per_gpu_info += std::to_string(min_temp);
      per_gpu_info += " less than current temperature ";
      per_gpu_info += std::to_string(current_temp) +". ";
      info += "GPU " + std::to_string(gpu_index);
      info += " min ";
      info += get_temperature_string(type);
      info += " temperature exceeds. ";
    } else {
      info += "GPU " + std::to_string(gpu_index);
      info += " min ";
      info += get_temperature_string(type);
      info += " temperature in range. ";
    }
  }

  // Critical temperature
  met = RSMI_TEMP_CRITICAL;
  int64_t critical_temp = 0;
  err = rsmi_dev_temp_metric_get(gpu_index,
                  type, met, &critical_temp);
  if (err == RSMI_STATUS_SUCCESS) {
    if (current_temp >= critical_temp) {
      result = RDC_DIAG_RESULT_FAIL;
      per_gpu_info += "Critical ";
      per_gpu_info += get_temperature_string(type);
      per_gpu_info += " temperature ";
      per_gpu_info += std::to_string(critical_temp);
      per_gpu_info += " greater than current temperature ";
      per_gpu_info += std::to_string(current_temp) +". ";
      info += "GPU " + std::to_string(gpu_index);
      info += " Critical ";
      info += get_temperature_string(type);
      info += " temperature exceeds. ";
    } else {
      info += "GPU " + std::to_string(gpu_index);
      info += " Critical ";
      info += get_temperature_string(type);
      info += " temperature in range. ";
    }
  }

  // Emergency temperature
  met = RSMI_TEMP_EMERGENCY;
  int64_t emergency_temp = 0;
  err = rsmi_dev_temp_metric_get(gpu_index,
                  type, met, &emergency_temp);
  if (err == RSMI_STATUS_SUCCESS) {
    if (current_temp >= emergency_temp) {
      result = RDC_DIAG_RESULT_FAIL;
      per_gpu_info += "Emergency ";
      per_gpu_info += get_temperature_string(type);
      per_gpu_info += " temperature ";
      per_gpu_info += std::to_string(emergency_temp);
      per_gpu_info += " greater than current temperature ";
      per_gpu_info += std::to_string(current_temp) +". ";
      info += "GPU " + std::to_string(gpu_index);
      info += " Emergency ";
      info += get_temperature_string(type);
      info += " temperature exceeds. ";
    } else {
      info += "GPU " + std::to_string(gpu_index);
      info += " Emergency ";
      info += get_temperature_string(type);
      info += " temperature in range. ";
    }
  }

  // Critical min temperature
  met = RSMI_TEMP_CRIT_MIN;
  int64_t critical_min_temp = 0;
  err = rsmi_dev_temp_metric_get(gpu_index,
                  type, met, &critical_min_temp);
  if (err == RSMI_STATUS_SUCCESS) {
    if (current_temp <= critical_min_temp) {
      result = RDC_DIAG_RESULT_FAIL;
      per_gpu_info += "Critical Min ";
      per_gpu_info += get_temperature_string(type);
      per_gpu_info += " temperature ";
      per_gpu_info += std::to_string(critical_min_temp);
      per_gpu_info += " less than current temperature ";
      per_gpu_info += std::to_string(current_temp) +". ";
      info += "GPU " + std::to_string(gpu_index);
      info += " Critical Min ";
      info += get_temperature_string(type);
      info += " temperature exceeds. ";
    } else {
      info += "GPU " + std::to_string(gpu_index);
      info += " Critical Min ";
      info += get_temperature_string(type);
      info += " temperature in range. ";
    }
  }

  // Write the accumulated text back into the caller's buffers.
  strncpy_with_null(msg, info.c_str(),
          MAX_DIAG_MSG_LENGTH);
  strncpy_with_null(per_gpu_msg, per_gpu_info.c_str(),
          MAX_DIAG_MSG_LENGTH);
  return result;
}
// Compares the current voltage of one sensor with the limits reported by
// rocm_smi.
//
// @param gpu_index    GPU to query.
// @param type         Voltage sensor type.
// @param msg          Overall summary buffer; text is appended to its
//                     current content.
// @param per_gpu_msg  Per-GPU detail buffer; a sentence is appended for
//                     every limit that is violated.
// @return RDC_DIAG_RESULT_PASS when the current voltage cannot be read or
//         no limit is violated, RDC_DIAG_RESULT_WARN for a max/min
//         violation, RDC_DIAG_RESULT_FAIL for a critical max/min violation.
rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(
    uint32_t gpu_index, rsmi_voltage_type_t type,
    char msg[MAX_DIAG_MSG_LENGTH],
    char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
  rdc_diag_result_t verdict = RDC_DIAG_RESULT_PASS;
  // Continue from whatever earlier checks already wrote.
  std::string summary = msg;
  std::string detail = per_gpu_msg;

  // Without a current reading there is nothing to compare.
  int64_t present = 0;
  if (rsmi_dev_volt_metric_get(gpu_index, type, RSMI_VOLT_CURRENT,
                               &present) != RSMI_STATUS_SUCCESS) {
    return verdict;
  }

  const std::string volt_name = get_voltage_string(type);
  const std::string gpu_tag = "GPU " + std::to_string(gpu_index);

  // Checks one limit. upper_bound selects ">=" (max-style limit) or "<="
  // (min-style limit). A limit rocm_smi cannot report is skipped silently.
  const auto evaluate = [&](rsmi_voltage_metric_t metric, bool upper_bound,
                            rdc_diag_result_t severity,
                            const char* detail_label,
                            const char* summary_label) {
    int64_t limit = 0;
    if (rsmi_dev_volt_metric_get(gpu_index, type, metric, &limit)
        != RSMI_STATUS_SUCCESS) {
      return;
    }

    const bool breached =
        upper_bound ? (present >= limit) : (present <= limit);
    if (breached) {
      verdict = severity;
      detail += detail_label;
      detail += volt_name;
      detail += " voltage ";
      detail += std::to_string(limit);
      detail += upper_bound ? " greater than current voltage "
                            : " less than current voltage ";
      detail += std::to_string(present) + ". ";
      summary += volt_name + ": ";
    }
    summary += gpu_tag;
    summary += summary_label;
    summary += volt_name;
    summary += breached ? " voltage exceeds. " : " voltage in range. ";
  };

  // Order matters: the later (critical) checks override an earlier WARN.
  evaluate(RSMI_VOLT_MAX, true, RDC_DIAG_RESULT_WARN,
           "Max ", " max ");
  evaluate(RSMI_VOLT_MIN, false, RDC_DIAG_RESULT_WARN,
           "Min ", " min ");
  evaluate(RSMI_VOLT_MAX_CRIT, true, RDC_DIAG_RESULT_FAIL,
           "Critical Max ", " Critical max ");
  evaluate(RSMI_VOLT_MIN_CRIT, false, RDC_DIAG_RESULT_FAIL,
           "Critical Min ", " Critical min ");

  // Write the accumulated text back into the caller's buffers.
  strncpy_with_null(msg, summary.c_str(), MAX_DIAG_MSG_LENGTH);
  strncpy_with_null(per_gpu_msg, detail.c_str(), MAX_DIAG_MSG_LENGTH);
  return verdict;
}
} // namespace rdc
} // namespace amd
+24 -4
مشاهده پرونده
@@ -31,8 +31,9 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf): metric_fetcher_(mf),
bulk_fetch_enabled_(false) { // Disable bulk fetch by default.
RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf): metric_fetcher_(mf)
, bulk_fetch_enabled_(false)
, smi_diag_(std::make_shared<RdcSmiDiagnosticImpl>()) {
char* bulk_env = getenv("RDC_BULK_FETCH_ENABLED");
if (bulk_env != nullptr && strcasecmp(bulk_env, "true") == 0) {
RDC_LOG(RDC_DEBUG, "Bulk fetch enabled.");
@@ -182,7 +183,14 @@ rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(
return RDC_ST_BAD_PARAMETER;
}
return RDC_ST_NOT_SUPPORTED;
const std::vector<rdc_diag_test_cases_t> tests {
RDC_DIAG_COMPUTE_PROCESS,
RDC_DIAG_NODE_TOPOLOGY,
RDC_DIAG_GPU_PARAMETERS
};
std::copy(tests.begin(), tests.end(), test_cases);
*test_case_count = tests.size();
return RDC_ST_OK;
}
// Run a specific test case
@@ -194,7 +202,19 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
return RDC_ST_NOT_SUPPORTED;
switch (test_case) {
case RDC_DIAG_COMPUTE_PROCESS:
return smi_diag_->check_rsmi_process_info(
gpu_index, gpu_count, result);
case RDC_DIAG_NODE_TOPOLOGY:
return smi_diag_->check_rsmi_topo_info(
gpu_index, gpu_count, result);
case RDC_DIAG_GPU_PARAMETERS:
return smi_diag_->check_rsmi_param_info(
gpu_index, gpu_count, result);
default:
return RDC_ST_NOT_SUPPORTED;
}
}
rdc_status_t RdcSmiLib::rdc_diagnostic_run(
@@ -559,10 +559,110 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run(
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response) {
(void)group_id;
(void)level;
(void)response;
return RDC_ST_NOT_SUPPORTED;
if (!response) {
return RDC_ST_BAD_PARAMETER;
}
::rdc::DiagnosticRunRequest request;
::rdc::DiagnosticRunResponse reply;
::grpc::ClientContext context;
request.set_group_id(group_id);
request.set_level(level);
::grpc::Status status = stub_->
DiagnosticRun(&context, request, &reply);
rdc_status_t err_status = error_handle(status, reply.status());
if (err_status != RDC_ST_OK) return err_status;
auto res = reply.response();
response->results_count = res.results_count();
if (res.diag_info_size() > static_cast<int>(MAX_TEST_CASES)) {
return RDC_ST_BAD_PARAMETER;
}
for (int i = 0; i < res.diag_info_size(); i++) {
const ::rdc::DiagnosticTestResult& result = res.diag_info(i);
rdc_diag_test_result_t& to_result = response->diag_info[i];
to_result.status = static_cast<rdc_diag_result_t>(result.status());
// Set details
to_result.details.code = result.details().code();
strncpy_with_null(to_result.details.msg,
result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH);
to_result.test_case = static_cast<rdc_diag_test_cases_t>(
result.test_case());
to_result.per_gpu_result_count = result.per_gpu_result_count();
// Set Result details
if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) {
return RDC_ST_BAD_PARAMETER;
}
for (int j=0; j < result.gpu_results_size(); j++) {
auto per_gpu_result = result.gpu_results(j);
rdc_diag_per_gpu_result_t& to_per_gpu = to_result.gpu_results[j];
to_per_gpu.gpu_index = per_gpu_result.gpu_index();
to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code();
strncpy_with_null(to_per_gpu.gpu_result.msg,
per_gpu_result.gpu_result().msg().c_str(), MAX_DIAG_MSG_LENGTH);
}
strncpy_with_null(to_result.info,
result.info().c_str(), MAX_DIAG_MSG_LENGTH);
}
return RDC_ST_OK;
}
// Run a single diagnostic test case on the remote server over gRPC and copy
// the reply into the caller's result structure.
//
// @param group_id   GPU group the test is run against.
// @param test_case  Test case forwarded to the server.
// @param to_result  Output structure; must not be null.
//
// @return RDC_ST_BAD_PARAMETER when to_result is null or the reply carries
//         more per-GPU entries than RDC_MAX_NUM_DEVICES (in which case
//         *to_result is left untouched); the status produced by
//         error_handle() when it is not RDC_ST_OK; RDC_ST_OK otherwise.
rdc_status_t RdcStandaloneHandler::rdc_test_case_run(
        rdc_gpu_group_t group_id,
        rdc_diag_test_cases_t test_case,
        rdc_diag_test_result_t *to_result) {
    if (!to_result) {
        return RDC_ST_BAD_PARAMETER;
    }

    ::rdc::DiagnosticTestCaseRunRequest request;
    ::rdc::DiagnosticTestCaseRunResponse reply;
    ::grpc::ClientContext context;

    request.set_group_id(group_id);
    request.set_test_case(static_cast<
        ::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case));
    ::grpc::Status status = stub_->DiagnosticTestCaseRun(
        &context, request, &reply);
    rdc_status_t err_status = error_handle(status, reply.status());
    if (err_status != RDC_ST_OK) {
        return err_status;
    }

    // Bind by reference: the reply owns the message, no deep copy is needed.
    const auto& result = reply.result();

    // Validate the reply before touching the output so a rejected reply
    // never leaves *to_result half written.
    if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) {
        return RDC_ST_BAD_PARAMETER;
    }

    to_result->status = static_cast<rdc_diag_result_t>(result.status());

    // Set details
    to_result->details.code = result.details().code();
    strncpy_with_null(to_result->details.msg,
        result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH);

    to_result->test_case = static_cast<rdc_diag_test_cases_t>(
        result.test_case());

    // Set Result details
    const uint32_t copied_count =
        static_cast<uint32_t>(result.gpu_results_size());
    for (int j = 0; j < result.gpu_results_size(); j++) {
        const auto& per_gpu_result = result.gpu_results(j);
        rdc_diag_per_gpu_result_t &to_per_gpu =
            to_result->gpu_results[j];
        to_per_gpu.gpu_index = per_gpu_result.gpu_index();
        to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code();
        strncpy_with_null(to_per_gpu.gpu_result.msg,
            per_gpu_result.gpu_result().msg().c_str(),
            MAX_DIAG_MSG_LENGTH);
    }

    // Callers walk gpu_results[] up to per_gpu_result_count, so the count
    // taken from the wire must never exceed the entries copied above.
    const uint32_t reported_count = result.per_gpu_result_count();
    to_result->per_gpu_result_count =
        reported_count < copied_count ? reported_count : copied_count;

    strncpy_with_null(to_result->info,
        result.info().c_str(), MAX_DIAG_MSG_LENGTH);

    return RDC_ST_OK;
}
+1
مشاهده پرونده
@@ -78,6 +78,7 @@ set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciSubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciGroupSubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciFieldGroupSubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDmonSubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDiagSubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciStatsSubSystem.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${PROJECT_SOURCE_DIR}/common/rdc_utils.cc")
set(RDCI_SRC_LIST ${RDCI_SRC_LIST}
@@ -0,0 +1,61 @@
/*
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_
#define RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_
#include <signal.h>
#include <map>
#include <vector>
#include <string>
#include "RdciSubSystem.h"
namespace amd {
namespace rdc {
// rdci sub-command "diag": parses the diag command line and prints the
// results of a diagnostic run for one GPU group.
class RdciDiagSubSystem: public RdciSubSystem {
 public:
    RdciDiagSubSystem();
    ~RdciDiagSubSystem();

    // Parse the command line options of the diag sub-command.
    void parse_cmd_opts(int argc, char ** argv) override;

    // Execute the operation selected by parse_cmd_opts().
    void process() override;

 private:
    // Print the usage text of the diag sub-command.
    void show_help() const;

    // Map a test case id to the label printed in the report.
    std::string get_test_name(
        rdc_diag_test_cases_t test_case) const;

    // Operation selected on the command line.
    enum OPERATIONS {
        DIAG_UNKNOWN = 0,
        DIAG_HELP,
        DIAG_RUN,
    } diag_ops_;

    // GPU group to diagnose. Value-initialised so the member never holds an
    // indeterminate value before the command line has been parsed.
    rdc_gpu_group_t group_id_{};

    // Diagnostic level requested on the command line.
    rdc_diag_level_t run_level_;
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_
@@ -0,0 +1,208 @@
/*
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "RdciDiagSubSystem.h"

#include <assert.h>
#include <getopt.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>

#include <ctime>
#include <iomanip>
#include <limits>
#include <queue>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include "rdc_lib/rdc_common.h"
#include "common/rdc_utils.h"
#include "common/rdc_fields_supported.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
namespace amd {
namespace rdc {
// By default the sub-command runs diagnostics at the short level; both
// settings can be overridden later by parse_cmd_opts().
RdciDiagSubSystem::RdciDiagSubSystem()
    : diag_ops_(DIAG_RUN),
      run_level_(RDC_DIAG_LVL_SHORT) {}
// Nothing to release beyond what the members and base class clean up.
RdciDiagSubSystem::~RdciDiagSubSystem() = default;
// Parse the command line of the diag sub-command into the subsystem state.
//
// Recognised options: --host <addr>, -h/--help, -u/--unauth,
// -g/--group-id <n> and -r/--run-level <n>. Parsing stops as soon as the
// help option is seen. A group id is mandatory otherwise.
//
// Throws RdcException(RDC_ST_BAD_PARAMETER) after printing the usage text
// when an option is unknown, when the -g/-r value is not a number or does
// not fit in an int, or when no group id was given.
void RdciDiagSubSystem::parse_cmd_opts(int argc, char ** argv) {
    const int HOST_OPTIONS = 1000;
    const struct option long_options[] = {
        {"host", required_argument, nullptr, HOST_OPTIONS},
        {"help", optional_argument, nullptr, 'h'},
        {"unauth", optional_argument, nullptr, 'u'},
        {"run-level", required_argument, nullptr, 'r'},
        {"group-id", required_argument, nullptr, 'g'},
        { nullptr, 0 , nullptr, 0 }
    };

    bool group_id_set = false;
    int option_index = 0;
    int opt = 0;

    while ((opt = getopt_long(argc, argv, "hug:r:",
                long_options, &option_index)) != -1) {
        switch (opt) {
            case HOST_OPTIONS:
                ip_port_ = optarg;
                break;
            case 'h':
                diag_ops_ = DIAG_HELP;
                return;
            case 'u':
                use_auth_ = false;
                break;
            case 'g':
                if (!IsNumber(optarg)) {
                    show_help();
                    throw RdcException(RDC_ST_BAD_PARAMETER,
                        "The group id needs to be a number");
                }
                // std::stoi still throws for values that do not fit in an
                // int; report that like any other bad parameter instead of
                // leaking a std exception to the caller.
                try {
                    group_id_ = std::stoi(optarg);
                } catch (const std::logic_error&) {
                    show_help();
                    throw RdcException(RDC_ST_BAD_PARAMETER,
                        "The group id is not a valid integer");
                }
                group_id_set = true;
                break;
            case 'r':
                if (!IsNumber(optarg)) {
                    show_help();
                    throw RdcException(RDC_ST_BAD_PARAMETER,
                        "The run level needs to be a number");
                }
                try {
                    run_level_ =
                        static_cast<rdc_diag_level_t>(std::stoi(optarg));
                } catch (const std::logic_error&) {
                    show_help();
                    throw RdcException(RDC_ST_BAD_PARAMETER,
                        "The run level is not a valid integer");
                }
                break;
            default:
                show_help();
                throw RdcException(RDC_ST_BAD_PARAMETER,
                    "Unknown command line options");
        }
    }

    if (!group_id_set) {
        show_help();
        throw RdcException(RDC_ST_BAD_PARAMETER,
            "Need to specify the GPU group id");
    }
}
// Print the usage text of the diag sub-command to standard output. The
// options shared by all sub-commands are printed by show_common_usage()
// between the synopsis and the diag specific flags.
void RdciDiagSubSystem::show_help() const {
    // Keep every output line within roughly 80 columns for readability.

    // Title, synopsis and the heading of the flag list.
    std::cout << " diag -- Used to run diagnostic for GPUs.\n\n"
                 "Usage\n"
                 " rdci diag [--host <IP/FQDN>:port] [-u] -g <groupId>"
                 " -r <runLevel>\n"
                 "\nFlags:\n";

    show_common_usage();

    // Flags specific to the diag sub-command.
    std::cout << " -g --group-id The GPU group to diagnose"
                 " on the specified host.\n"
                 " -r --run-level level Integer representing test"
                 " run levels [default = 1].\n"
                 " level 1: Tests take a "
                 "few seconds to run.\n"
                 " level 2: Tests take a "
                 "few minutes to run (To be implemented).\n"
                 " level 3: Tests take "
                 "half an hour to run (To be implemented).\n";
}
// Return the label printed for a diagnostic test case.
//
// @param test_case  Test case id to describe.
// @return The label registered for test_case, or "Unknown Test" when the id
//         has no entry in the table.
std::string RdciDiagSubSystem::get_test_name(
        rdc_diag_test_cases_t test_case) const {
    // Built once on first use: the table is immutable and this function is
    // called several times per test case while the report is printed.
    static const std::map<rdc_diag_test_cases_t, std::string> test_desc = {
        {RDC_DIAG_COMPUTE_PROCESS, "No compute process"},
        {RDC_DIAG_SDMA_QUEUE, "SDMA Queue ready"},
        {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
        {RDC_DIAG_VRAM_CHECK, "VRAM check"},
        {RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
        {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
        {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"},
        {RDC_DIAG_TEST_LAST, "Unknown"}
    };

    const auto test_name = test_desc.find(test_case);
    if (test_name == test_desc.end()) {
        return "Unknown Test";
    }

    return test_name->second;
}
void RdciDiagSubSystem::process() {
if (diag_ops_ == DIAG_HELP ||
diag_ops_ == DIAG_UNKNOWN) {
show_help();
return;
}
rdc_status_t result;
rdc_diag_response_t response;
result = rdc_diagnostic_run(rdc_handle_, group_id_,
run_level_, &response);
if (result != RDC_ST_OK) {
std::string error_msg = rdc_status_string(result);
throw RdcException(result, error_msg.c_str());
}
// (3) Check diagnostic results
for (uint32_t i=0 ; i < response.results_count; i++) {
const rdc_diag_test_result_t& test_result =
response.diag_info[i];
std::cout << std::setw(26) << std::left
<< get_test_name(test_result.test_case) + ":"
<< rdc_diagnostic_result_string(test_result.status) << "\n";
}
// (4) diagnostic detail information
std::cout <<" =============== Diagnostic Details ==================\n";
for (uint32_t i=0 ; i < response.results_count; i++) {
const rdc_diag_test_result_t& test_result =
response.diag_info[i];
if (test_result.info[0] != '\0') {
std::cout << std::setw(26) << std::left
<< get_test_name(test_result.test_case) + ":"
<< test_result.info << "\n";
}
for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) {
const rdc_diag_per_gpu_result_t& gpu_result
= test_result.gpu_results[j];
if (strlen(gpu_result.gpu_result.msg) > 0) {
std::cout << " GPU " << gpu_result.gpu_index << " " <<
gpu_result.gpu_result.msg << "\n";
}
}
}
}
} // namespace rdc
} // namespace amd
+4 -1
مشاهده پرونده
@@ -27,6 +27,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcException.h"
#include "RdciDiscoverySubSystem.h"
#include "RdciDmonSubSystem.h"
#include "RdciDiagSubSystem.h"
#include "RdciFieldGroupSubSystem.h"
#include "RdciGroupSubSystem.h"
#include "RdciStatsSubSystem.h"
@@ -35,7 +36,7 @@ THE SOFTWARE.
int main(int argc, char ** argv) {
const std::string usage_help =
"Usage:\trdci <subsystem>\nsubsystem: discovery, dmon, group, "
"fieldgroup, stats\n";
"fieldgroup, stats, diag\n";
if (argc <= 1) {
std::cout << usage_help;
@@ -49,6 +50,8 @@ int main(int argc, char ** argv) {
subsystem.reset(new amd::rdc::RdciDiscoverySubSystem());
} else if (subsystem_name == "dmon") {
subsystem.reset(new amd::rdc::RdciDmonSubSystem());
} else if (subsystem_name == "diag") {
subsystem.reset(new amd::rdc::RdciDiagSubSystem());
} else if (subsystem_name == "group") {
subsystem.reset(new amd::rdc::RdciGroupSubSystem());
} else if (subsystem_name == "fieldgroup") {
@@ -120,6 +120,14 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
const ::rdc::Empty* request,
::rdc::RemoveAllJobResponse* reply) override;
::grpc::Status DiagnosticRun(::grpc::ServerContext* context,
const ::rdc::DiagnosticRunRequest* request,
::rdc::DiagnosticRunResponse* reply) override;
::grpc::Status DiagnosticTestCaseRun(::grpc::ServerContext* context,
const ::rdc::DiagnosticTestCaseRunRequest* request,
::rdc::DiagnosticTestCaseRunResponse* reply) override;
private:
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
::rdc::GpuUsageInfo* target);
+107 -1
مشاهده پرونده
@@ -559,7 +559,6 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
return ::grpc::Status::OK;
}
::grpc::Status RdcAPIServiceImpl::RemoveJob(
::grpc::ServerContext* context,
const ::rdc::RemoveJobRequest* request,
@@ -591,7 +590,114 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src,
return ::grpc::Status::OK;
}
// gRPC handler: run the diagnostics of the requested level on a GPU group
// and serialise the results into the reply.
//
// The RDC status of the run is carried in reply->status(); the gRPC status
// is OK even when the run itself fails. INTERNAL is returned only when
// request or reply is null.
::grpc::Status RdcAPIServiceImpl::DiagnosticRun(
        ::grpc::ServerContext* context,
        const ::rdc::DiagnosticRunRequest* request,
        ::rdc::DiagnosticRunResponse* reply) {
    (void)(context);
    if (!reply || !request) {
        return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
    }

    // Zero-initialise the response: its message buffers are serialised
    // below as C strings, so a field the library leaves unset must read as
    // empty rather than as indeterminate stack memory.
    rdc_diag_response_t diag_response{};
    rdc_status_t result = rdc_diagnostic_run(
        rdc_handle_,
        request->group_id(),
        static_cast<rdc_diag_level_t>(request->level()),
        &diag_response);

    reply->set_status(result);
    if (result != RDC_ST_OK) {
        return ::grpc::Status::OK;
    }

    ::rdc::DiagnosticResponse* to_response = reply->mutable_response();
    to_response->set_results_count(diag_response.results_count);

    for (uint32_t i = 0; i < diag_response.results_count; i++) {
        const rdc_diag_test_result_t& test_result =
            diag_response.diag_info[i];
        ::rdc::DiagnosticTestResult* to_diag_info =
            to_response->add_diag_info();
        to_diag_info->set_status(test_result.status);

        // details
        auto to_details = to_diag_info->mutable_details();
        to_details->set_code(test_result.details.code);
        to_details->set_msg(test_result.details.msg);

        to_diag_info->set_test_case(
            static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase>
            (test_result.test_case));
        to_diag_info->set_per_gpu_result_count(
            test_result.per_gpu_result_count);

        // gpu_results: one repeated entry per reported GPU result
        for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) {
            auto to_result = to_diag_info->add_gpu_results();
            const rdc_diag_per_gpu_result_t& cur_result =
                test_result.gpu_results[j];
            to_result->set_gpu_index(cur_result.gpu_index);
            auto to_per_detail = to_result->mutable_gpu_result();
            to_per_detail->set_code(cur_result.gpu_result.code);
            to_per_detail->set_msg(cur_result.gpu_result.msg);
        }

        to_diag_info->set_info(test_result.info);
    }

    return ::grpc::Status::OK;
}
// gRPC handler: run one diagnostic test case on a GPU group and serialise
// its result into the reply.
//
// The RDC status of the run is carried in reply->status(); the gRPC status
// is OK even when the test case itself fails. INTERNAL is returned only
// when request or reply is null.
::grpc::Status RdcAPIServiceImpl::DiagnosticTestCaseRun(
        ::grpc::ServerContext *context,
        const ::rdc::DiagnosticTestCaseRunRequest *request,
        ::rdc::DiagnosticTestCaseRunResponse *reply) {
    (void)(context);
    if (!reply || !request) {
        return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
    }

    // Zero-initialise the result: its message buffers are serialised below
    // as C strings, so a field the library leaves unset must read as empty
    // rather than as indeterminate stack memory.
    rdc_diag_test_result_t test_result{};
    rdc_status_t result = rdc_test_case_run(
        rdc_handle_,
        request->group_id(),
        static_cast<rdc_diag_test_cases_t>(request->test_case()),
        &test_result);

    reply->set_status(result);
    if (result != RDC_ST_OK) {
        return ::grpc::Status::OK;
    }

    ::rdc::DiagnosticTestResult *to_diag_info =
        reply->mutable_result();
    to_diag_info->set_status(test_result.status);

    // details
    auto to_details = to_diag_info->mutable_details();
    to_details->set_code(test_result.details.code);
    to_details->set_msg(test_result.details.msg);

    to_diag_info->set_test_case(
        static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase>(
            test_result.test_case));
    to_diag_info->set_per_gpu_result_count(
        test_result.per_gpu_result_count);

    // gpu_results: one repeated entry per reported GPU result
    for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) {
        auto to_result = to_diag_info->add_gpu_results();
        const rdc_diag_per_gpu_result_t &cur_result =
            test_result.gpu_results[j];
        to_result->set_gpu_index(cur_result.gpu_index);
        auto to_per_detail = to_result->mutable_gpu_result();
        to_per_detail->set_code(cur_result.gpu_result.code);
        to_per_detail->set_msg(cur_result.gpu_result.msg);
    }

    to_diag_info->set_info(test_result.info);

    return ::grpc::Status::OK;
}
} // namespace rdc
} // namespace amd