diff --git a/example/diagnostic_example.cc b/example/diagnostic_example.cc index eb47f008e1..b34db7c3a0 100644 --- a/example/diagnostic_example.cc +++ b/example/diagnostic_example.cc @@ -145,6 +145,23 @@ int main(int, char **) { } } } + + // (5) run one test case + std::cout <<" ============== Run individual diagnostic test ===========\n"; + rdc_diag_test_result_t test_result; + result = rdc_test_case_run(rdc_handle, group_id, + RDC_DIAG_COMPUTE_PROCESS, &test_result); + + if (result != RDC_ST_OK) { + std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: " + << rdc_status_string(result); + goto cleanup; + } + + std::cout << std::setw(22) << std::left + << get_test_name(RDC_DIAG_COMPUTE_PROCESS) + ":" + << rdc_diagnostic_result_string(test_result.status) << "\n"; + // Cleanup consists of shutting down RDC. cleanup: std::cout << "Cleaning up.\n"; diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 4ef192ecd4..0198c07d96 100755 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -980,6 +980,27 @@ rdc_status_t rdc_diagnostic_run( rdc_diag_level_t level, rdc_diag_response_t* response); +/** + * @brief Run one diagnostic test case + * + * @details Run a specific diagnostic test case. + * + * @param[in] p_rdc_handle The RDC handler. + * + * @param[in] group_id The GPU group id. + * + * @param[in] test_case The test case to run. + * + * @param[inout] result The results of the test. + * + * @retval ::RDC_ST_OK is returned upon successful call. 
+ */ +rdc_status_t rdc_test_case_run( + rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result); + /** * @brief Get a description of a provided RDC error status * diff --git a/include/rdc_lib/RdcDiagnosticLibInterface.h b/include/rdc_lib/RdcDiagnosticLibInterface.h index d833d36648..85d9930ccc 100644 --- a/include/rdc_lib/RdcDiagnosticLibInterface.h +++ b/include/rdc_lib/RdcDiagnosticLibInterface.h @@ -36,7 +36,8 @@ rdc_status_t rdc_diag_test_cases_query( uint32_t* test_case_count); // Run a specific test case -rdc_status_t rdc_test_case_run( + +rdc_status_t rdc_diag_test_case_run( rdc_diag_test_cases_t test_case, uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, diff --git a/include/rdc_lib/RdcHandler.h b/include/rdc_lib/RdcHandler.h index a51eb90add..8cda0c0531 100644 --- a/include/rdc_lib/RdcHandler.h +++ b/include/rdc_lib/RdcHandler.h @@ -86,6 +86,11 @@ class RdcHandler { rdc_diag_level_t level, rdc_diag_response_t* response) = 0; + virtual rdc_status_t rdc_test_case_run( + rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) = 0; + // Control API virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0; diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index 12113aad20..5f1b9662af 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -91,6 +91,11 @@ class RdcEmbeddedHandler: public RdcHandler { rdc_gpu_group_t group_id, rdc_diag_level_t level, rdc_diag_response_t* response) override; + rdc_status_t rdc_test_case_run( + rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) override; + // Control API rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; diff --git a/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h b/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h new file mode 
100644 index 0000000000..cc275f43b3 --- /dev/null +++ b/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h @@ -0,0 +1,70 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_ +#include +#include +#include "rdc_lib/rdc_common.h" +#include "rdc/rdc.h" +#include "rocm_smi/rocm_smi.h" + +namespace amd { +namespace rdc { + +class RdcSmiDiagnosticImpl { + public: + RdcSmiDiagnosticImpl(); + + rdc_status_t check_rsmi_process_info( + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result); + rdc_status_t check_rsmi_topo_info( + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result); + rdc_status_t check_rsmi_param_info( + uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result); + + private: + rdc_diag_result_t check_temperature_level(uint32_t gpu_index + , rsmi_temperature_type_t type + , char msg[MAX_DIAG_MSG_LENGTH] + , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); + std::string get_temperature_string( + rsmi_temperature_type_t type) const; + + rdc_diag_result_t check_voltage_level(uint32_t gpu_index + , rsmi_voltage_type_t type + , char msg[MAX_DIAG_MSG_LENGTH] + , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); + std::string get_voltage_string( + rsmi_voltage_type_t type) const; +}; + +typedef std::shared_ptr RdcSmiDiagnosticPtr; + +} // namespace rdc +} // namespace amd +#endif // INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_ diff --git a/include/rdc_lib/impl/RdcSmiLib.h b/include/rdc_lib/impl/RdcSmiLib.h index 6a18621a69..6c92bde387 100644 --- a/include/rdc_lib/impl/RdcSmiLib.h +++ b/include/rdc_lib/impl/RdcSmiLib.h @@ -27,6 +27,7 @@ THE SOFTWARE. 
#include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcTelemetry.h" #include "rdc_lib/RdcDiagnostic.h" +#include "rdc_lib/impl/RdcSmiDiagnosticImpl.h" namespace amd { namespace rdc { @@ -71,6 +72,7 @@ class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic { private: RdcMetricFetcherPtr metric_fetcher_; bool bulk_fetch_enabled_; + RdcSmiDiagnosticPtr smi_diag_; }; typedef std::shared_ptr RdcSmiLibPtr; diff --git a/include/rdc_lib/impl/RdcStandaloneHandler.h b/include/rdc_lib/impl/RdcStandaloneHandler.h index 009775d35a..4cb4662bc8 100644 --- a/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -85,6 +85,10 @@ class RdcStandaloneHandler: public RdcHandler { rdc_gpu_group_t group_id, rdc_diag_level_t level, rdc_diag_response_t* response) override; + rdc_status_t rdc_test_case_run( + rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) override; // Control RdcAPI rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; diff --git a/protos/rdc.proto b/protos/rdc.proto index 0736a23706..0f4e8d1d33 100755 --- a/protos/rdc.proto +++ b/protos/rdc.proto @@ -214,6 +214,18 @@ service RdcAPI { // rdc_status_t rdc_job_remove_all() rpc RemoveAllJob(Empty) returns (RemoveAllJobResponse) {} + + // rdc_status_t rdc_diagnostic_run( + // rdc_gpu_group_t group_id, + // rdc_diag_level_t level, + // rdc_diag_response_t* response); + rpc DiagnosticRun(DiagnosticRunRequest) returns (DiagnosticRunResponse) {} + + // rdc_status_t rdc_test_case_run( + // rdc_gpu_group_t group_id, + // rdc_diag_test_cases_t test_case, + // rdc_diag_test_result_t* result); + rpc DiagnosticTestCaseRun(DiagnosticTestCaseRunRequest) returns (DiagnosticTestCaseRunResponse) {} } message Empty { @@ -459,4 +471,66 @@ message RemoveJobResponse { message RemoveAllJobResponse { uint32 status = 1; +} + +message DiagnosticRunRequest { + uint32 group_id = 1; + uint32 level = 2; +} + +message DiagnosticDetail { + 
string msg = 1; + uint32 code = 2; +} + +message DiagnosticPerGpuResult { + uint32 gpu_index = 1; + DiagnosticDetail gpu_result = 2; +} + +message DiagnosticTestResult { + uint32 status = 1; + DiagnosticDetail details = 2; + enum DiagnosticTestCase { + COMPUTE_PROCESS = 0; + SDMA_QUEUE = 1; + COMPUTE_QUEUE = 2; + VRAM_CHECK = 3; + SYS_MEM_CHECK = 4; + NODE_TOPOLOGY = 5; + GPU_PARAMETERS = 6; + }; + DiagnosticTestCase test_case = 3; + uint32 per_gpu_result_count = 4; + repeated DiagnosticPerGpuResult gpu_results = 5; + string info = 6; +} + +message DiagnosticResponse { + uint32 results_count = 1; + repeated DiagnosticTestResult diag_info = 2; +} + +message DiagnosticRunResponse { + uint32 status = 1; + DiagnosticResponse response = 2; +} + +message DiagnosticTestCaseRunRequest { + uint32 group_id = 1; + enum TestCaseType { + COMPUTE_PROCESS = 0; + SDMA_QUEUE = 1; + COMPUTE_QUEUE = 2; + VRAM_CHECK = 3; + SYS_MEM_CHECK = 4; + NODE_TOPOLOGY = 5; + GPU_PARAMETERS = 6; + }; + TestCaseType test_case = 2; +} + +message DiagnosticTestCaseRunResponse { + uint32 status = 1; + DiagnosticTestResult result = 2; } \ No newline at end of file diff --git a/rdc_libs/CMakeLists.txt b/rdc_libs/CMakeLists.txt index 6d27f93b01..7ded02838c 100755 --- a/rdc_libs/CMakeLists.txt +++ b/rdc_libs/CMakeLists.txt @@ -161,6 +161,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcRasLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiLib.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcTelemetryModule.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcDiagnosticModule.cc") +set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcSmiDiagnosticImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcModuleMgrImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RdcNotificationImpl.cc") set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/rdc/src/RsmiUtils.cc") @@ -187,6 +188,7 
@@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcTel set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnostic.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcDiagnosticLibInterface.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcDiagnosticModule.h") +set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcSmiDiagnosticImpl.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/RdcNotification.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RdcNotificationImpl.h") set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${RDC_LIB_INC_DIR}/rdc_lib/impl/RsmiUtils.h") diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index db1600bc95..fb66fb77b7 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -334,6 +334,18 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_diagnostic_run(group_id, level, response); } +rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, + rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)-> + rdc_test_case_run(group_id, test_case, result); +} + const char* rdc_status_string(rdc_status_t result) { switch (result) { case RDC_ST_OK: diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 6843699075..cabffe0f95 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -67,7 +67,7 @@ namespace amd { namespace rdc { // TODO(bill_liu): make it configurable -const uint32_t METIC_UPDATE_FREQUENCY = 100; // 100ms by default +const uint32_t METIC_UPDATE_FREQUENCY = 1000; // 1000 microseconds by default 
RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode): group_settings_(new RdcGroupSettingsImpl()) @@ -386,6 +386,9 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run( rdc_gpu_group_t group_id, rdc_diag_level_t level, rdc_diag_response_t* response) { + if (!response) { + return RDC_ST_BAD_PARAMETER; + } // Get GPU group information rdc_group_info_t rdc_group_info; @@ -397,6 +400,24 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run( return diag->rdc_diagnostic_run(rdc_group_info, level, response); } +rdc_status_t RdcEmbeddedHandler::rdc_test_case_run( + rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t* result) { + if (!result) { + return RDC_ST_BAD_PARAMETER; + } + // Get GPU group information + rdc_group_info_t rdc_group_info; + rdc_status_t status = rdc_group_gpu_get_info( + group_id, &rdc_group_info); + if (status != RDC_ST_OK) return status; + + auto diag = rdc_module_mgr_->get_diagnostic_module(); + return diag->rdc_test_case_run(test_case, rdc_group_info.entity_ids, + rdc_group_info.count, result); +} + // Control API rdc_status_t RdcEmbeddedHandler::rdc_field_update_all( uint32_t wait_for_update) { diff --git a/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc b/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc index 385a97c771..8e5a234c07 100644 --- a/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricsUpdaterImpl.cc @@ -39,6 +39,7 @@ RdcMetricsUpdaterImpl::RdcMetricsUpdaterImpl( // Make the listen time for notifications a relatively long time. // There's no point in starting/stopping it constantly. 
static const uint32_t kRdcFieldListenNotifTime_mS = 10000; +static const uint32_t kRdcEventCheck_ms = 1000; void RdcMetricsUpdaterImpl::start() { if (started_) { @@ -49,7 +50,7 @@ void RdcMetricsUpdaterImpl::start() { while (started_) { watch_table_->rdc_field_listen_notif(kRdcFieldListenNotifTime_mS); std::this_thread::sleep_for( - std::chrono::microseconds(_check_frequency)); + std::chrono::milliseconds(kRdcEventCheck_ms)); } }); updater_ = std::async(std::launch::async, [this](){ diff --git a/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc b/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc new file mode 100644 index 0000000000..79639b5310 --- /dev/null +++ b/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc @@ -0,0 +1,575 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#include "rdc_lib/impl/RdcSmiDiagnosticImpl.h"
+#include <map>
+#include <string>
+#include <vector>
+#include "rdc_lib/rdc_common.h"
+#include "rdc_lib/RdcLogger.h"
+#include "rdc_lib/impl/RsmiUtils.h"
+
+namespace amd {
+namespace rdc {
+RdcSmiDiagnosticImpl::RdcSmiDiagnosticImpl() {
+}
+
+// Compute-process diagnostic: reports the compute processes known to
+// rocm_smi_lib and the GPUs they occupy.
+//   gpu_index, gpu_count  GPUs of the group under test.
+//   result                Cleared, then filled with the status, a summary
+//                         in info and one entry per GPU hosting a process.
+// Status is FAIL when a GPU of the group hosts a process, WARN when
+// processes exist but none could be mapped to a GPU, PASS otherwise.
+// Returns RDC_ST_BAD_PARAMETER for a null result, the translated rocm_smi
+// error when the process list cannot be read, RDC_ST_OK otherwise.
+rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(
+    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
+    uint32_t gpu_count,
+    rdc_diag_test_result_t* result) {
+  if (result == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  *result = {};
+  result->test_case = RDC_DIAG_COMPUTE_PROCESS;
+  result->status = RDC_DIAG_RESULT_SKIP;
+  result->per_gpu_result_count = 0;
+  rsmi_status_t err = RSMI_STATUS_SUCCESS;
+  uint32_t num_items = 0;
+  // First call without a buffer: only the process count is requested.
+  err = rsmi_compute_process_info_get(
+      nullptr, &num_items);
+  if (err != RSMI_STATUS_SUCCESS) {
+    RDC_LOG(RDC_ERROR,
+        "Fail to get process information: " << err);
+    strncpy_with_null(result->info,
+        "Fail to retreive process information from rocm_smi_lib",
+        MAX_DIAG_MSG_LENGTH);
+    return Rsmi2RdcError(err);
+  }
+
+  // No process found
+  if (num_items == 0) {
+    result->status = RDC_DIAG_RESULT_PASS;
+    result->per_gpu_result_count = 0;
+    strncpy_with_null(result->info,
+        "Do not have any compute process running on any devices",
+        MAX_DIAG_MSG_LENGTH);
+    return RDC_ST_OK;
+  }
+
+  std::string info = std::to_string(num_items) +
+      " compute process is using devices.";
+  // Find details of the process running on each GPU
+  std::vector<rsmi_process_info_t> procs(num_items);
+  err = rsmi_compute_process_info_get(procs.data(), &num_items);
+  if (err != RSMI_STATUS_SUCCESS) {
+    RDC_LOG(RDC_INFO,
+        "Fail to get process detail information: " << err);
+    strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH);
+    return Rsmi2RdcError(err);
+  }
+
+  // GPU index -> ids of the processes running on that GPU.
+  std::map<uint32_t, std::vector<uint32_t>> pids_per_gpu;
+  for (uint32_t i=0; i < num_items; i++) {
+    info += " Process: " + std::to_string(procs[i].process_id)
+        + ", pasid: " + std::to_string(procs[i].pasid)
+        + ", vram_usage: " + std::to_string(procs[i].vram_usage)
+        + ", sdma_usage: " + std::to_string(procs[i].sdma_usage)
+        + ", cu_occupancy: " + std::to_string(procs[i].cu_occupancy)
+        + ".";
+
+    // Get the num_devices the process is running
+    uint32_t num_devices = 0;
+    err = rsmi_compute_process_gpus_get(procs[i].process_id,
+        nullptr, &num_devices);
+    if (err != RSMI_STATUS_SUCCESS || num_devices == 0) {
+      RDC_LOG(RDC_INFO,
+          "Fail to get process GPUs detail information: " << err);
+      continue;
+    }
+
+    // Get the details of devices
+    std::vector<uint32_t> device_details(num_devices);
+    err = rsmi_compute_process_gpus_get(procs[i].process_id,
+        device_details.data(), &num_devices);
+    if (err != RSMI_STATUS_SUCCESS) {
+      RDC_LOG(RDC_INFO,
+          "Fail to get process GPUs detail information: " << err);
+      continue;
+    }
+
+    // Add process information in per GPU structure; operator[] creates
+    // the entry for a GPU the first time it is seen.
+    for (uint32_t j=0; j < num_devices; j++) {
+      pids_per_gpu[device_details[j]].push_back(procs[i].process_id);
+    }
+  }  // end for (uint32_t i=0 ...)
+
+  if (pids_per_gpu.size() == 0) {
+    result->status = RDC_DIAG_RESULT_WARN;
+    info += " Cannot detect the processes running in which devices.";
+  } else {
+    result->status = RDC_DIAG_RESULT_PASS;  // pass by default
+  }
+
+  // Mark as fail
+  for (uint32_t i=0; i < gpu_count; i++) {
+    if (pids_per_gpu.find(gpu_index[i]) != pids_per_gpu.end()) {
+      result->status = RDC_DIAG_RESULT_FAIL;
+      break;
+    }
+  }
+
+  // Set per GPU information
+  strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH);
+  for (const auto& gpu_pids : pids_per_gpu) {
+    // Check the capacity before writing, so that a table that is exactly
+    // full is not reported as an error.
+    if (result->per_gpu_result_count >= RDC_MAX_NUM_DEVICES) {
+      RDC_LOG(RDC_ERROR, "Found more GPUs than " << RDC_MAX_NUM_DEVICES);
+      break;
+    }
+    auto& per_gpu = result->gpu_results[result->per_gpu_result_count];
+    per_gpu.gpu_index = gpu_pids.first;
+    per_gpu.gpu_result.code = 0;
+    std::string per_gpu_msg = "Running process:";
+    for (const auto pid : gpu_pids.second) {
+      per_gpu_msg += " " + std::to_string(pid);
+    }
+
+    strncpy_with_null(per_gpu.gpu_result.msg,
+        per_gpu_msg.c_str(), MAX_DIAG_MSG_LENGTH);
+
+    result->per_gpu_result_count++;
+  }
+
+  return RDC_ST_OK;
+}
+
+// Human-readable name of a temperature sensor; "Unknown" for any sensor
+// other than edge, junction and memory.
+std::string RdcSmiDiagnosticImpl::get_temperature_string(
+    rsmi_temperature_type_t type) const {
+  switch (type) {
+    case RSMI_TEMP_TYPE_EDGE:
+      return "Edge";
+    case RSMI_TEMP_TYPE_JUNCTION:
+      return "Junction";
+    case RSMI_TEMP_TYPE_MEMORY:
+      return "Memory";
+    default:
+      return "Unknown";
+  }
+}
+
+// Human-readable name of a voltage sensor; "Unknown" for any sensor other
+// than Vddgfx.
+std::string RdcSmiDiagnosticImpl::get_voltage_string(
+    rsmi_voltage_type_t type) const {
+  switch (type) {
+    case RSMI_VOLT_TYPE_VDDGFX:
+      return "Vddgfx voltage";
+    default:
+      return "Unknown";
+  }
+}
+
+// Topology diagnostic: queries the link weight between every ordered pair
+// of distinct GPUs in the group and lists the weights in result->info.
+// Status is PASS when every query succeeds. On the first failing query the
+// status is FAIL, the rocm_smi error goes to details.code and
+// RDC_ST_MSI_ERROR is returned. Returns RDC_ST_BAD_PARAMETER for a null
+// result.
+rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(
+    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
+    uint32_t gpu_count,
+    rdc_diag_test_result_t* result) {
+  if (result == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  *result = {};
+  result->test_case = RDC_DIAG_NODE_TOPOLOGY;
+
+  result->status = RDC_DIAG_RESULT_SKIP;
+  result->per_gpu_result_count = 0;
+  rsmi_status_t err = RSMI_STATUS_SUCCESS;
+  std::string info = "";
+
+  for (uint32_t i=0; i < gpu_count; i++) {
+    for (uint32_t j=0; j < gpu_count; j++) {
+      if (gpu_index[i] == gpu_index[j]) continue;
+
+      uint64_t weight;
+      err = rsmi_topo_get_link_weight(
+          gpu_index[i], gpu_index[j], &weight);
+      if (err != RSMI_STATUS_SUCCESS) {
+        result->status = RDC_DIAG_RESULT_FAIL;
+        result->details.code = err;
+        std::string err_info = "rsmi_topo_get_link_weight(";
+        err_info += std::to_string(gpu_index[i]) + ",";
+        err_info += std::to_string(gpu_index[j]) + ", &weight)";
+        err_info += " fail";
+        strncpy_with_null(result->details.msg, err_info.c_str(),
+            MAX_DIAG_MSG_LENGTH);
+        strncpy_with_null(result->info, err_info.c_str(),
+            MAX_DIAG_MSG_LENGTH);
+        return RDC_ST_MSI_ERROR;
+      }
+
+      info += std::to_string(gpu_index[i]) + "=>";
+      info += std::to_string(gpu_index[j]) + " weight:";
+      info += std::to_string(weight) + " ";
+    }
+  }
+  if (info != "") {
+    strncpy_with_null(result->info, info.c_str(),
+        MAX_DIAG_MSG_LENGTH);
+  } else {
+    strncpy_with_null(result->info, "No link detected.",
+        MAX_DIAG_MSG_LENGTH);
+  }
+
+  result->status = RDC_DIAG_RESULT_PASS;
+  return RDC_ST_OK;
+}
+
+// GPU-parameter diagnostic: checks every temperature and voltage sensor of
+// each GPU in the group against its thresholds.
+// result->info accumulates the summary of all GPUs, gpu_results[i] holds
+// the findings of gpu_index[i], and result->status is the worst level any
+// sensor reported. Returns RDC_ST_BAD_PARAMETER for a null result or a
+// gpu_count above RDC_MAX_NUM_DEVICES, RDC_ST_OK otherwise.
+rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(
+    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
+    uint32_t gpu_count,
+    rdc_diag_test_result_t* result) {
+  if (result == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  // gpu_index and gpu_results are both indexed by i below.
+  if (gpu_count > RDC_MAX_NUM_DEVICES) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  *result = {};
+  result->test_case = RDC_DIAG_GPU_PARAMETERS;
+
+  result->status = RDC_DIAG_RESULT_PASS;
+
+  for (uint32_t i=0; i < gpu_count; i++) {
+    // temperature
+    // RSMI_TEMP_TYPE_LAST names the last valid sensor in rocm_smi.h, so
+    // the bound is inclusive.
+    for (uint32_t t = RSMI_TEMP_TYPE_FIRST; t <= RSMI_TEMP_TYPE_LAST; t++) {
+      auto status = check_temperature_level(
+          gpu_index[i], static_cast<rsmi_temperature_type_t>(t),
+          result->info, result->gpu_results[i].gpu_result.msg);
+      // Set to higher error level
+      if (status > result->status) {
+        result->status = status;
+      }
+    }
+
+    // Voltage
+    // Inclusive for the same reason; in rocm_smi.h RSMI_VOLT_TYPE_FIRST
+    // equals RSMI_VOLT_TYPE_LAST, so an exclusive bound checks nothing.
+    for (uint32_t v = RSMI_VOLT_TYPE_FIRST; v <= RSMI_VOLT_TYPE_LAST; v++) {
+      auto status = check_voltage_level(
+          gpu_index[i], static_cast<rsmi_voltage_type_t>(v),
+          result->info, result->gpu_results[i].gpu_result.msg);
+      // Set to higher error level
+      if (status > result->status) {
+        result->status = status;
+      }
+    }
+    // Record the index in this GPU's own slot, next to its message.
+    result->gpu_results[i].gpu_index = gpu_index[i];
+    result->per_gpu_result_count++;
+  }
+  return RDC_ST_OK;
+}
+
+// Compares the current reading of one temperature sensor with the max,
+// min, critical, emergency and critical-min thresholds of that sensor.
+//   msg          Summary text; read, extended and written back.
+//   per_gpu_msg  Per-GPU text; read, extended and written back.
+// Both buffers are truncated to MAX_DIAG_MSG_LENGTH on write back.
+// A threshold that cannot be read is skipped. Returns PASS when the
+// current reading is unavailable (buffers untouched) or no threshold is
+// crossed, WARN for max/min and FAIL for the remaining thresholds.
+// NOTE(review): the "... greater/less than current temperature" texts read
+// inverted relative to the comparison; left unchanged here, confirm
+// intended wording.
+rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
+    uint32_t gpu_index, rsmi_temperature_type_t type
+    , char msg[MAX_DIAG_MSG_LENGTH]
+    , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
+  rdc_diag_result_t result = RDC_DIAG_RESULT_PASS;
+  rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT;
+  rsmi_status_t err = RSMI_STATUS_SUCCESS;
+  int64_t current_temp = 0;
+  std::string info = msg;
+  std::string per_gpu_info = per_gpu_msg;
+
+  err = rsmi_dev_temp_metric_get(gpu_index,
+      type, met, &current_temp);
+
+  if (err != RSMI_STATUS_SUCCESS) return result;
+
+  // Max temperature
+  met = RSMI_TEMP_MAX;
+  int64_t max_temp = 0;
+  err = rsmi_dev_temp_metric_get(gpu_index,
+      type, met, &max_temp);
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_temp >= max_temp) {
+      result = RDC_DIAG_RESULT_WARN;
+      per_gpu_info += "Max ";
+      per_gpu_info += get_temperature_string(type);
+      per_gpu_info += " temperature ";
+      per_gpu_info += std::to_string(max_temp);
+      per_gpu_info += " greater than current temperature ";
+      per_gpu_info += std::to_string(current_temp) +". ";
+      info += get_temperature_string(type) + ": ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " max ";
+      info += get_temperature_string(type);
+      info += " temperature exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " max ";
+      info += get_temperature_string(type);
+      info += " temperature in range. ";
+    }
+  }
+
+  met = RSMI_TEMP_MIN;
+  int64_t min_temp = 0;
+  err = rsmi_dev_temp_metric_get(gpu_index,
+      type, met, &min_temp);
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_temp <= min_temp) {
+      result = RDC_DIAG_RESULT_WARN;
+      per_gpu_info += "Min ";
+      per_gpu_info += get_temperature_string(type);
+      per_gpu_info += " temperature ";
+      per_gpu_info += std::to_string(min_temp);
+      per_gpu_info += " less than current temperature ";
+      per_gpu_info += std::to_string(current_temp) +". ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " min ";
+      info += get_temperature_string(type);
+      info += " temperature exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " min ";
+      info += get_temperature_string(type);
+      info += " temperature in range. ";
+    }
+  }
+
+  met = RSMI_TEMP_CRITICAL;
+  int64_t critical_temp = 0;
+  err = rsmi_dev_temp_metric_get(gpu_index,
+      type, met, &critical_temp);
+  // Compare only when the threshold was actually read.
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_temp >= critical_temp) {
+      result = RDC_DIAG_RESULT_FAIL;
+      per_gpu_info += "Critical ";
+      per_gpu_info += get_temperature_string(type);
+      per_gpu_info += " temperature ";
+      per_gpu_info += std::to_string(critical_temp);
+      per_gpu_info += " greater than current temperature ";
+      per_gpu_info += std::to_string(current_temp) +". ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical ";
+      info += get_temperature_string(type);
+      info += " temperature exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical ";
+      info += get_temperature_string(type);
+      info += " temperature in range. ";
+    }
+  }
+
+  met = RSMI_TEMP_EMERGENCY;
+  int64_t emergency_temp = 0;
+  err = rsmi_dev_temp_metric_get(gpu_index,
+      type, met, &emergency_temp);
+  if (err == RSMI_STATUS_SUCCESS) {
+    // Compare against the emergency threshold just read.
+    if (current_temp >= emergency_temp) {
+      result = RDC_DIAG_RESULT_FAIL;
+      per_gpu_info += "Emergency ";
+      per_gpu_info += get_temperature_string(type);
+      per_gpu_info += " temperature ";
+      per_gpu_info += std::to_string(emergency_temp);
+      per_gpu_info += " greater than current temperature ";
+      per_gpu_info += std::to_string(current_temp) +". ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Emergency ";
+      info += get_temperature_string(type);
+      info += " temperature exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Emergency ";
+      info += get_temperature_string(type);
+      info += " temperature in range. ";
+    }
+  }
+
+  met = RSMI_TEMP_CRIT_MIN;
+  int64_t critical_min_temp = 0;
+  err = rsmi_dev_temp_metric_get(gpu_index,
+      type, met, &critical_min_temp);
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_temp <= critical_min_temp) {
+      result = RDC_DIAG_RESULT_FAIL;
+      per_gpu_info += "Critical Min ";
+      per_gpu_info += get_temperature_string(type);
+      per_gpu_info += " temperature ";
+      per_gpu_info += std::to_string(critical_min_temp);
+      per_gpu_info += " less than current temperature ";
+      per_gpu_info += std::to_string(current_temp) +". ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical Min ";
+      info += get_temperature_string(type);
+      info += " temperature exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical Min ";
+      info += get_temperature_string(type);
+      info += " temperature in range. ";
+    }
+  }
+
+  strncpy_with_null(msg, info.c_str(),
+      MAX_DIAG_MSG_LENGTH);
+  strncpy_with_null(per_gpu_msg, per_gpu_info.c_str(),
+      MAX_DIAG_MSG_LENGTH);
+
+  return result;
+}
+
+
+// Compares the current reading of one voltage sensor with the max, min,
+// critical-max and critical-min thresholds of that sensor.
+//   msg          Summary text; read, extended and written back.
+//   per_gpu_msg  Per-GPU text; read, extended and written back.
+// Both buffers are truncated to MAX_DIAG_MSG_LENGTH on write back.
+// A threshold that cannot be read is skipped. Returns PASS when the
+// current reading is unavailable (buffers untouched) or no threshold is
+// crossed, WARN for max/min and FAIL for the critical thresholds.
+rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(
+    uint32_t gpu_index, rsmi_voltage_type_t type
+    , char msg[MAX_DIAG_MSG_LENGTH]
+    , char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
+  rdc_diag_result_t result = RDC_DIAG_RESULT_PASS;
+  rsmi_voltage_metric_t met = RSMI_VOLT_CURRENT;
+  rsmi_status_t err = RSMI_STATUS_SUCCESS;
+  int64_t current_voltage = 0;
+  std::string info = msg;
+  std::string per_gpu_info = per_gpu_msg;
+
+  err = rsmi_dev_volt_metric_get(gpu_index,
+      type, met, &current_voltage);
+  if (err != RSMI_STATUS_SUCCESS) return result;
+
+  // Max voltage
+  met = RSMI_VOLT_MAX;
+  int64_t max_volt = 0;
+  err = rsmi_dev_volt_metric_get(gpu_index,
+      type, met, &max_volt);
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_voltage >= max_volt) {
+      result = RDC_DIAG_RESULT_WARN;
+      per_gpu_info += "Max ";
+      per_gpu_info += get_voltage_string(type);
+      per_gpu_info += " voltage ";
+      per_gpu_info += std::to_string(max_volt);
+      per_gpu_info += " greater than current voltage ";
+      per_gpu_info += std::to_string(current_voltage) +". ";
+      info += get_voltage_string(type) + ": ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " max ";
+      info += get_voltage_string(type);
+      info += " voltage exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " max ";
+      info += get_voltage_string(type);
+      info += " voltage in range. ";
+    }
+  }
+
+  // Min voltage
+  met = RSMI_VOLT_MIN;
+  int64_t min_volt = 0;
+  err = rsmi_dev_volt_metric_get(gpu_index,
+      type, met, &min_volt);
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_voltage <= min_volt) {
+      result = RDC_DIAG_RESULT_WARN;
+      per_gpu_info += "Min ";
+      per_gpu_info += get_voltage_string(type);
+      per_gpu_info += " voltage ";
+      per_gpu_info += std::to_string(min_volt);
+      per_gpu_info += " less than current voltage ";
+      per_gpu_info += std::to_string(current_voltage) +". ";
+      info += get_voltage_string(type) + ": ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " min ";
+      info += get_voltage_string(type);
+      info += " voltage exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " min ";
+      info += get_voltage_string(type);
+      info += " voltage in range. ";
+    }
+  }
+
+  // Max Critical voltage
+  met = RSMI_VOLT_MAX_CRIT;
+  int64_t critical_max_volt = 0;
+  err = rsmi_dev_volt_metric_get(gpu_index,
+      type, met, &critical_max_volt);
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_voltage >= critical_max_volt) {
+      result = RDC_DIAG_RESULT_FAIL;
+      per_gpu_info += "Critical Max ";
+      per_gpu_info += get_voltage_string(type);
+      per_gpu_info += " voltage ";
+      per_gpu_info += std::to_string(critical_max_volt);
+      per_gpu_info += " greater than current voltage ";
+      per_gpu_info += std::to_string(current_voltage) +". ";
+      info += get_voltage_string(type) + ": ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical max ";
+      info += get_voltage_string(type);
+      info += " voltage exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical max ";
+      info += get_voltage_string(type);
+      info += " voltage in range. ";
+    }
+  }
+
+  // Min Critical voltage
+  met = RSMI_VOLT_MIN_CRIT;
+  int64_t critical_min_volt = 0;
+  err = rsmi_dev_volt_metric_get(gpu_index,
+      type, met, &critical_min_volt);
+  if (err == RSMI_STATUS_SUCCESS) {
+    if (current_voltage <= critical_min_volt) {
+      result = RDC_DIAG_RESULT_FAIL;
+      per_gpu_info += "Critical Min ";
+      per_gpu_info += get_voltage_string(type);
+      per_gpu_info += " voltage ";
+      per_gpu_info += std::to_string(critical_min_volt);
+      per_gpu_info += " less than current voltage ";
+      per_gpu_info += std::to_string(current_voltage) +". ";
+      info += get_voltage_string(type) + ": ";
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical min ";
+      info += get_voltage_string(type);
+      info += " voltage exceeds. ";
+    } else {
+      info += "GPU " + std::to_string(gpu_index);
+      info += " Critical min ";
+      info += get_voltage_string(type);
+      info += " voltage in range. ";
+    }
+  }
+
+  strncpy_with_null(msg, info.c_str(),
+      MAX_DIAG_MSG_LENGTH);
+  strncpy_with_null(per_gpu_msg, per_gpu_info.c_str(),
+      MAX_DIAG_MSG_LENGTH);
+
+  return result;
+}
+
+}  // namespace rdc
+}  // namespace amd
+
diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc
index 8985e8aa1b..dd2c871da7 100644
--- a/rdc_libs/rdc/src/RdcSmiLib.cc
+++ b/rdc_libs/rdc/src/RdcSmiLib.cc
@@ -31,8 +31,9 @@ THE SOFTWARE.
 
 namespace amd {
 namespace rdc {
-RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf): metric_fetcher_(mf),
-    bulk_fetch_enabled_(false) {  // Disable bulk fetch by default.
+RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf): metric_fetcher_(mf) + , bulk_fetch_enabled_(false) + , smi_diag_(std::make_shared()) { char* bulk_env = getenv("RDC_BULK_FETCH_ENABLED"); if (bulk_env != nullptr && strcasecmp(bulk_env, "true") == 0) { RDC_LOG(RDC_DEBUG, "Bulk fetch enabled."); @@ -182,7 +183,14 @@ rdc_status_t RdcSmiLib::rdc_diag_test_cases_query( return RDC_ST_BAD_PARAMETER; } - return RDC_ST_NOT_SUPPORTED; + const std::vector tests { + RDC_DIAG_COMPUTE_PROCESS, + RDC_DIAG_NODE_TOPOLOGY, + RDC_DIAG_GPU_PARAMETERS + }; + std::copy(tests.begin(), tests.end(), test_cases); + *test_case_count = tests.size(); + return RDC_ST_OK; } // Run a specific test case @@ -194,7 +202,19 @@ rdc_status_t RdcSmiLib::rdc_test_case_run( if (result == nullptr) { return RDC_ST_BAD_PARAMETER; } - return RDC_ST_NOT_SUPPORTED; + switch (test_case) { + case RDC_DIAG_COMPUTE_PROCESS: + return smi_diag_->check_rsmi_process_info( + gpu_index, gpu_count, result); + case RDC_DIAG_NODE_TOPOLOGY: + return smi_diag_->check_rsmi_topo_info( + gpu_index, gpu_count, result); + case RDC_DIAG_GPU_PARAMETERS: + return smi_diag_->check_rsmi_param_info( + gpu_index, gpu_count, result); + default: + return RDC_ST_NOT_SUPPORTED; + } } rdc_status_t RdcSmiLib::rdc_diagnostic_run( diff --git a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index 2a5d132d52..0d0c724232 100644 --- a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -559,10 +559,110 @@ rdc_status_t RdcStandaloneHandler::rdc_diagnostic_run( rdc_gpu_group_t group_id, rdc_diag_level_t level, rdc_diag_response_t* response) { - (void)group_id; - (void)level; - (void)response; - return RDC_ST_NOT_SUPPORTED; + if (!response) { + return RDC_ST_BAD_PARAMETER; + } + ::rdc::DiagnosticRunRequest request; + ::rdc::DiagnosticRunResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + 
request.set_level(level); + + ::grpc::Status status = stub_-> + DiagnosticRun(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + auto res = reply.response(); + response->results_count = res.results_count(); + + if (res.diag_info_size() > static_cast(MAX_TEST_CASES)) { + return RDC_ST_BAD_PARAMETER; + } + for (int i = 0; i < res.diag_info_size(); i++) { + const ::rdc::DiagnosticTestResult& result = res.diag_info(i); + rdc_diag_test_result_t& to_result = response->diag_info[i]; + to_result.status = static_cast(result.status()); + + // Set details + to_result.details.code = result.details().code(); + strncpy_with_null(to_result.details.msg, + result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH); + + to_result.test_case = static_cast( + result.test_case()); + to_result.per_gpu_result_count = result.per_gpu_result_count(); + + // Set Result details + if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) { + return RDC_ST_BAD_PARAMETER; + } + for (int j=0; j < result.gpu_results_size(); j++) { + auto per_gpu_result = result.gpu_results(j); + rdc_diag_per_gpu_result_t& to_per_gpu = to_result.gpu_results[j]; + to_per_gpu.gpu_index = per_gpu_result.gpu_index(); + to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code(); + strncpy_with_null(to_per_gpu.gpu_result.msg, + per_gpu_result.gpu_result().msg().c_str(), MAX_DIAG_MSG_LENGTH); + } + strncpy_with_null(to_result.info, + result.info().c_str(), MAX_DIAG_MSG_LENGTH); + } + + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_test_case_run( + rdc_gpu_group_t group_id, + rdc_diag_test_cases_t test_case, + rdc_diag_test_result_t *to_result) { + if (!to_result) { + return RDC_ST_BAD_PARAMETER; + } + ::rdc::DiagnosticTestCaseRunRequest request; + ::rdc::DiagnosticTestCaseRunResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + request.set_test_case(static_cast< + 
::rdc::DiagnosticTestCaseRunRequest_TestCaseType>(test_case)); + + ::grpc::Status status = stub_->DiagnosticTestCaseRun( + &context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) + return err_status; + auto result = reply.result(); + + to_result->status = static_cast(result.status()); + + // Set details + to_result->details.code = result.details().code(); + strncpy_with_null(to_result->details.msg, + result.details().msg().c_str(), MAX_DIAG_MSG_LENGTH); + + to_result->test_case = static_cast( + result.test_case()); + to_result->per_gpu_result_count = result.per_gpu_result_count(); + + // Set Result details + if (result.gpu_results_size() > RDC_MAX_NUM_DEVICES) { + return RDC_ST_BAD_PARAMETER; + } + for (int j = 0; j < result.gpu_results_size(); j++) { + auto per_gpu_result = result.gpu_results(j); + rdc_diag_per_gpu_result_t &to_per_gpu = + to_result->gpu_results[j]; + to_per_gpu.gpu_index = per_gpu_result.gpu_index(); + to_per_gpu.gpu_result.code = per_gpu_result.gpu_result().code(); + strncpy_with_null(to_per_gpu.gpu_result.msg, + per_gpu_result.gpu_result().msg().c_str(), + MAX_DIAG_MSG_LENGTH); + } + strncpy_with_null(to_result->info, + result.info().c_str(), MAX_DIAG_MSG_LENGTH); + + return RDC_ST_OK; } diff --git a/rdci/CMakeLists.txt b/rdci/CMakeLists.txt index dd3083aef0..4831aed7c5 100644 --- a/rdci/CMakeLists.txt +++ b/rdci/CMakeLists.txt @@ -78,6 +78,7 @@ set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciGroupSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciFieldGroupSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDmonSubSystem.cc") +set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciDiagSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${SRC_DIR}/RdciStatsSubSystem.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} "${PROJECT_SOURCE_DIR}/common/rdc_utils.cc") set(RDCI_SRC_LIST ${RDCI_SRC_LIST} 
diff --git a/rdci/include/RdciDiagSubSystem.h b/rdci/include/RdciDiagSubSystem.h new file mode 100644 index 0000000000..9a884c4d94 --- /dev/null +++ b/rdci/include/RdciDiagSubSystem.h @@ -0,0 +1,61 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#ifndef RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_ +#define RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_ +#include +#include +#include +#include +#include "RdciSubSystem.h" + +namespace amd { +namespace rdc { + +class RdciDiagSubSystem: public RdciSubSystem { + public: + RdciDiagSubSystem(); + ~RdciDiagSubSystem(); + void parse_cmd_opts(int argc, char ** argv) override; + void process() override; + + private: + void show_help() const; + + std::string get_test_name( + rdc_diag_test_cases_t test_case) const; + + enum OPERATIONS { + DIAG_UNKNOWN = 0, + DIAG_HELP, + DIAG_RUN, + } diag_ops_; + + rdc_gpu_group_t group_id_; + rdc_diag_level_t run_level_; +}; + + +} // namespace rdc +} // namespace amd + + +#endif // RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_ diff --git a/rdci/src/RdciDiagSubSystem.cc b/rdci/src/RdciDiagSubSystem.cc new file mode 100644 index 0000000000..cd871773e0 --- /dev/null +++ b/rdci/src/RdciDiagSubSystem.cc @@ -0,0 +1,208 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "RdciDiagSubSystem.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rdc_lib/rdc_common.h" +#include "common/rdc_utils.h" +#include "common/rdc_fields_supported.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcException.h" + +namespace amd { +namespace rdc { + + +RdciDiagSubSystem::RdciDiagSubSystem(): diag_ops_(DIAG_RUN) + , run_level_(RDC_DIAG_LVL_SHORT) { +} + +RdciDiagSubSystem::~RdciDiagSubSystem() { +} + +void RdciDiagSubSystem::parse_cmd_opts(int argc, char ** argv) { + const int HOST_OPTIONS = 1000; + const struct option long_options[] = { + {"host", required_argument, nullptr, HOST_OPTIONS}, + {"help", optional_argument, nullptr, 'h'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"run-level", required_argument, nullptr, 'r'}, + {"group-id", required_argument, nullptr, 'g'}, + { nullptr, 0 , nullptr, 0 } + }; + + bool group_id_set = false; + int option_index = 0; + int opt = 0; + + while ((opt = getopt_long(argc, argv, "hug:r:", + long_options, &option_index)) != -1) { + switch (opt) { + case HOST_OPTIONS: + ip_port_ = optarg; + break; + case 'h': + diag_ops_ = DIAG_HELP; + return; + case 'u': + use_auth_ = false; + break; + case 'g': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The group id needs to be a number"); + } + group_id_ = std::stoi(optarg); + group_id_set = true; + break; + case 'r': + if (!IsNumber(optarg)) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The run level needs to be a number"); + } + run_level_ = static_cast(std::stoi(optarg)); + break; + default: + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, 
+ "Unknown command line options"); + } + } + + if (!group_id_set) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, + "Need to specify the GPU group id"); + } +} + +void RdciDiagSubSystem::show_help() const { + // Try to keep total output line length to <= 80 chars for better + // readability. For reference: + // *********************** 60 Chars ************************** + // ************** 40 Chars *************** + // ***** 20 Chars **** + std::cout << " diag -- Used to run diagnostic for GPUs.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci diag [--host :port] [-u] -g " + << " -r \n"; + std::cout << "\nFlags:\n"; + show_common_usage(); + std::cout << " -g --group-id The GPU group to diagnose" + << " on the specified host.\n"; + std::cout << " -r --run-level level Integer representing test" + << " run levels [default = 1].\n" + << " level 1: Tests take a " + << "few seconds to run.\n" + << " level 2: Tests take a " + << "few minutes to run (To be implemented).\n" + << " level 3: Tests take " + << "half an hour to run (To be implemented).\n"; +} + +std::string RdciDiagSubSystem::get_test_name + (rdc_diag_test_cases_t test_case) const { + const std::map test_desc = { + {RDC_DIAG_COMPUTE_PROCESS, "No compute process"}, + {RDC_DIAG_SDMA_QUEUE, "SDMA Queue ready"}, + {RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"}, + {RDC_DIAG_VRAM_CHECK, "VRAM check"}, + {RDC_DIAG_SYS_MEM_CHECK, "System memory check"}, + {RDC_DIAG_NODE_TOPOLOGY, "Node topology check"}, + {RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"}, + {RDC_DIAG_TEST_LAST, "Unknown"} + }; + + auto test_name = test_desc.find(test_case); + if (test_name == test_desc.end()) { + return "Unknown Test"; + } + return test_name->second; +} + +void RdciDiagSubSystem::process() { + if (diag_ops_ == DIAG_HELP || + diag_ops_ == DIAG_UNKNOWN) { + show_help(); + return; + } + + rdc_status_t result; + rdc_diag_response_t response; + result = rdc_diagnostic_run(rdc_handle_, group_id_, + run_level_, &response); + 
+ if (result != RDC_ST_OK) { + std::string error_msg = rdc_status_string(result); + throw RdcException(result, error_msg.c_str()); + } + + // (3) Check diagnostic results + for (uint32_t i=0 ; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = + response.diag_info[i]; + std::cout << std::setw(26) << std::left + << get_test_name(test_result.test_case) + ":" + << rdc_diagnostic_result_string(test_result.status) << "\n"; + } + + // (4) diagnostic detail information + std::cout <<" =============== Diagnostic Details ==================\n"; + for (uint32_t i=0 ; i < response.results_count; i++) { + const rdc_diag_test_result_t& test_result = + response.diag_info[i]; + if (test_result.info[0] != '\0') { + std::cout << std::setw(26) << std::left + << get_test_name(test_result.test_case) + ":" + << test_result.info << "\n"; + } + for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) { + const rdc_diag_per_gpu_result_t& gpu_result + = test_result.gpu_results[j]; + if (strlen(gpu_result.gpu_result.msg) > 0) { + std::cout << " GPU " << gpu_result.gpu_index << " " << + gpu_result.gpu_result.msg << "\n"; + } + } + } +} + + +} // namespace rdc +} // namespace amd + + diff --git a/rdci/src/rdci.cc b/rdci/src/rdci.cc index cbee32dae1..f3ee185d52 100644 --- a/rdci/src/rdci.cc +++ b/rdci/src/rdci.cc @@ -27,6 +27,7 @@ THE SOFTWARE. #include "rdc_lib/RdcException.h" #include "RdciDiscoverySubSystem.h" #include "RdciDmonSubSystem.h" +#include "RdciDiagSubSystem.h" #include "RdciFieldGroupSubSystem.h" #include "RdciGroupSubSystem.h" #include "RdciStatsSubSystem.h" @@ -35,7 +36,7 @@ THE SOFTWARE. 
int main(int argc, char ** argv) { const std::string usage_help = "Usage:\trdci \nsubsystem: discovery, dmon, group, " - "fieldgroup, stats\n"; + "fieldgroup, stats, diag\n"; if (argc <= 1) { std::cout << usage_help; @@ -49,6 +50,8 @@ int main(int argc, char ** argv) { subsystem.reset(new amd::rdc::RdciDiscoverySubSystem()); } else if (subsystem_name == "dmon") { subsystem.reset(new amd::rdc::RdciDmonSubSystem()); + } else if (subsystem_name == "diag") { + subsystem.reset(new amd::rdc::RdciDiagSubSystem()); } else if (subsystem_name == "group") { subsystem.reset(new amd::rdc::RdciGroupSubSystem()); } else if (subsystem_name == "fieldgroup") { diff --git a/server/include/rdc/rdc_api_service.h b/server/include/rdc/rdc_api_service.h index 7991d87e38..515640d596 100755 --- a/server/include/rdc/rdc_api_service.h +++ b/server/include/rdc/rdc_api_service.h @@ -120,6 +120,14 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { const ::rdc::Empty* request, ::rdc::RemoveAllJobResponse* reply) override; + ::grpc::Status DiagnosticRun(::grpc::ServerContext* context, + const ::rdc::DiagnosticRunRequest* request, + ::rdc::DiagnosticRunResponse* reply) override; + + ::grpc::Status DiagnosticTestCaseRun(::grpc::ServerContext* context, + const ::rdc::DiagnosticTestCaseRunRequest* request, + ::rdc::DiagnosticTestCaseRunResponse* reply) override; + private: bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target); diff --git a/server/src/rdc_api_service.cc b/server/src/rdc_api_service.cc index 33684def7b..fc4c68282f 100755 --- a/server/src/rdc_api_service.cc +++ b/server/src/rdc_api_service.cc @@ -559,7 +559,6 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, return ::grpc::Status::OK; } - ::grpc::Status RdcAPIServiceImpl::RemoveJob( ::grpc::ServerContext* context, const ::rdc::RemoveJobRequest* request, @@ -591,7 +590,114 @@ bool RdcAPIServiceImpl::copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, return 
::grpc::Status::OK; } +::grpc::Status RdcAPIServiceImpl::DiagnosticRun( + ::grpc::ServerContext* context, + const ::rdc::DiagnosticRunRequest* request, + ::rdc::DiagnosticRunResponse* reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + rdc_diag_response_t diag_response; + rdc_status_t result = rdc_diagnostic_run( + rdc_handle_, + request->group_id(), + static_cast(request->level()), + &diag_response); + + reply->set_status(result); + if (result != RDC_ST_OK) { + return ::grpc::Status::OK; + } + + ::rdc::DiagnosticResponse* to_response = reply->mutable_response(); + to_response->set_results_count(diag_response.results_count); + + for (uint32_t i=0 ; i < diag_response.results_count; i++) { + const rdc_diag_test_result_t& test_result = + diag_response.diag_info[i]; + ::rdc::DiagnosticTestResult* to_diag_info = + to_response->add_diag_info(); + + to_diag_info->set_status(test_result.status); + + // details + auto to_details = to_diag_info->mutable_details(); + to_details->set_code(test_result.details.code); + to_details->set_msg(test_result.details.msg); + + to_diag_info->set_test_case( + static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase> + (test_result.test_case)); + to_diag_info->set_per_gpu_result_count( + test_result.per_gpu_result_count); + + // gpu_results + for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) { + auto to_result = to_diag_info->add_gpu_results(); + const rdc_diag_per_gpu_result_t& cur_result = + test_result.gpu_results[j]; + to_result->set_gpu_index(cur_result.gpu_index); + auto to_per_detail = to_result->mutable_gpu_result(); + to_per_detail->set_code(cur_result.gpu_result.code); + to_per_detail->set_msg(cur_result.gpu_result.msg); + } + to_diag_info->set_info(test_result.info); + } + + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::DiagnosticTestCaseRun( + ::grpc::ServerContext *context, + const 
::rdc::DiagnosticTestCaseRunRequest *request, + ::rdc::DiagnosticTestCaseRunResponse *reply) { + (void)(context); + if (!reply || !request) { + return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents"); + } + + rdc_diag_test_result_t test_result; + rdc_status_t result = rdc_test_case_run( + rdc_handle_, + request->group_id(), + static_cast(request->test_case()), + &test_result); + + reply->set_status(result); + if (result != RDC_ST_OK) { + return ::grpc::Status::OK; + } + ::rdc::DiagnosticTestResult *to_diag_info = + reply->mutable_result(); + to_diag_info->set_status(test_result.status); + + // details + auto to_details = to_diag_info->mutable_details(); + to_details->set_code(test_result.details.code); + to_details->set_msg(test_result.details.msg); + + to_diag_info->set_test_case( + static_cast<::rdc::DiagnosticTestResult_DiagnosticTestCase>( + test_result.test_case)); + to_diag_info->set_per_gpu_result_count( + test_result.per_gpu_result_count); + + // gpu_results + for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) { + auto to_result = to_diag_info->add_gpu_results(); + const rdc_diag_per_gpu_result_t &cur_result = + test_result.gpu_results[j]; + to_result->set_gpu_index(cur_result.gpu_index); + auto to_per_detail = to_result->mutable_gpu_result(); + to_per_detail->set_code(cur_result.gpu_result.code); + to_per_detail->set_msg(cur_result.gpu_result.msg); + } + to_diag_info->set_info(test_result.info); + + return ::grpc::Status::OK; +} } // namespace rdc } // namespace amd