diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data
index 7b2dc61e96..c01e44a4f4 100644
--- a/projects/rdc/common/rdc_field.data
+++ b/projects/rdc/common/rdc_field.data
@@ -161,3 +161,13 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp",
 FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false)
 FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false)
 FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", "RING_HANG", false)
+
+// RDC health related fields
+FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true)
+FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT, "Total PCIE replay count", "PCIE_REPLAY_COUNT", true)
+FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_NUM, "Retired page number", "RETIRED_PAGE_NUM", true)
+FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM, "Pending page number", "PENDING_PAGE_NUM", true)
+FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", false)
+FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit", "UNCORRECTABLE_PAGE_LIMIT", false)
+FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", false)
+FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", false)
diff --git a/projects/rdc/example/CMakeLists.txt b/projects/rdc/example/CMakeLists.txt
index ac6019b2fe..6bbf746eb8 100755
--- a/projects/rdc/example/CMakeLists.txt
+++ b/projects/rdc/example/CMakeLists.txt
@@ -120,6 +120,12 @@ set(POLICY_EXAMPLE_EXE "policy")
 add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
 target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap)
 
+set(HEALTH_EXAMPLE_SRC_LIST "health_example.cc")
+cmake_print_variables(HEALTH_EXAMPLE_SRC_LIST)
+set(HEALTH_EXAMPLE_EXE "health")
+add_executable(${HEALTH_EXAMPLE_EXE} "${HEALTH_EXAMPLE_SRC_LIST}")
+target_link_libraries(${HEALTH_EXAMPLE_EXE} pthread dl rdc_bootstrap)
+
 message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
 message(" Finished Cmake Example ")
 message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
diff --git a/projects/rdc/example/health_example.cc b/projects/rdc/example/health_example.cc
new file mode 100644
index 0000000000..d15dfa35a5
--- /dev/null
+++ b/projects/rdc/example/health_example.cc
@@ -0,0 +1,379 @@
+/*
+Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <unistd.h>
+
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rdc/rdc.h"
+
+// Print which health watches (PCIe, XGMI, memory) are enabled for a group.
+// Returns the status of rdc_health_get(); nothing is printed on failure.
+rdc_status_t get_watches(rdc_handle_t rdc_handle, rdc_gpu_group_t group_id) {
+  unsigned int components;
+  rdc_status_t result = rdc_health_get(rdc_handle, group_id, &components);
+  if (result == RDC_ST_OK) {
+    std::string on = "On";
+    std::string off = "Off";
+
+    std::cout << "Health monitor systems status:" << std::endl;
+    std::cout << "+--------------------+"  //"-" width :20
+              << "---------------------------------------------------+\n";  //-" width :51
+    std::cout << "|" << std::setw(20) << std::left << " PCIe" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "|\n";
+    std::cout << "|" << std::setw(20) << std::left << " XGMI" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
+    std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
+    std::cout << "+--------------------+"  //"-" width :20
+              << "---------------------------------------------------+\n";  //-" width :51
+  }
+
+  return result;
+}
+
+// Map a health result to the label shown in the report.
+std::string health_string(rdc_health_result_t health) {
+  switch (health) {
+    case RDC_HEALTH_RESULT_PASS:
+      return "Pass";
+
+    case RDC_HEALTH_RESULT_WARN:
+      return "Warning";
+
+    case RDC_HEALTH_RESULT_FAIL:
+      return "Fail";
+
+    default:
+      return "Unknown";
+  }
+}
+
+// Map a health component to the label prefix shown in the report.
+std::string component_string(rdc_health_system_t component) {
+  switch (component) {
+    case RDC_HEALTH_WATCH_PCIE:
+      return "PCIe system: ";
+
+    case RDC_HEALTH_WATCH_XGMI:
+      return "XGMI system: ";
+
+    case RDC_HEALTH_WATCH_MEM:
+      return "Memory system: ";
+
+    case RDC_HEALTH_WATCH_INFOROM:
+      return "Inforom system: ";
+
+    case RDC_HEALTH_WATCH_THERMAL:
+      return "Thermal system:";
+
+    case RDC_HEALTH_WATCH_POWER:
+      return "Power system: ";
+
+    default:
+      return "Unknown";
+  }
+}
+
+// Word-wrap one message into the 60-column detail cell of the report table,
+// printing one table row per wrapped line.
+void output_errstr(const std::string& input) {
+  std::string word, line_str;
+  unsigned int width = 60, line_size = 0;
+  std::istringstream iss(input);
+
+  while (iss >> word) {
+    // Flush only when a line is buffered, so an over-long first word does
+    // not produce an empty row.
+    if (line_size > 0 && line_size + word.size() >= width) {
+      std::cout << "|" << std::setw(20) << " " << "| "
+                << std::setw(width) << std::left << line_str << "|\n";
+
+      //add new line string
+      line_str = word;
+      line_size = word.size();
+    } else {
+      if (line_size > 0) {
+        line_str += " ";
+        line_str += word;
+        line_size += word.size() + 1;
+      } else {
+        line_str += word;
+        line_size += word.size();
+      }
+    }
+  }  //end while
+
+  if (0 < line_size)
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(width) << std::left << line_str << "|\n";
+}
+
+// Collect the consecutive incidents, starting at start_index, that belong to
+// the given GPU and component. Raises component_health to the worst health
+// seen, appends one " - <msg>" entry per incident to err_str, and returns
+// how many incidents were consumed.
+unsigned int handle_one_component(rdc_health_response_t &response,
+                                  unsigned int start_index,
+                                  uint32_t gpu_index,
+                                  rdc_health_system_t component,
+                                  rdc_health_result_t &component_health,
+                                  std::vector<std::string> &err_str) {
+  unsigned int count = 0;
+  rdc_health_incidents_t *incident;
+  std::string all_err_str;
+
+  for (unsigned int i = start_index; i < response.incidents_count; i++) {
+    incident = &response.incidents[i];
+
+    //same GPU Index, same component
+    if ((incident->gpu_index != gpu_index) ||
+        (incident->component != component))
+      break;
+
+    //set component health
+    if (incident->health > component_health)
+      component_health = incident->health;
+
+    all_err_str = " - ";
+    all_err_str += incident->error.msg;
+    err_str.push_back(all_err_str);
+
+    count++;
+  }
+
+  return count;
+}
+
+// Print the report rows for one GPU: its worst health followed by each
+// component's health and messages. Consumes the consecutive incidents for
+// gpu_index starting at start_index and returns how many were consumed.
+unsigned int handle_one_gpu(rdc_health_response_t &response,
+                            unsigned int start_index,
+                            uint32_t gpu_index) {
+  unsigned int count = 0, comp_count = 0;
+  rdc_health_incidents_t *incident;
+  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;
+  std::string component_str, health_str, gpu_health_str;
+  struct component_detail_t {
+    rdc_health_result_t component_health = RDC_HEALTH_RESULT_PASS;
+    std::vector<std::string> err_str;
+  };
+  std::map<rdc_health_system_t, component_detail_t> component_detail_map;
+
+  for (unsigned int i = start_index; i < response.incidents_count; i++) {
+    incident = &response.incidents[i];
+
+    //same GPU Index
+    if (incident->gpu_index != gpu_index)
+      break;
+
+    //set gpu health
+    if (incident->health > gpu_health)
+      gpu_health = incident->health;
+
+    // handle same component: accumulate straight into the map entry so a
+    // component that shows up again later for this GPU is merged, not dropped
+    component_detail_t &detail = component_detail_map[incident->component];
+
+    comp_count = handle_one_component(response, i, gpu_index, incident->component,
+                                      detail.component_health, detail.err_str);
+    i += comp_count - 1;
+    count += comp_count;
+  }
+
+  //output gpu_index health result
+  gpu_health_str = health_string(gpu_health);
+
+  std::cout << "|" << std::setw(20) << " GPU ID: " + std::to_string(gpu_index) << "| "
+            << std::setw(60) << std::left << gpu_health_str << "|\n";
+  std::cout << "|" << std::setw(20) << " " << "| "
+            << std::setw(60) << " " << "|\n";
+
+  for (const auto &ite : component_detail_map) {
+    component_str = component_string(ite.first);
+    health_str = health_string(ite.second.component_health);
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(60) << std::left << component_str + health_str << "|\n";
+
+    for (const auto &msg : ite.second.err_str)
+      output_errstr(msg);
+
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(60) << " " << "|\n";
+  }
+  std::cout << "+--------------------+-"  //"-" width :20
+            << "------------------------------------------------------------+\n";  //-" width :60
+
+  return count;
+}
+
+// Example: connect to rdcd, enable health watches on a one-GPU group, wait
+// for samples, print the health report, then clean up.
+int main(int, char**) {
+  rdc_status_t result;
+  rdc_status_t destroy_result;
+  rdc_handle_t rdc_handle = nullptr;
+  char hostIpAddress[] = {"127.0.0.1:50051"};
+  char group_name[] = {"healthgroup1"};
+
+  std::cout << "Start rdci in Standalone mode\n";
+
+  // Init the rdc
+  result = rdc_init(0);
+
+  if (result != RDC_ST_OK) {
+    std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
+    goto cleanup;
+  } else {
+    std::cout << "RDC Initialized.\n";
+  }
+
+  result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto cleanup;
+  }
+
+  // Now we can use the same API for standalone
+  // (1) create group and add GPUs
+  rdc_gpu_group_t group_id;
+  result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error creating group. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto cleanup;
+  }
+  std::cout << "Created the GPU group " << group_id << std::endl;
+
+  result = rdc_group_gpu_add(rdc_handle, group_id, 0);  // Add GPU 0
+  if (result != RDC_ST_OK) {
+    std::cout << "Error adding group. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+
+  rdc_device_attributes_t attribute;
+  result = rdc_device_get_attributes(rdc_handle, 0, &attribute);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result) << std::endl;
+    goto destroygroup;
+  }
+  std::cout << "Add GPU 0: " << attribute.device_name << " to group "
+            << group_id << std::endl;
+
+  // (2) get health current watches before setting
+  result = get_watches(rdc_handle, group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+
+  // (3) set health watches.
+  unsigned int components;
+  components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM;
+  result = rdc_health_set(rdc_handle, group_id, components);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error setting health watches. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+  std::cout << "Set health watches to all." << std::endl;
+
+  // (4) get health current watches after setting
+  result = get_watches(rdc_handle, group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+
+  std::cout << "Start to health monitor group:" << group_id
+            << std::endl;
+  std::cout << "Sleep a few seconds before retreive the data ...\n";
+  // For standalone mode, the daemon will update and cache the samples
+  // take samples, standalone mode, do nothing
+  usleep(5000000);  // sleep 5 seconds before fetch the stats
+
+  // (5) Get the health stats
+  rdc_health_response_t response;
+  result = rdc_health_check(rdc_handle, group_id, &response);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error health check. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  } else {
+    //output headline
+    std::string overall_str = health_string(response.overall_health);
+    std::cout << "Health monitor report:" << std::endl;
+    std::cout << "+--------------------+-"  //"-" width :20
+              << "------------------------------------------------------------+\n";  //-" width :60
+    std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id) << "| "
+              << std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
+    std::cout << "+====================+="  //"=" width :20
+              << "============================================================+\n";  //"=" width :60
+
+    //output health of per GPU
+    unsigned int index = 0;
+    while (index < response.incidents_count) {
+      uint32_t gpu_index = response.incidents[index].gpu_index;
+
+      unsigned int count = handle_one_gpu(response, index, gpu_index);
+      index += count;
+    }
+  }
+
+  // (6) Clear the health
+  result = rdc_health_clear(rdc_handle, group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error clear health. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+  std::cout << "Clear Group " << group_id << " all health monitor systems." << std::endl;
+
+destroygroup:
+  // Delete the GPU group. A separate status is used so that a successful
+  // destroy does not overwrite the error that sent us here.
+  destroy_result = rdc_group_gpu_destroy(rdc_handle, group_id);
+  if (destroy_result != RDC_ST_OK) {
+    std::cout << "Error delete GPU group. Return: " << rdc_status_string(destroy_result)
+              << std::endl;
+    if (result == RDC_ST_OK) result = destroy_result;
+    goto cleanup;
+  }
+  std::cout << "Deleted the GPU group " << group_id << std::endl;
+
+  // Cleanup consists of shutting down RDC.
+cleanup:
+  std::cout << "Cleaning up.\n";
+  if (rdc_handle) rdc_disconnect(rdc_handle);
+  rdc_shutdown();
+  return result;
+}
diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h
index 38a59e84da..a0d84d33ea 100644
--- a/projects/rdc/include/rdc/rdc.h
+++ b/projects/rdc/include/rdc/rdc.h
@@ -335,6 +335,18 @@ typedef enum {
   RDC_EVNT_NOTIF_RING_HANG,  //!< GPU ring hang just occurred
 
   RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG,
+
+  /**
+   * @brief RDC health related fields
+   */
+  RDC_HEALTH_XGMI_ERROR = 3000,  //!< XGMI one or more errors detected
+  RDC_HEALTH_PCIE_REPLAY_COUNT,  //!< Total PCIE replay count
+  RDC_HEALTH_RETIRED_PAGE_NUM,  //!< Retired page number
+  RDC_HEALTH_PENDING_PAGE_NUM,  //!< Pending page number
+  RDC_HEALTH_RETIRED_PAGE_LIMIT,  //!< The threshold of retired page
+  RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,//!< The threshold of uncorrectable page
+  RDC_HEALTH_POWER_THROTTLE_TIME,  //!< Power throttle status counter
+  RDC_HEALTH_THERMAL_THROTTLE_TIME,  //!< Total time in thermal throttle status (microseconds)
 } rdc_field_t;
 
 // even and odd numbers are used for correctable and uncorrectable errors
@@ -589,6 +601,81 @@ typedef struct {
   rdc_policy_action_t action;  //!< Action to take
 } rdc_policy_t;
 
+/**
+ * @brief type of health watches
+ */
+typedef enum {
+  RDC_HEALTH_WATCH_PCIE = 0x1,  //!< PCIe system watches
+  RDC_HEALTH_WATCH_XGMI = 0x2,  //!< XGMI system watches
+  RDC_HEALTH_WATCH_MEM = 0x4,  //!< Memory watches
+  RDC_HEALTH_WATCH_INFOROM = 0x8,  //!< Inforom watches
+  RDC_HEALTH_WATCH_THERMAL = 0x10,  //!< Temperature watches
+  RDC_HEALTH_WATCH_POWER = 0x20,  //!< Power watches
+} rdc_health_system_t;
+
+/**
+ * @brief type of health result
+ */
+typedef enum {
+  RDC_HEALTH_RESULT_PASS,  //!< The health test pass
+  RDC_HEALTH_RESULT_WARN,  //!< The health test has warnings
+  RDC_HEALTH_RESULT_FAIL  //!< The health test fail
+} rdc_health_result_t;
+
+/**
+ * @brief The maximum length of the health messages
+ */
+#define MAX_HEALTH_MSG_LENGTH 4096
+
+/**
+ * 8 replays per minute is the maximum recommended
+ */
+#define PCIE_MAX_REPLAYS_PERMIN 8
+
+// Error codes stored in rdc_health_incidents_t.error.code
+typedef enum {
+  RDC_FR_PCI_REPLAY_RATE = 1000,
+  RDC_FR_ECC_UNCORRECTABLE_DETECTED = 1001,
+  RDC_FR_PENDING_PAGE_RETIREMENTS = 1002,
+  RDC_FR_RETIRED_PAGES_LIMIT = 1003,
+  RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT = 1004,
+  RDC_FR_CLOCKS_THROTTLE_THERMAL = 1005,
+  RDC_FR_CLOCKS_THROTTLE_POWER = 1006,
+  RDC_FR_XGMI_SINGLE_ERROR = 1007,
+  RDC_FR_XGMI_MULTIPLE_ERROR = 1008,
+  RDC_FR_CORRUPT_INFOROM = 1009
+} rdc_health_error_code_t;
+
+/**
+ * @brief details of a health error
+ */
+typedef struct {
+  char msg[MAX_HEALTH_MSG_LENGTH];  //!< The test result details
+  uint32_t code;  //!< The low level error code
+} rdc_health_detail_t;
+
+/**
+ * @brief details of a single health incident
+ */
+typedef struct {
+  uint32_t gpu_index;  //!< which GPU in this group has the issue
+  rdc_health_system_t component;  //!< which component has the issue
+  rdc_health_result_t health;  //!< health diagnosis of this incident
+  rdc_health_detail_t error;  //!< The details of the error; error.code holds an rdc_health_error_code_t
+} rdc_health_incidents_t;
+
+
+#define HEALTH_MAX_ERROR_ITEMS 64
+
+/**
+ * @brief The health responses for test cases
+ */
+typedef struct {
+  rdc_health_result_t overall_health;  //!< The overall health of this entire host
+  unsigned int incidents_count;  //!< The number of health incidents reported in this struct
+  rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS];  //!< Report of the errors detected
+} rdc_health_response_t;
+
 /**
  * @brief Initialize ROCm RDC.
  *
@@ -1274,6 +1361,72 @@ rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t grou
  */
 rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
 
+/**
+ * @brief enable the health check for a group
+ *
+ * @details For each group, only one parameter can be set. If you want to
+ * clear the setting for a group, set components == 0x0
+ *
+ * @param[in] p_rdc_handle The RDC handler.
+ *
+ * @param[in] group_id The GPU group id.
+ *
+ * @param[in] components The bitmask of components that should be enabled for health check
+ * for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
+ *
+ * @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                            unsigned int components);
+
+/**
+ * @brief get the health check settings of a group
+ *
+ * @details get the health check settings of a group
+ *
+ * @param[in] p_rdc_handle The RDC handler.
+ *
+ * @param[in] group_id The GPU group id.
+ *
+ * @param[out] components The bitmask of components enabled for health check
+ * for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
+ * if it is 0x0, then the health check is not set for the group yet.
+ *
+ * @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                            unsigned int* components);
+
+/**
+ * @brief Check health watch results
+ *
+ * @details The response may contain incidents.
+ * For each incident, check the component and error message.
+ *
+ * @param[in] p_rdc_handle The RDC handler.
+ *
+ * @param[in] group_id The GPU group id.
+ *
+ * @param[in,out] response The detail results of the health.
+ *
+ * @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                              rdc_health_response_t* response);
+
+/**
+ * @brief clear the health watch
+ *
+ * @details For each group, clear the setting.
+ *
+ * @param[in] p_rdc_handle The RDC handler.
+ *
+ * @param[in] group_id The GPU group id.
+ *
+ * @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
+
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
diff --git a/projects/rdc/include/rdc_lib/RdcCacheManager.h b/projects/rdc/include/rdc_lib/RdcCacheManager.h
index a3af5e6e88..f1ff0fb7de 100644
--- a/projects/rdc/include/rdc_lib/RdcCacheManager.h
+++ b/projects/rdc/include/rdc_lib/RdcCacheManager.h
@@ -59,6 +59,21 @@ class RdcCacheManager {
   virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
   virtual rdc_status_t rdc_job_remove_all() = 0;
 
+  virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
+                                      uint32_t gpu_index,
+                                      const rdc_field_value& value) = 0;
+  virtual rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
+                                      uint32_t gpu_index,
+                                      rdc_field_t field_id,
+                                      uint64_t start_timestamp,
+                                      uint64_t end_timestamp,
+                                      rdc_field_value* start_value,
+                                      rdc_field_value* end_value) = 0;
+  virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
+  virtual rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
+                                      uint32_t gpu_index,
+                                      const rdc_field_value& value) = 0;
+
   virtual ~RdcCacheManager() {}
 };
 
diff --git a/projects/rdc/include/rdc_lib/RdcHandler.h b/projects/rdc/include/rdc_lib/RdcHandler.h
index a19b71a63c..e23b3d36fe 100644
--- a/projects/rdc/include/rdc_lib/RdcHandler.h
+++ b/projects/rdc/include/rdc_lib/RdcHandler.h
@@ -106,6 +106,12 @@ class RdcHandler {
 
   virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;
 
+  // Health API
+  virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) = 0;
+  virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) = 0;
+  virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) = 0;
+  virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
+
   virtual ~RdcHandler() {}
 };
 
diff --git a/projects/rdc/include/rdc_lib/RdcWatchTable.h b/projects/rdc/include/rdc_lib/RdcWatchTable.h
index e33962b8ab..f837fc3f3d 100644 --- a/projects/rdc/include/rdc_lib/RdcWatchTable.h +++ b/projects/rdc/include/rdc_lib/RdcWatchTable.h @@ -50,6 +50,14 @@ class RdcWatchTable { virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) = 0; + virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, + unsigned int components) = 0; + virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, + unsigned int* components) = 0; + virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, + rdc_health_response_t *response) = 0; + virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0; + virtual ~RdcWatchTable() {} }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h index 9d27ed6a2b..5852a9f904 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcCacheManagerImpl.h @@ -32,6 +32,8 @@ THE SOFTWARE. 
#include "rdc_lib/RdcCacheManager.h" #include "rdc_lib/rdc_common.h" +#define HEALTH_MAX_KEEP_SAMPLES 300 + namespace amd { namespace rdc { @@ -81,6 +83,9 @@ struct RdcJobStatsCacheEntry { // typedef std::map RdcJobStatsCache; +// +typedef std::map RdcHealthStatsCache; + class RdcCacheManagerImpl : public RdcCacheManager { public: rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field, @@ -105,6 +110,21 @@ class RdcCacheManagerImpl : public RdcCacheManager { rdc_status_t rdc_job_remove(const char job_id[64]) override; rdc_status_t rdc_job_remove_all() override; + rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, + uint32_t gpu_index, + const rdc_field_value& value) override; + rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_field_t field_id, + uint64_t start_timestamp, + uint64_t end_timestamp, + rdc_field_value* start_value, + rdc_field_value* end_value) override; + rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; + rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id, + uint32_t gpu_index, + const rdc_field_value& value) override; + private: void set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu, rdc_stats_summary_t& summary, // NOLINT @@ -113,6 +133,7 @@ class RdcCacheManagerImpl : public RdcCacheManager { uint32_t num_gpus); // NOLINT RdcCacheSamples cache_samples_; RdcJobStatsCache cache_jobs_; + RdcHealthStatsCache cache_health_; std::mutex cache_mutex_; }; diff --git a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h index c931c18021..21ed2490ef 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -108,6 +108,12 @@ class RdcEmbeddedHandler final : public RdcHandler { rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override; + // Health API + rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, 
unsigned int components) override; + rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override; + rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override; + rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; + explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode); ~RdcEmbeddedHandler() final; diff --git a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h index 8c273bc8d0..13e6a23beb 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/projects/rdc/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -105,6 +105,12 @@ class RdcStandaloneHandler : public RdcHandler { rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override; + // Health API + rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override; + rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override; + rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override; + rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; + explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca, const char* client_cert, const char* client_key); diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h index 8c8863f6bb..f9923cc22b 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -55,6 +55,12 @@ struct JobWatchTableEntry { std::vector fields; //< store fields for faster query }; +struct HealthWatchTableEntry { + unsigned int components; + rdc_field_grp_t field_group_id; + std::vector fields; //< store fields for faster query +}; + class RdcWatchTableImpl : public RdcWatchTable { public: rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64], @@ -74,6 
+80,11 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< is reached, which will be handled in the clean_up() function. rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override; + rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override; + rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override; + rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override; + rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; + //!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will //!< call this function periodically. Instead of providing other APIs to //!< cleanup the cache, this function will update and cleanup the cache. @@ -85,7 +96,8 @@ class RdcWatchTableImpl : public RdcWatchTable { rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override; RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, - const RdcModuleMgrPtr& module_mgr, const RdcNotificationPtr& notif); + const RdcMetricFetcherPtr& metric_fetcher, const RdcModuleMgrPtr& module_mgr, + const RdcNotificationPtr& notif); private: //!< Helper function to Update the fields_in_table when unwatch tables @@ -104,13 +116,39 @@ class RdcWatchTableImpl : public RdcWatchTable { bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id, std::string& job_id) const; // NOLINT + bool is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id, + rdc_gpu_group_t& group_id) const; + rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t* events, uint32_t num_events); //!< The function will be pass as the callback for bulk fetch static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values, void* user_data); + rdc_status_t create_health_field_group(unsigned int components, + rdc_field_grp_t* field_group_id); + //!< output: Whether health incidents are full + bool 
add_health_incident(uint32_t gpu_index, + rdc_health_system_t component, + rdc_health_result_t health, + uint32_t err_code, + std::string err_msg, + rdc_health_incidents_t* incident, + rdc_health_response_t* response); + rdc_status_t get_start_end_values(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_field_t field, + rdc_field_value *start_value, + rdc_field_value *end_value); + rdc_status_t pcie_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, rdc_health_response_t* response); + rdc_status_t xgmi_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, rdc_health_response_t* response); + rdc_status_t memory_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, rdc_health_response_t* response); + RdcGroupSettingsPtr group_settings_; RdcCacheManagerPtr cache_mgr_; + RdcMetricFetcherPtr metric_fetcher_; RdcModuleMgrPtr rdc_module_mgr_; RdcNotificationPtr notifications_; @@ -126,6 +164,9 @@ class RdcWatchTableImpl : public RdcWatchTable { //!< Those settings will only be updated when watching or unwatching. std::map fields_to_watch_; + //!< The health watch table to store the health settings. 
+ std::map health_watch_table_; + //!< The last clean up time std::atomic last_cleanup_time_; std::mutex watch_mutex_; diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index b2749448d7..1d28e52945 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -190,6 +190,18 @@ service RdcAPI { // rdc_policy_condition_t condition); rpc UnRegisterPolicy(UnRegisterPolicyRequest) returns (UnRegisterPolicyResponse) {} + // Health API + // rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components); + rpc SetHealth(SetHealthRequest) returns (SetHealthResponse) {} + + // rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components); + rpc GetHealth(GetHealthRequest) returns (GetHealthResponse) {} + + // rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t r*esponse); + rpc CheckHealth(CheckHealthRequest) returns (CheckHealthResponse) {} + + // rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id); + rpc ClearHealth(ClearHealthRequest) returns (ClearHealthResponse) {} } message Empty { @@ -620,3 +632,55 @@ message UnRegisterPolicyRequest { message UnRegisterPolicyResponse { uint32 status = 1; } + +message SetHealthRequest { + uint32 group_id = 1; + uint32 components = 2; +} + +message SetHealthResponse { + uint32 status = 1; +} +message GetHealthRequest { + uint32 group_id = 1; +} + +message GetHealthResponse { + uint32 status = 1; + uint32 components = 2; +} + +message CheckHealthRequest { + uint32 group_id = 1; +} + +message HealthDetail { + string msg = 1; + uint32 code = 2; +} + +message HealthIncidents { + uint32 gpu_index = 1; + uint32 component = 2; + uint32 health = 3; + HealthDetail error = 4; +} + +message HealthResponse { + uint32 overall_health = 1; + uint32 incidents_count = 2; + repeated HealthIncidents incidents = 3; +} + +message CheckHealthResponse { + uint32 status = 1; + HealthResponse response = 2; +} + +message ClearHealthRequest { + uint32 
group_id = 1; +} + +message ClearHealthResponse { + uint32 status = 1; +} diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index ee69240041..fc631b1579 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -163,6 +163,14 @@ class rdc_field_t(c_int): RDC_EVNT_NOTIF_PRE_RESET = 2002 RDC_EVNT_NOTIF_POST_RESET = 2003 RDC_EVNT_NOTIF_RING_HANG = 2004 + RDC_HEALTH_XGMI_ERROR = 3000 + RDC_HEALTH_PCIE_REPLAY_COUNT = 3001 + RDC_HEALTH_RETIRED_PAGE_NUM = 3002 + RDC_HEALTH_PENDING_PAGE_NUM = 3003 + RDC_HEALTH_RETIRED_PAGE_LIMIT = 3004 + RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT = 3005 + RDC_HEALTH_POWER_THROTTLE_TIME = 3006 + RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007 rdc_handle_t = c_void_p rdc_gpu_group_t = c_uint32 diff --git a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc index 8cbcc01ef2..790a355cd7 100644 --- a/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/projects/rdc/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -401,6 +401,45 @@ rdc_field_t get_field_id_from_name(const char* name) { return RDC_FI_INVALID; } +rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + unsigned int components) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle) + ->rdc_health_set(group_id, components); +} + +rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + unsigned int* components) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle) + ->rdc_health_get(group_id, components); +} + +rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_health_response_t *response) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle) + ->rdc_health_check(group_id, response); +} + +rdc_status_t 
rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle) + ->rdc_health_clear(group_id); +} + char* strncpy_with_null(char* dest, const char* src, size_t n) { if (n == 0) { return dest; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc index acc8fd0a88..7ece361eff 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcCacheManagerImpl.cc @@ -449,5 +449,142 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(const char job_id[64], return RDC_ST_OK; } +rdc_status_t RdcCacheManagerImpl::rdc_health_set(rdc_gpu_group_t group_id, + uint32_t gpu_index, + const rdc_field_value& value) { + std::lock_guard guard(cache_mutex_); + RdcFieldKey field{gpu_index, value.field_id}; + + // Set initial values + RdcCacheEntry entry; + entry.last_time = value.ts; + entry.value = value.value; + entry.type = value.type; + + auto health_ite = cache_health_.find(group_id); + if (health_ite == cache_health_.end()) { + std::vector ve; + ve.push_back(entry); + + RdcCacheSamples cache_sample; + cache_sample.insert({field, ve}); + + cache_health_.insert({group_id, cache_sample}); + } + else { + auto samples_ite = health_ite->second.find(field); + if (samples_ite == health_ite->second.end()) { + std::vector ve; + ve.push_back(entry); + + health_ite->second.insert({field, ve}); + } else { + samples_ite->second.push_back(entry); + } + } + + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_health_get_values(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_field_t field_id, + uint64_t start_timestamp, + uint64_t end_timestamp, + rdc_field_value* start_value, + rdc_field_value* end_value) { + if (!start_value && !end_value) + return RDC_ST_BAD_PARAMETER; + + std::lock_guard guard(cache_mutex_); + auto health_ite = cache_health_.find(group_id); + if 
(health_ite == cache_health_.end()) + return RDC_ST_NOT_FOUND; + + RdcFieldKey field{gpu_index, field_id}; + auto samples_ite = health_ite->second.find(field); + if (samples_ite == health_ite->second.end() || + samples_ite->second.size() == 0) + return RDC_ST_NOT_FOUND; + + const auto& cache_values = samples_ite->second; // bind by reference: cache_mutex_ is held until return, so the sample vector need not be copied + rdc_status_t result = RDC_ST_OK; + if (start_value != nullptr) { + //get start value + result = RDC_ST_NOT_FOUND; + for (auto entry = cache_values.begin(); entry != cache_values.end(); entry++) { + if (entry->last_time >= start_timestamp) { + start_value->field_id = field_id; + start_value->ts = entry->last_time; + start_value->type = entry->type; + if (entry->type == STRING) + strncpy_with_null(start_value->value.str, entry->value.str, RDC_MAX_STR_LENGTH); + else + start_value->value.l_int = entry->value.l_int; + result = RDC_ST_OK; + break; + } + } //end for + } //end if + + if ((RDC_ST_OK == result) && (end_value != nullptr)) { + // get end value + result = RDC_ST_NOT_FOUND; + for (auto entry = cache_values.rbegin(); entry != cache_values.rend(); entry++) { + if (entry->last_time <= end_timestamp) { + end_value->field_id = field_id; + end_value->ts = entry->last_time; + end_value->type = entry->type; + if (entry->type == STRING) + strncpy_with_null(end_value->value.str, entry->value.str, RDC_MAX_STR_LENGTH); + else + end_value->value.l_int = entry->value.l_int; + result = RDC_ST_OK; + break; + } + } //end for + } //end if + + return result; +} + +rdc_status_t RdcCacheManagerImpl::rdc_health_clear(rdc_gpu_group_t group_id) { + std::lock_guard guard(cache_mutex_); + cache_health_.erase(group_id); + return RDC_ST_OK; +} + +rdc_status_t RdcCacheManagerImpl::rdc_update_health_stats(rdc_gpu_group_t group_id, + uint32_t gpu_index, + const rdc_field_value& value) { + std::lock_guard guard(cache_mutex_); + auto health_ite = cache_health_.find(group_id); + if (health_ite == cache_health_.end()) { + return RDC_ST_NOT_FOUND; + } + + RdcFieldKey field{gpu_index,
value.field_id}; + auto samples_ite = health_ite->second.find(field); + if (samples_ite == health_ite->second.end()) { + return RDC_ST_NOT_FOUND; + } + + // Check HEALTH_MAX_KEEP_SAMPLES + auto& cache_values = samples_ite->second; + int item_remove = cache_values.size() - HEALTH_MAX_KEEP_SAMPLES + 1; + if (item_remove > 0) { + cache_values.erase(cache_values.begin(), cache_values.begin() + item_remove); + } + + RdcCacheEntry entry; + entry.last_time = value.ts; + entry.value = value.value; + entry.type = value.type; + + cache_values.push_back(entry); + + return RDC_ST_OK; +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 4f82cb1740..b3b0f8f7f6 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -79,7 +79,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode) metric_fetcher_(new RdcMetricFetcherImpl()), rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)), rdc_notif_(new RdcNotificationImpl()), - watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)), + watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_, rdc_module_mgr_, rdc_notif_)), metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)), policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)) { if (mode == RDC_OPERATION_MODE_AUTO) { @@ -460,5 +460,38 @@ rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) return policy_->rdc_policy_unregister(group_id); } +// Health API +rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id, + unsigned int components) { + if (0 == components) { + return RDC_ST_BAD_PARAMETER; + } + + return watch_table_->rdc_health_set(group_id, components); +} + +rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id, + unsigned 
int *components) { + if (components == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + return watch_table_->rdc_health_get(group_id, components); +} + +rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id, + rdc_health_response_t *response) { + if (response == nullptr) { + return RDC_ST_BAD_PARAMETER; + } + + return watch_table_->rdc_health_check(group_id, response); +} + +rdc_status_t RdcEmbeddedHandler::rdc_health_clear(rdc_gpu_group_t group_id) { + + return watch_table_->rdc_health_clear(group_id); +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index cb73b8648b..cf8f683061 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -804,6 +804,66 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field read_gpu_metrics_uint64_t(); break; + case RDC_HEALTH_XGMI_ERROR: { + amdsmi_xgmi_status_t status; + ret = amdsmi_gpu_xgmi_error_status(processor_handle, &status); + value->status = Smi2RdcError(ret); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(status); + } + break; + } + + case RDC_HEALTH_PCIE_REPLAY_COUNT: { + amdsmi_pcie_info_t pcie_info; + ret = amdsmi_get_pcie_info(processor_handle, &pcie_info); + value->status = Smi2RdcError(ret); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(pcie_info.pcie_metric.pcie_replay_count); + } + break; + } + + case RDC_HEALTH_RETIRED_PAGE_NUM: + case RDC_HEALTH_PENDING_PAGE_NUM: { + uint32_t num_pages = 0; + ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, nullptr); + if (AMDSMI_STATUS_SUCCESS == ret) { + if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) { + value->status = Smi2RdcError(ret); + value->type = INTEGER; + value->value.l_int = 
static_cast(num_pages); + break; + } + + if ((0 < num_pages) && + (RDC_HEALTH_PENDING_PAGE_NUM == field_id)) { + std::vector bad_page_info(num_pages); + ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, + bad_page_info.data()); + value->status = Smi2RdcError(ret); + value->type = INTEGER; + if (AMDSMI_STATUS_SUCCESS == ret) { + uint64_t pending_page_num = 0; + for (uint32_t i=0; i < num_pages; i++) { + if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status) + pending_page_num++; + } + + value->value.l_int = static_cast(pending_page_num); + } + } + } else + value->status = Smi2RdcError(ret); + break; + } + + case RDC_HEALTH_RETIRED_PAGE_LIMIT: + case RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT: + case RDC_HEALTH_POWER_THROTTLE_TIME: //gpu_metrics 1.6 + case RDC_HEALTH_THERMAL_THROTTLE_TIME: //gpu_metrics 1.6 default: break; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 0fbfad6920..520efbee6b 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -180,6 +180,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_EVNT_XGMI_1_THRPUT, RDC_EVNT_XGMI_2_THRPUT, RDC_EVNT_XGMI_3_THRPUT, RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID, RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY, + RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM, + RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, + RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME, }; std::copy(fields.begin(), fields.end(), field_ids); *field_count = fields.size(); diff --git a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc index b4c549da1d..ce7c0874d4 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc +++ 
b/projects/rdc/rdc_libs/rdc/src/RdcWatchTableImpl.cc @@ -41,10 +41,12 @@ namespace rdc { RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr, + const RdcMetricFetcherPtr& metric_fetcher, const RdcModuleMgrPtr& module_mgr, const RdcNotificationPtr& notif) : group_settings_(group_settings), cache_mgr_(cache_mgr), + metric_fetcher_(metric_fetcher), rdc_module_mgr_(module_mgr), notifications_(notif), last_cleanup_time_(0) {} @@ -373,6 +375,423 @@ rdc_status_t RdcWatchTableImpl::rdc_field_unwatch(rdc_gpu_group_t group_id, return update_field_in_table_when_unwatch(ite->first); } +rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int components, + rdc_field_grp_t* field_group_id) { + // set filed ids + std::vector field_ids{}; + if (components & RDC_HEALTH_WATCH_PCIE) { + field_ids.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT); + } + + if (components & RDC_HEALTH_WATCH_XGMI) { + field_ids.push_back(RDC_HEALTH_XGMI_ERROR); + } + + if (components & RDC_HEALTH_WATCH_MEM) { + field_ids.push_back(RDC_FI_ECC_UNCORRECT_TOTAL); + field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_NUM); + field_ids.push_back(RDC_HEALTH_PENDING_PAGE_NUM); + field_ids.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT); + field_ids.push_back(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT); + } + + if (components & RDC_HEALTH_WATCH_INFOROM) { + } + + if (components & RDC_HEALTH_WATCH_THERMAL) { + field_ids.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME); + } + + if (components & RDC_HEALTH_WATCH_POWER) { + field_ids.push_back(RDC_HEALTH_POWER_THROTTLE_TIME); + } + + if (0 == field_ids.size()) { + RDC_LOG(RDC_ERROR, "Fail to health set. 
The components must contain at least one watch."); + return RDC_ST_BAD_PARAMETER; + } + + const std::string field_group_name("health-field-group"); + return group_settings_->rdc_group_field_create(field_ids.size(), field_ids.data(), + field_group_name.c_str(), field_group_id); +} + +rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id, + unsigned int components) { + // remove old health for same group_id + rdc_health_clear(group_id); + + // create a field group base on the components + rdc_field_grp_t field_group_id; + rdc_status_t result = create_health_field_group(components, &field_group_id); + if (result != RDC_ST_OK) { + return result; + } + + // get field key + std::vector fields_in_watch; + result = get_fields_from_group(group_id, field_group_id, fields_in_watch); + if (result != RDC_ST_OK) { + return result; + } + + // add to the health watch table + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + HealthWatchTableEntry hentry{components, field_group_id, fields_in_watch}; + health_watch_table_.insert({group_id, hentry}); + } while (0); + + for (auto fields = fields_in_watch.begin(); fields != fields_in_watch.end(); fields++) { + // get initial values + rdc_field_value value; + result = metric_fetcher_->fetch_smi_field(fields->first, fields->second, &value); + if (result != RDC_ST_OK) + break; + + // set initial values to cache + result = cache_mgr_->rdc_health_set(group_id, fields->first, value); + if (result != RDC_ST_OK) + break; + } + + // Start to watch the fields and update fields per 1 second. 
+ result = rdc_field_watch(group_id, field_group_id, 1000000, 0, 0); + return result; +} + +rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id, + unsigned int *components) { + if (nullptr == components) + return RDC_ST_BAD_PARAMETER; + + std::lock_guard guard(watch_mutex_); + auto table_iter = health_watch_table_.find(group_id); + + // already in the health watch table + if (table_iter != health_watch_table_.end()) + *components = table_iter->second.components; + else + *components = 0; + + return RDC_ST_OK; +} + +bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index, + rdc_health_system_t component, + rdc_health_result_t health, + uint32_t err_code, + std::string err_msg, + rdc_health_incidents_t* incident, + rdc_health_response_t* response) { + bool result = false; + + incident->gpu_index = gpu_index; + incident->component = component; + incident->health = health; + incident->error.code = err_code; + strncpy_with_null(incident->error.msg, err_msg.c_str(), MAX_HEALTH_MSG_LENGTH); + + if (incident->health > response->overall_health) + response->overall_health = incident->health; + response->incidents_count++; + if (response->incidents_count >= HEALTH_MAX_ERROR_ITEMS) { + RDC_LOG(RDC_INFO, "Health incidents are full!"); + result = true; + } + + return (result); +} + +rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_field_t field, + rdc_field_value *start_value, + rdc_field_value *end_value) { + if ((nullptr == start_value) || (nullptr == end_value)) + return RDC_ST_BAD_PARAMETER; + + uint64_t start_timestamp = 0; + + //get the history data last 1 minute + start_timestamp = static_cast(time(nullptr) - 60) * 1000; + + //get the values of the field at the start_timestamp/end_timestampe + rdc_status_t result = cache_mgr_->rdc_health_get_values(group_id, + gpu_index, field, + start_timestamp, 0, + start_value, nullptr); + if (result != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, "Error get gpu: 
" << gpu_index << " field: " << field << " history data. Return: " << result); + return result; + } + + // get end values + result = metric_fetcher_->fetch_smi_field(gpu_index, field, end_value); + if (result != RDC_ST_OK) + RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " current data. Return: " << result); + + return result; +} + +rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_health_response_t* response) { + //get field start/end values + rdc_field_value start = {}, end = {}; + rdc_status_t result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_PCIE_REPLAY_COUNT, + &start, + &end); + if (result != RDC_ST_OK) + return result; + + uint64_t pcie_replay_count = end.value.l_int - start.value.l_int; + if (pcie_replay_count > PCIE_MAX_REPLAYS_PERMIN) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected "; + err_msg += std::to_string(pcie_replay_count); + err_msg += " PCIe replays per minute exceeding the max limit "; + err_msg += std::to_string(PCIE_MAX_REPLAYS_PERMIN); + err_msg += "."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_PCIE, + RDC_HEALTH_RESULT_WARN, + RDC_FR_PCI_REPLAY_RATE, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_health_response_t* response) { + //get field start/end values + rdc_field_value start = {}, end = {}; + rdc_status_t result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_XGMI_ERROR, + &start, + &end); + if (result != RDC_ST_OK) + return result; + + amdsmi_xgmi_status_t status = static_cast(end.value.l_int); + if (AMDSMI_XGMI_STATUS_NO_ERRORS != status) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + uint32_t err_code; + std::string err_msg = "Detected 
"; + if (AMDSMI_XGMI_STATUS_ERROR == status) { + err_msg += " a single XGMI error"; + err_code = RDC_FR_XGMI_SINGLE_ERROR; + } else { + err_msg += " multiple XGMI errors"; + err_code = RDC_FR_XGMI_MULTIPLE_ERROR; + } + err_msg += "."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_XGMI, + RDC_HEALTH_RESULT_FAIL, + err_code, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id, + uint32_t gpu_index, + rdc_health_response_t* response) { + //get field start/end values + rdc_field_value start = {}, end = {}; + rdc_status_t result = get_start_end_values(group_id, + gpu_index, + RDC_FI_ECC_UNCORRECT_TOTAL, + &start, + &end); + if (result != RDC_ST_OK) + return result; + + uint64_t ecc_uncorrectable_count = 0; + ecc_uncorrectable_count = end.value.l_int - start.value.l_int; + if (ecc_uncorrectable_count > 0) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected "; + err_msg += std::to_string(ecc_uncorrectable_count); + err_msg += " uncorrectable ECC error(s) in the last minute."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_MEM, + RDC_HEALTH_RESULT_FAIL, + RDC_FR_ECC_UNCORRECTABLE_DETECTED, + err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + + result = get_start_end_values(group_id, + gpu_index, + RDC_HEALTH_PENDING_PAGE_NUM, + &start, + &end); + if (result != RDC_ST_OK) + return result; + + uint64_t num_pages = end.value.l_int - start.value.l_int; + if (num_pages > 0) { + rdc_health_incidents_t *incident = &response->incidents[response->incidents_count]; + + std::string err_msg = "Detected "; + err_msg += std::to_string(num_pages); + err_msg += " pending retired page(s)."; + + //add incident + if (add_health_incident(gpu_index, + RDC_HEALTH_WATCH_MEM, + RDC_HEALTH_RESULT_WARN, + RDC_FR_PENDING_PAGE_RETIREMENTS, + 
err_msg, + incident, + response)) + return RDC_ST_MAX_LIMIT; + } + + //To do: RDC_FR_RETIRED_PAGES_LIMIT + //To do: RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT + + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id, + rdc_health_response_t *response) { + if (nullptr == response) + return RDC_ST_BAD_PARAMETER; + + unsigned int components = 0; + std::vector fields_in_watch; + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + auto health = health_watch_table_.find(group_id); + if (health == health_watch_table_.end()) + return RDC_ST_NOT_FOUND; + components = health->second.components; + fields_in_watch = health->second.fields; + } while (0); + + rdc_group_info_t ginfo; + rdc_status_t result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo); + if (result != RDC_ST_OK) + return result; + + for (auto fields = fields_in_watch.begin(); fields != fields_in_watch.end(); fields++) { + // get current values + rdc_field_value value; + result = metric_fetcher_->fetch_smi_field(fields->first, fields->second, &value); + if (result != RDC_ST_OK) + break; + + // set current values to cache + result = cache_mgr_->rdc_update_health_stats(group_id, fields->first, value); + if (result != RDC_ST_OK) + break; + } + + //init response + response->overall_health = RDC_HEALTH_RESULT_PASS; + response->incidents_count = 0; + + for (uint32_t gindex = 0; gindex < ginfo.count; gindex++) { + //PCIe + if (components & RDC_HEALTH_WATCH_PCIE) { + result = pcie_check(group_id, ginfo.entity_ids[gindex], response); + if (result == RDC_ST_MAX_LIMIT) + return result; + } + + //XGMI + if (components & RDC_HEALTH_WATCH_XGMI) { + result = xgmi_check(group_id, ginfo.entity_ids[gindex], response); + if (result == RDC_ST_MAX_LIMIT) + return result; + } + + //Memory + if (components & RDC_HEALTH_WATCH_MEM) { + result = memory_check(group_id, ginfo.entity_ids[gindex], response); + if (result == RDC_ST_MAX_LIMIT) + return result; + } + 
+ //InfoROM + if (components & RDC_HEALTH_WATCH_INFOROM) { + //To do: + return RDC_ST_NOT_SUPPORTED; + } + + //Thermal + if (components & RDC_HEALTH_WATCH_THERMAL) { + //To do: + return RDC_ST_NOT_SUPPORTED; + } + + //Power + if (components & RDC_HEALTH_WATCH_POWER) { + //To do: + return RDC_ST_NOT_SUPPORTED; + } + } //end of for gindex + + return RDC_ST_OK; +} + +rdc_status_t RdcWatchTableImpl::rdc_health_clear(rdc_gpu_group_t group_id) { + rdc_field_grp_t field_group_id; + + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + auto health = health_watch_table_.find(group_id); + if (health == health_watch_table_.end()) { + return RDC_ST_NOT_FOUND; + } + field_group_id = health->second.field_group_id; + } while (0); + + // at first, unwatch the old fields. + rdc_status_t result = rdc_field_unwatch(group_id, field_group_id); + if (result != RDC_ST_OK) { + return result; + } + + // destroy the old field group + group_settings_->rdc_group_field_destroy(field_group_id); + + do { //< lock guard for thread safe + std::lock_guard guard(watch_mutex_); + health_watch_table_.erase(group_id); + } while (0); + + result = cache_mgr_->rdc_health_clear(group_id); + + return RDC_ST_OK; +} + bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id, std::string& job_id) const { RdcFieldKey key{gpu_index, field_id}; @@ -388,6 +807,21 @@ bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field return false; } +bool RdcWatchTableImpl::is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id, + rdc_gpu_group_t& group_id) const { + RdcFieldKey key{gpu_index, field_id}; + + for (auto ite = health_watch_table_.begin(); ite != health_watch_table_.end(); ite++) { + auto& fields = ite->second.fields; + if (std::find(fields.begin(), fields.end(), key) != fields.end()) { + group_id = ite->first; + return true; + } + } + + return false; +} + rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* 
values, uint32_t num_values, void* user_data) { if (values == nullptr || user_data == nullptr) { @@ -421,6 +855,12 @@ rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uin if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) { watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, job_id, values[i].field_value); } + + // Update the health stats cache + rdc_gpu_group_t group_id; + if (watchTable->is_health_watch_field(gpu_index, field_id, group_id)) { + watchTable->cache_mgr_->rdc_update_health_stats(group_id, gpu_index, values[i].field_value); + } } return RDC_ST_OK; } @@ -492,6 +932,12 @@ rdc_status_t RdcWatchTableImpl::rdc_notif_update_cache(rdc_evnt_notification_t* if (is_job_watch_field(gpu_index, field_id, job_id)) { cache_mgr_->rdc_update_job_stats(gpu_index, job_id, events[i].field); } + + // Update the health stats cache + rdc_gpu_group_t group_id; + if (is_health_watch_field(gpu_index, field_id, group_id)) { + cache_mgr_->rdc_update_health_stats(group_id, gpu_index, events[i].field); + } } return RDC_ST_OK; } @@ -549,6 +995,7 @@ void RdcWatchTableImpl::debug_status() { RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size() << " watch_table_:" << watch_table_.size() << " job_watch_table_:" << job_watch_table_.size() + << " health_watch_table_:" << health_watch_table_.size() << " cache stats:" << cache_mgr_->get_cache_stats()); if (watch_table_.size() > 0) { @@ -575,6 +1022,18 @@ void RdcWatchTableImpl::debug_status() { jite->first << ": " << jite->second.group_id << " fields : " << strstream.str()); } + if (health_watch_table_.size() > 0) { + RDC_LOG(RDC_DEBUG, "health watch table details: "); + } + for (auto hite = health_watch_table_.begin(); hite != health_watch_table_.end(); hite++) { + std::stringstream strstream; + for (const auto& p : hite->second.fields) { + strstream << "<" << p.first << "," << p.second << "> "; + } + RDC_LOG(RDC_DEBUG, + "group id : " << hite->first << " components : " << 
hite->second.components << " fields : " << strstream.str()); + } + if (fields_to_watch_.size() > 0) { RDC_LOG(RDC_DEBUG, "fields to watch details:"); } diff --git a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index d20889f790..2e36f48691 100644 --- a/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/projects/rdc/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -870,5 +870,87 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_unregister(rdc_gpu_group_t group_i return error_handle(status, reply.status()); } +// Health RdcAPI +rdc_status_t RdcStandaloneHandler::rdc_health_set(rdc_gpu_group_t group_id, + unsigned int components) { + ::rdc::SetHealthRequest request; + ::rdc::SetHealthResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + request.set_components(components); + ::grpc::Status status = stub_->SetHealth(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + + return err_status; +} + +rdc_status_t RdcStandaloneHandler::rdc_health_get(rdc_gpu_group_t group_id, + unsigned int* components) { + if (!components) { + return RDC_ST_BAD_PARAMETER; + } + + ::rdc::GetHealthRequest request; + ::rdc::GetHealthResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + ::grpc::Status status = stub_->GetHealth(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + + *components = reply.components(); + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id, + rdc_health_response_t *response) { + if (!response) { + return RDC_ST_BAD_PARAMETER; + } + + ::rdc::CheckHealthRequest request; + ::rdc::CheckHealthResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + ::grpc::Status status = stub_->CheckHealth(&context, 
request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + + auto res = reply.response(); + response->overall_health = static_cast(res.overall_health()); + response->incidents_count = res.incidents_count(); + + for (int i = 0; i < res.incidents_size(); i++) { + const ::rdc::HealthIncidents& result = res.incidents(i); + rdc_health_incidents_t& to_result = response->incidents[i]; + + to_result.gpu_index = result.gpu_index(); + to_result.component = static_cast(result.component()); + to_result.health = static_cast(result.health()); + + //set error + to_result.error.code = result.error().code(); + strncpy_with_null(to_result.error.msg, result.error().msg().c_str(), MAX_HEALTH_MSG_LENGTH); + } + + return RDC_ST_OK; +} + +rdc_status_t RdcStandaloneHandler::rdc_health_clear(rdc_gpu_group_t group_id) { + ::rdc::ClearHealthRequest request; + ::rdc::ClearHealthResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + ::grpc::Status status = stub_->ClearHealth(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + + return RDC_ST_OK; +} + } // namespace rdc } // namespace amd diff --git a/projects/rdc/rdci/CMakeLists.txt b/projects/rdc/rdci/CMakeLists.txt index 262bf380ff..625eba25a0 100644 --- a/projects/rdc/rdci/CMakeLists.txt +++ b/projects/rdc/rdci/CMakeLists.txt @@ -68,6 +68,7 @@ set(RDCI_SRC_LIST "${SRC_DIR}/RdciGroupSubSystem.cc" "${SRC_DIR}/RdciStatsSubSystem.cc" "${SRC_DIR}/RdciPolicySubSystem.cc" + "${SRC_DIR}/RdciHealthSubSystem.cc" "${SRC_DIR}/RdciSubSystem.cc" "${SRC_DIR}/rdci.cc") message("RDCI_SRC_LIST=${RDCI_SRC_LIST}") diff --git a/projects/rdc/rdci/include/RdciHealthSubSystem.h b/projects/rdc/rdci/include/RdciHealthSubSystem.h new file mode 100644 index 0000000000..3c628c101c --- /dev/null +++ b/projects/rdc/rdci/include/RdciHealthSubSystem.h @@ -0,0 +1,77 @@ +/* 
+Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+#ifndef RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
+#define RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
+#include <string>
+
+#include <vector>
+
+#include "RdciSubSystem.h"
+
+namespace amd {
+namespace rdc {
+
+class RdciHealthSubSystem : public RdciSubSystem {  // handler for "rdci health"
+ public:
+  RdciHealthSubSystem();
+  ~RdciHealthSubSystem();
+  void parse_cmd_opts(int argc, char** argv) override;  // selects health_ops_
+  void process() override;                              // runs health_ops_
+
+ private:
+  void show_help() const;
+
+  void get_watches() const;   // HEALTH_FETCH: print the watch status
+  void set_watches() const;   // HEALTH_SET: apply components_
+  void health_check() const;  // HEALTH_CHECK: print the health report
+  void health_clear() const;  // HEALTH_CLEAR: disable all watches
+
+  std::string health_string(rdc_health_result_t health) const;
+  std::string component_string(rdc_health_system_t component) const;
+  void output_errstr(const std::string& input) const;  // word-wrapped table rows
+  unsigned int handle_one_component(rdc_health_response_t &response,  // returns incidents used
+                                    unsigned int start_index,
+                                    uint32_t gpu_index,
+                                    rdc_health_system_t component,
+                                    rdc_health_result_t &component_health,
+                                    std::vector<std::string> &err_str) const;
+  unsigned int handle_one_gpu(rdc_health_response_t &response,  // returns incidents used
+                              unsigned int start_index,
+                              uint32_t gpu_index) const;
+
+  enum OPERATIONS {
+    HEALTH_UNKNOWN = 0,  // no operation selected
+    HEALTH_HELP,         // -h / --help
+    HEALTH_FETCH,        // -f / --fetch
+    HEALTH_SET,          // -s / --set
+    HEALTH_CHECK,        // -c / --check
+    HEALTH_CLEAR,        // --clear
+  } health_ops_;
+
+  rdc_gpu_group_t group_id_;  // group given with -g / --group
+  unsigned int components_;   // RDC_HEALTH_WATCH_* mask given with -s / --set
+};
+
+}  // namespace rdc
+}  // namespace amd
+
+#endif  // RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
diff --git a/projects/rdc/rdci/src/RdciHealthSubSystem.cc b/projects/rdc/rdci/src/RdciHealthSubSystem.cc
new file mode 100644
index 0000000000..b2ba82251d
--- /dev/null
+++ b/projects/rdc/rdci/src/RdciHealthSubSystem.cc
@@ -0,0 +1,608 @@
+/*
+Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "RdciHealthSubSystem.h"
+
+#include <getopt.h>
+
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "common/rdc_utils.h"
+#include "rdc/rdc.h"
+#include "rdc_lib/RdcException.h"
+#include "rdc_lib/rdc_common.h"
+
+namespace amd {
+namespace rdc {
+
+// Start from a well-defined state: process() switches on health_ops_, which
+// would otherwise be indeterminate when no operation flag is supplied.
+RdciHealthSubSystem::RdciHealthSubSystem()
+    : health_ops_(HEALTH_UNKNOWN), group_id_(0), components_(0) {}
+
+RdciHealthSubSystem::~RdciHealthSubSystem() {}
+
+// Parse the "rdci health" command line.
+//
+// Stores the requested operation in health_ops_, the target group in
+// group_id_ and, for -s/--set, the component bit mask in components_.
+// -h/--help returns immediately, before the group id check.
+// Throws RdcException with RDC_ST_BAD_PARAMETER for an unknown option, a
+// non-numeric or missing group id, or an unknown/empty --set flag list, and
+// with RDC_ST_NOT_SUPPORTED for the not yet implemented 'i' and 't' flags.
+void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
+  const int HOST_OPTIONS = 1000;
+  const int JSON_OPTIONS = 1001;
+  const int CLEAR_OPTIONS = 1002;
+  // Switch-style options take no value, matching the short option string below.
+  const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
+                                        {"unauth", no_argument, nullptr, 'u'},
+                                        {"help", no_argument, nullptr, 'h'},
+                                        {"json", no_argument, nullptr, JSON_OPTIONS},
+                                        {"clear", no_argument, nullptr, CLEAR_OPTIONS},
+                                        {"group", required_argument, nullptr, 'g'},
+                                        {"fetch", no_argument, nullptr, 'f'},
+                                        {"set", required_argument, nullptr, 's'},
+                                        {"check", no_argument, nullptr, 'c'},
+                                        {nullptr, 0, nullptr, 0}};
+
+  bool group_id_set = false;
+  int option_index = 0, opt = 0;
+  std::string flags;
+  unsigned int components = 0;
+
+  while ((opt = getopt_long(argc, argv, "uhg:fs:c", long_options, &option_index)) != -1) {
+    switch (opt) {
+      case HOST_OPTIONS:
+        ip_port_ = optarg;
+        break;
+
+      case JSON_OPTIONS:
+        set_json_output(true);
+        break;
+
+      case CLEAR_OPTIONS:
+        health_ops_ = HEALTH_CLEAR;
+        break;
+
+      case 'u':
+        use_auth_ = false;
+        break;
+
+      case 'h':
+        health_ops_ = HEALTH_HELP;
+        return;
+
+      case 'g':
+        if (!IsNumber(optarg)) {
+          show_help();
+          throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number");
+        }
+        group_id_ = std::stoi(optarg);
+        group_id_set = true;
+        break;
+
+      case 'f':
+        health_ops_ = HEALTH_FETCH;
+        break;
+
+      case 's':
+        health_ops_ = HEALTH_SET;
+
+        flags = optarg;
+        for (unsigned int i = 0; i < flags.length(); i++) {
+          switch (flags.at(i)) {
+            case 'a':
+              components |= RDC_HEALTH_WATCH_PCIE;
+              components |= RDC_HEALTH_WATCH_XGMI;
+              components |= RDC_HEALTH_WATCH_MEM;
+              // TODO: add the remaining components once they are supported:
+              //components |= RDC_HEALTH_WATCH_INFOROM;
+              //components |= RDC_HEALTH_WATCH_THERMAL;
+              //components |= RDC_HEALTH_WATCH_POWER;
+              break;
+
+            case 'p':
+              components |= RDC_HEALTH_WATCH_PCIE;
+              break;
+
+            case 'm':
+              components |= RDC_HEALTH_WATCH_MEM;
+              break;
+
+            case 'i':
+              // TODO: not supported yet
+              //components |= RDC_HEALTH_WATCH_INFOROM;
+              throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
+              break;
+
+            case 't':
+              // TODO: not supported yet
+              //components |= RDC_HEALTH_WATCH_THERMAL;
+              //components |= RDC_HEALTH_WATCH_POWER;
+              throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
+              break;
+
+            case 'x':
+              components |= RDC_HEALTH_WATCH_XGMI;
+              break;
+
+            default:
+              throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid flags");
+              break;
+          }
+        }
+
+        if (0 == components) {
+          throw RdcException(RDC_ST_BAD_PARAMETER, "No flags");
+        } else
+          components_ = components;
+        break;
+
+      case 'c':
+        health_ops_ = HEALTH_CHECK;
+        break;
+
+      default:
+        show_help();
+        throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
+    }
+  }
+
+  if (!group_id_set) {
+    show_help();
+    throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id");
+  }
+}
+
+// Print the usage text for "rdci health" to stdout.
+// Prints nothing when JSON output is selected.
+void RdciHealthSubSystem::show_help() const {
+  if (is_json_output()) return;
+  std::cout << " health -- Used to manage the health watches of a group. \n"
+            << " The health of the GPUs in a group can then be monitored"
+            << " during a process.\n\n";
+  std::cout << "Usage\n";
+  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -s <flags>\n";
+  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -c\n";
+  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -f\n";
+  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> --clear\n";
+  std::cout << "\nFlags:\n";
+  show_common_usage();
+  std::cout << " --json Output using json.\n";
+  std::cout << " --clear Disable all watches being monitored.\n";
+  std::cout << " -g --group groupId The GPU group to query "
+            << "on the specified host.\n";
+  std::cout << " -f --fetch Fetch the current watch status.\n";
+  std::cout << " -s --set flags The list of components can be watched. "
+            << "[default = pm]\n";
+  std::cout << " a - watch all components\n";
+  std::cout << " p - watch PCIe\n";
+  std::cout << " m - watch Memory\n";
+  //std::cout << " i - watch infoROM\n";
+  //std::cout << " t - watch power and thermal\n";
+  std::cout << " x - watch XGMI\n";
+  std::cout << " -c --check Check to see if any errors or warnings have "
+            << "occurred in the currently monitored watches.\n";
+}
+
+// Query the watch mask of group_id_ via rdc_health_get() and print the
+// On/Off state of each supported component as a table or a JSON fragment.
+// Throws RdcException when the query fails.
+void RdciHealthSubSystem::get_watches() const {
+  rdc_status_t result;
+  unsigned int components = 0;
+  std::string on = "On";
+  std::string off = "Off";
+
+  result = rdc_health_get(rdc_handle_, group_id_, &components);
+  if (result != RDC_ST_OK) {
+    std::string error_msg = rdc_status_string(result);
+    if (result == RDC_ST_NOT_FOUND) {
+      error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(result, error_msg.c_str());
+  }
+
+  if (is_json_output()) {
+    std::cout << "\"heading\" : \"Health monitor systems status\", ";
+    std::cout << "\"body\" : [";
+    std::cout << "{\"Component\" : \"PCIe\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "\"},";
+    std::cout << "{\"Component\" : \"XGMI\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "\"},";
+    // Memory is the last entry emitted today, so it must not end with ','
+    // (JSON forbids a trailing comma before ']'). Put the separator back when
+    // the entries commented out below are enabled.
+    std::cout << "{\"Component\" : \"Memory\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "\"}";
+    /*std::cout << "{\"Component\" : \"InfoROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "\"},";
+    std::cout << "{\"Component\" : \"Thermal\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "\"},";
+    std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";*/
+    std::cout << "]";
+  } else {
+    std::cout << "Health monitor systems status:" << std::endl;
+    std::cout << "+--------------------+"  //"-" width :20
+              << "---------------------------------------------------+\n";  //-" width :51
+    std::cout << "|" << std::setw(20) << std::left << " PCIe" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "|\n";
+    std::cout << "|" << std::setw(20) << std::left << " XGMI" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
+    std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
+    /*std::cout << "|" << std::setw(20) << std::left << " InfoROM" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "|\n";
+    std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n";
+    std::cout << "|" << std::setw(20) << std::left << " Power" << "| "
+              << std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";*/
+    std::cout << "+--------------------+"  //"-" width :20
+              << "---------------------------------------------------+\n";  //-" width :51
+  }
+}
+
+// Apply the watch mask in components_ to group_id_ via rdc_health_set().
+// Throws RdcException when the call fails.
+void RdciHealthSubSystem::set_watches() const {
+  rdc_status_t result;
+
+  result = rdc_health_set(rdc_handle_, group_id_, components_);
+  if (result != RDC_ST_OK) {
+    std::string error_msg = rdc_status_string(result);
+    if (result == RDC_ST_NOT_FOUND) {
+      error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(result, error_msg.c_str());
+  }
+
+  std::cout << "Group " << group_id_ << " health monitor systems set successfully." << std::endl;
+}
+
+// Map a health result to the label shown in reports ("Unknown" if unrecognized).
+std::string RdciHealthSubSystem::health_string(rdc_health_result_t health) const {
+  switch (health) {
+    case RDC_HEALTH_RESULT_PASS:
+      return "Pass";
+
+    case RDC_HEALTH_RESULT_WARN:
+      return "Warning";
+
+    case RDC_HEALTH_RESULT_FAIL:
+      return "Fail";
+
+    default:
+      return "Unknown";
+  }
+}
+
+// Map a watched component to the label prefix used in reports ("Unknown" if unrecognized).
+std::string RdciHealthSubSystem::component_string(rdc_health_system_t component) const {
+  switch (component) {
+    case RDC_HEALTH_WATCH_PCIE:
+      return "PCIe system: ";
+
+    case RDC_HEALTH_WATCH_XGMI:
+      return "XGMI system: ";
+
+    case RDC_HEALTH_WATCH_MEM:
+      return "Memory system: ";
+
+    case RDC_HEALTH_WATCH_INFOROM:
+      return "Inforom system: ";
+
+    case RDC_HEALTH_WATCH_THERMAL:
+      return "Thermal system:";
+
+    case RDC_HEALTH_WATCH_POWER:
+      return "Power system: ";
+
+    default:
+      return "Unknown";
+  }
+}
+
+// Print `input` word-wrapped into the 60-column message cell of the report
+// table, one table row per wrapped line. Whitespace runs in `input` are
+// collapsed because the text is re-assembled word by word.
+void RdciHealthSubSystem::output_errstr(const std::string& input) const {
+  std::string word, line_str;
+  unsigned int width = 60, line_size = 0;
+  std::istringstream iss(input);
+
+  while (iss >> word) {
+    // Only flush a non-empty row; an over-long first word must not emit a blank row.
+    if (line_size > 0 && line_size + word.size() >= width) {
+      std::cout << "|" << std::setw(20) << " " << "| "
+                << std::setw(width) << std::left << line_str << "|\n";
+
+      // start the next row with the word that did not fit
+      line_str = word;
+      line_size = word.size();
+    } else {
+      if (line_size > 0) {
+        line_str += " ";
+        line_str += word;
+        line_size += word.size() + 1;
+      } else {
+        line_str += word;
+        line_size += word.size();
+      }
+    }
+  }  // end while
+
+  if (0 < line_size)
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(width) << std::left << line_str << "|\n";
+}
+
+// Collect the run of consecutive incidents, starting at start_index, that
+// belong to the same GPU and the same component.
+//
+// component_health is raised to the worst health seen in the run and every
+// incident message is appended to err_str (prefixed with " - ").
+// Returns the number of incidents consumed.
+unsigned int RdciHealthSubSystem::handle_one_component(rdc_health_response_t &response,
+                                                       unsigned int start_index,
+                                                       uint32_t gpu_index,
+                                                       rdc_health_system_t component,
+                                                       rdc_health_result_t &component_health,
+                                                       std::vector<std::string> &err_str) const {
+  unsigned int count = 0;
+  rdc_health_incidents_t *incident;
+  std::string all_err_str;
+
+  for (unsigned int i = start_index; i < response.incidents_count; i++) {
+    incident = &response.incidents[i];
+
+    // the run ends at the first incident for another GPU or component
+    if ((incident->gpu_index != gpu_index) ||
+        (incident->component != component))
+      break;
+
+    // keep the worst health seen so far
+    if (incident->health > component_health)
+      component_health = incident->health;
+
+    all_err_str = " - ";
+    all_err_str += incident->error.msg;
+    err_str.push_back(all_err_str);
+
+    count++;
+  }
+
+  return count;
+}
+
+// Print the report section for one GPU.
+//
+// Consumes the consecutive incidents for gpu_index starting at start_index,
+// groups them per component and prints the GPU health followed by each
+// component's health and messages (table rows or a JSON object).
+// Returns the number of incidents consumed.
+unsigned int RdciHealthSubSystem::handle_one_gpu(rdc_health_response_t &response,
+                                                 unsigned int start_index,
+                                                 uint32_t gpu_index) const {
+  unsigned int count = 0, comp_count = 0;
+  rdc_health_incidents_t *incident;
+  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;
+  std::string component_str, health_str, gpu_health_str;
+  typedef struct {
+    rdc_health_result_t component_health;
+    std::vector<std::string> err_str;
+  } component_detail_t;
+  std::map<rdc_health_system_t, component_detail_t> component_detail_map;
+
+  for (unsigned int i = start_index; i < response.incidents_count; i++) {
+    incident = &response.incidents[i];
+
+    // stop at the first incident that belongs to another GPU
+    if (incident->gpu_index != gpu_index)
+      break;
+
+    // gather the whole run of incidents for this component
+    component_detail_t detail;
+    detail.component_health = RDC_HEALTH_RESULT_PASS;
+    detail.err_str.clear();
+
+    comp_count = handle_one_component(response, i, gpu_index, incident->component, detail.component_health, detail.err_str);
+    i += comp_count - 1;  // the loop's i++ then steps past the run
+    count += comp_count;
+
+    // The GPU is as unhealthy as its worst component. Use the component
+    // result: it covers every incident in the run, not only the first one.
+    if (detail.component_health > gpu_health)
+      gpu_health = detail.component_health;
+
+    // Add to the component detail map
+    component_detail_map.insert({incident->component, detail});
+  }
+
+  // output gpu_index health result
+  gpu_health_str = health_string(gpu_health);
+
+  if (is_json_output()) {
+    std::cout << "{\"Index\" : \"" << std::to_string(gpu_index) << "\", ";
+    std::cout << "\"Health\" : \"" << gpu_health_str << "\", ";
+    std::cout << "\"Error\" : [";
+
+    unsigned int i = 0;
+    for (const auto& ite : component_detail_map) {
+      component_str = component_string(ite.first);
+      health_str = health_string(ite.second.component_health);
+
+      std::cout << "{\"Component\" : \"" << component_str << "\", ";
+      std::cout << "\"Health\" : \"" << health_str << "\", ";
+
+      std::cout << "\"Message\" : [";
+      unsigned int j = 0;
+      for (const auto& err_ite : ite.second.err_str) {
+        std::cout << "\"" << err_ite << "\"";
+        j++;
+        if (j < ite.second.err_str.size())
+          std::cout << ", ";
+      }
+      std::cout << "]}";  // end Message
+
+      i++;
+      if (i < component_detail_map.size()) {
+        std::cout << ", ";
+      }
+    }
+    std::cout << "]}";  // end Error
+  } else {
+    std::cout << "|" << std::setw(20) << std::left << " GPU ID: " + std::to_string(gpu_index) << "| "
+              << std::setw(60) << std::left << gpu_health_str << "|\n";
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(60) << " " << "|\n";
+
+    for (const auto& ite : component_detail_map) {
+      component_str = component_string(ite.first);
+      health_str = health_string(ite.second.component_health);
+      std::cout << "|" << std::setw(20) << " " << "| "
+                << std::setw(60) << std::left << component_str + health_str << "|\n";
+
+      for (const auto& msg : ite.second.err_str)
+        output_errstr(msg);
+
+      std::cout << "|" << std::setw(20) << " " << "| "
+                << std::setw(60) << " " << "|\n";
+    }
+    std::cout << "+--------------------+-"  //"-" width :20
+              << "------------------------------------------------------------+\n";  //-" width :60
+  }
+
+  return count;
+}
+
+// Run a health check for group_id_ and print the report.
+//
+// Requires at least one watch to be enabled; prints the overall health and
+// then one section per GPU that reported incidents.
+// Throws RdcException if the watch query or the health check fails, or if
+// no watch is enabled.
+void RdciHealthSubSystem::health_check() const {
+  unsigned int components = 0;
+  rdc_status_t result;
+  rdc_health_response_t response;
+
+  result = rdc_health_get(rdc_handle_, group_id_, &components);
+  if (result != RDC_ST_OK) {
+    std::string error_msg = rdc_status_string(result);
+    if (result == RDC_ST_NOT_FOUND) {
+      error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(result, error_msg.c_str());
+  }
+
+  if (0 == components) {
+    std::string error_msg = "Health watches not enable, please enable watches first.";
+    throw RdcException(RDC_ST_UNKNOWN_ERROR, error_msg.c_str());
+  }
+
+  result = rdc_health_check(rdc_handle_, group_id_, &response);
+  if (result != RDC_ST_OK) {
+    throw RdcException(result, rdc_status_string(result));
+  }
+
+  // output headline
+  std::string overall_str = health_string(response.overall_health);
+  if (is_json_output()) {
+    std::cout << "\"heading\" : \"Health monitor report\", ";
+    std::cout << "\"body\" : ";
+    std::cout << "{\"Group\" : \"" << std::to_string(group_id_) << "\", ";
+    std::cout << "\"Overall Health\" : \"" << overall_str << "\", ";
+    std::cout << "\"GPU\" : [";
+  } else {
+    std::cout << "Health monitor report:" << std::endl;
+    std::cout << "+--------------------+-"  //"-" width :20
+              << "------------------------------------------------------------+\n";  //-" width :60
+    std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id_) << "| "
+              << std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
+    std::cout << "+====================+="  //"=" width :20
+              << "============================================================+\n";  //"=" width :60
+  }
+
+  // output the health of each GPU; incidents of one GPU are consumed together
+  unsigned int index = 0;
+  while (index < response.incidents_count) {
+    uint32_t gpu_index = response.incidents[index].gpu_index;
+
+    unsigned int count = handle_one_gpu(response, index, gpu_index);
+    index += count;
+    if (is_json_output() && (index < response.incidents_count))
+      std::cout << ",";
+  }
+
+  if (is_json_output())
+    std::cout << "]}";  // end Group
+}
+
+// Disable every health watch of group_id_ via rdc_health_clear().
+// Throws RdcException when the call fails.
+void RdciHealthSubSystem::health_clear() const {
+  rdc_status_t result;
+
+  result = rdc_health_clear(rdc_handle_, group_id_);
+  if (result != RDC_ST_OK) {
+    std::string error_msg = rdc_status_string(result);
+    if (result == RDC_ST_NOT_FOUND) {
+      error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(result, error_msg.c_str());
+  }
+
+  std::cout << "Clear Group " << group_id_ << " all health monitor systems." << std::endl;
+}
+
+// Execute the operation selected by parse_cmd_opts(); prints the usage text
+// when help was requested or no operation was selected.
+void RdciHealthSubSystem::process() {
+  switch (health_ops_) {
+    case HEALTH_HELP:
+    case HEALTH_UNKNOWN:
+      show_help();
+      break;
+
+    case HEALTH_FETCH:
+      get_watches();
+      break;
+
+    case HEALTH_SET:
+      set_watches();
+      break;
+
+    case HEALTH_CHECK:
+      health_check();
+      break;
+
+    case HEALTH_CLEAR:
+      health_clear();
+      break;
+
+    default:
+      show_help();
+      break;
+  }
+}
+
+}  // namespace rdc
+}  // namespace amd
diff --git a/projects/rdc/rdci/src/rdci.cc b/projects/rdc/rdci/src/rdci.cc
index 8a5179f7fc..399585cf6d 100644
--- a/projects/rdc/rdci/src/rdci.cc
+++ b/projects/rdc/rdci/src/rdci.cc
@@ -31,6 +31,7 @@ THE SOFTWARE.
 #include "RdciGroupSubSystem.h"
 #include "RdciStatsSubSystem.h"
 #include "RdciPolicySubSystem.h"
+#include "RdciHealthSubSystem.h"
 #include "rdc/rdc.h"
 #include "rdc_lib/RdcException.h"
 #include "rdc_lib/rdc_common.h"
@@ -50,7 +51,7 @@ int main(int argc, char** argv) {
   const std::string usage_help =
       "Usage:\trdci |\n"
       "subsystem: \n"
-      " discovery, dmon, group, fieldgroup, stats, diag, policy\n"
+      " discovery, dmon, group, fieldgroup, stats, diag, policy, health\n"
       "options: \n"
       " -v(--version) : Print client version information only\n";
 
@@ -81,6 +82,8 @@ int main(int argc, char** argv) {
     subsystem.reset(new amd::rdc::RdciGroupSubSystem());
   } else if (subsystem_name == "fieldgroup") {
     subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem());
+  } else if (subsystem_name == "health") {
+    subsystem.reset(new amd::rdc::RdciHealthSubSystem());
   } else if (subsystem_name == "stats") {
     subsystem.reset(new amd::rdc::RdciStatsSubSystem());
   } else if (subsystem_name == "policy") {
diff --git a/projects/rdc/server/include/rdc/rdc_api_service.h b/projects/rdc/server/include/rdc/rdc_api_service.h
index ec000f4d12..208a5a5652 100644
--- a/projects/rdc/server/include/rdc/rdc_api_service.h
+++ b/projects/rdc/server/include/rdc/rdc_api_service.h
@@ -153,6 +153,23 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
                                   const ::rdc::UnRegisterPolicyRequest* request,
                                   ::rdc::UnRegisterPolicyResponse* reply) override;
 
+  // Health watch RPCs; each forwards to the matching rdc_health_* call.
+  ::grpc::Status SetHealth(::grpc::ServerContext* context,
+                           const ::rdc::SetHealthRequest* request,
+                           ::rdc::SetHealthResponse* reply) override;
+
+  ::grpc::Status GetHealth(::grpc::ServerContext* context,
+                           const ::rdc::GetHealthRequest* request,
+                           ::rdc::GetHealthResponse* reply) override;
+
+  ::grpc::Status CheckHealth(::grpc::ServerContext* context,
+                             const ::rdc::CheckHealthRequest* request,
+                             ::rdc::CheckHealthResponse* reply) override;
+
+  ::grpc::Status ClearHealth(::grpc::ServerContext* context,
+                             const ::rdc::ClearHealthRequest* request,
+                             ::rdc::ClearHealthResponse* reply) override;
+
  private:
   bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
   rdc_handle_t rdc_handle_;
diff --git a/projects/rdc/server/src/rdc_api_service.cc b/projects/rdc/server/src/rdc_api_service.cc
index ace5aa474c..b6ff07b4da 100644
--- a/projects/rdc/server/src/rdc_api_service.cc
+++ b/projects/rdc/server/src/rdc_api_service.cc
@@ -947,5 +947,98 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
   return ::grpc::Status::OK;
 }
 
+// gRPC handler: enable the requested health watches on a group (rdc_health_set).
+::grpc::Status RdcAPIServiceImpl::SetHealth(::grpc::ServerContext* context,
+                                            const ::rdc::SetHealthRequest* request,
+                                            ::rdc::SetHealthResponse* reply) {
+  (void)(context);
+  if (!reply || !request) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  rdc_status_t result = rdc_health_set(rdc_handle_, request->group_id(), request->components());
+
+  reply->set_status(result);
+
+  return ::grpc::Status::OK;
+}
+
+// gRPC handler: report which health watches are enabled on a group (rdc_health_get).
+::grpc::Status RdcAPIServiceImpl::GetHealth(::grpc::ServerContext* context,
+                                            const ::rdc::GetHealthRequest* request,
+                                            ::rdc::GetHealthResponse* reply) {
+  (void)(context);
+  if (!reply || !request) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  unsigned int components = 0;
+  rdc_status_t result = rdc_health_get(rdc_handle_, request->group_id(), &components);
+
+  reply->set_status(result);
+  if (result != RDC_ST_OK) {
+    return ::grpc::Status::OK;
+  }
+
+  reply->set_components(components);
+
+  return ::grpc::Status::OK;
+}
+
+// gRPC handler: run rdc_health_check() on a group and copy the overall result
+// and every incident into the reply. Incidents are only filled in on success.
+::grpc::Status RdcAPIServiceImpl::CheckHealth(::grpc::ServerContext* context,
+                                              const ::rdc::CheckHealthRequest* request,
+                                              ::rdc::CheckHealthResponse* reply) {
+  (void)(context);
+  if (!reply || !request) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  rdc_health_response_t response;
+  rdc_status_t result = rdc_health_check(rdc_handle_, request->group_id(), &response);
+
+  reply->set_status(result);
+  if (result != RDC_ST_OK) {
+    return ::grpc::Status::OK;
+  }
+
+  ::rdc::HealthResponse* to_response = reply->mutable_response();
+  to_response->set_overall_health(response.overall_health);
+  to_response->set_incidents_count(response.incidents_count);
+
+  for (uint32_t i = 0; i < response.incidents_count; i++) {
+    const rdc_health_incidents_t& incident = response.incidents[i];
+    ::rdc::HealthIncidents* to_incidents = to_response->add_incidents();
+
+    to_incidents->set_gpu_index(incident.gpu_index);
+    to_incidents->set_component(incident.component);
+    to_incidents->set_health(incident.health);
+
+    // error code and message
+    auto to_error = to_incidents->mutable_error();
+    to_error->set_code(incident.error.code);
+    to_error->set_msg(incident.error.msg);
+  }
+
+  return ::grpc::Status::OK;
+}
+
+// gRPC handler: disable all health watches on a group (rdc_health_clear).
+::grpc::Status RdcAPIServiceImpl::ClearHealth(::grpc::ServerContext* context,
+                                              const ::rdc::ClearHealthRequest* request,
+                                              ::rdc::ClearHealthResponse* reply) {
+  (void)(context);
+  if (!reply || !request) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  rdc_status_t result = rdc_health_clear(rdc_handle_, request->group_id());
+
+  reply->set_status(result);
+
+  return ::grpc::Status::OK;
+}
+
 }  // namespace rdc
 }  // namespace amd