Background health check

Add the RdcSmiHealth module, which calls rocm_smi_lib. It will support the following health checks: - XGMI error detected - PCIE replay count detected - Memory check - InfoROM check - Power/Thermal check The gRPC client-side and server-side health functions are added. The health module is added to rdci. At present, XGMI/PCIE and a part of Memory have been implemented. Others will be added as soon as possible. Change-Id: I1bd99290bdc7dea733f21a41a8c4bcefb2138112 [ROCm/rdc commit: 853d3b0cc5]
2024-10-23 16:42:24 +08:00
@@ -161,3 +161,12 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp",
 FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET,   "GPU reset is about to occur",                 "GPU_PRE_RESET",    false)
 FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET,  "GPU reset just occurred",                     "GPU_POST_RESET",   false)
 FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG,   "GPU ring hang just occured",                  "RING_HANG",        false)
+// RDC health related fields: one descriptor per RDC_HEALTH_* value of rdc_field_t
+FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR,               "XGMI one or more errors detected",           "XGMI_ERROR",               true)
+FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT,        "Total PCIE replay count",                    "PCIE_REPLAY_COUNT",        true)
+FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_NUM,         "Retired page number",                        "RETIRED_PAGE_NUM",         true)
+FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM,         "Pending page number",                        "PENDING_PAGE_NUM",         true)
+FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT,       "Retired page limit",                         "RETIRED_PAGE_LIMIT",       false)
+FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit",                   "UNCORRECTABLE_PAGE_LIMIT", false)
+FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME,      "Power throttle status counter",              "POWER_THROTTLE_TIME",      false)
+FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME,    "Total time(ms) in thermal throttle status",  "THERMAL_THROTTLE_TIME",    false)
@@ -120,6 +120,12 @@ set(POLICY_EXAMPLE_EXE "policy")
 add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
 target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap)

+set(HEALTH_EXAMPLE_SRC_LIST "health_example.cc")  # source file of the health example
+cmake_print_variables(HEALTH_EXAMPLE_SRC_LIST)  # show the source list in the configure output
+set(HEALTH_EXAMPLE_EXE "health")  # name of the example executable
+add_executable(${HEALTH_EXAMPLE_EXE} "${HEALTH_EXAMPLE_SRC_LIST}")
+target_link_libraries(${HEALTH_EXAMPLE_EXE} pthread dl rdc_bootstrap)  # same libraries as the policy example
+
 message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
 message("                    Finished Cmake Example                          ")
 message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
@@ -0,0 +1,359 @@
+/*
+Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <unistd.h>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "rdc/rdc.h"
+
+// Fetches the watched health components of group_id with rdc_health_get() and,
+// on RDC_ST_OK, prints an On/Off table for PCIe, XGMI and Memory to stdout.
+// Returns the rdc_health_get() status; nothing is printed on failure.
+rdc_status_t get_watches(rdc_handle_t rdc_handle, rdc_gpu_group_t group_id) {
+  unsigned int watched;
+  const rdc_status_t status = rdc_health_get(rdc_handle, group_id, &watched);
+  if (status != RDC_ST_OK) return status;
+
+  const char* border = "+--------------------+"
+                       "---------------------------------------------------+\n";
+  const auto print_row = [watched](const char* label, unsigned int mask) {
+    std::cout << "|" << std::setw(20) << std::left << label << "| "
+              << std::setw(50) << std::left << ((watched & mask) ? "On" : "Off") << "|\n";
+  };
+  std::cout << "Health monitor systems status:" << std::endl;
+  std::cout << border;
+  print_row(" PCIe", RDC_HEALTH_WATCH_PCIE);
+  print_row(" XGMI", RDC_HEALTH_WATCH_XGMI);
+  print_row(" Memory", RDC_HEALTH_WATCH_MEM);
+  std::cout << border;
+  return status;
+}
+
+// Maps an rdc_health_result_t to its display label: "Pass", "Warning" or
+// "Fail"; any other value yields "Unknown".
+std::string health_string(rdc_health_result_t health) {
+  if (health == RDC_HEALTH_RESULT_PASS) {
+    return "Pass";
+  }
+  if (health == RDC_HEALTH_RESULT_WARN) {
+    return "Warning";
+  }
+  if (health == RDC_HEALTH_RESULT_FAIL) {
+    return "Fail";
+  }
+
+  return "Unknown";
+}
+
+// Returns the label prefix for one health component, e.g. "PCIe system: ".
+// The caller appends the health text straight after the label, so every
+// label has to end in ": ".
+std::string component_string(rdc_health_system_t component) {
+  switch (component) {
+    case RDC_HEALTH_WATCH_PCIE:
+      return "PCIe system: ";
+    case RDC_HEALTH_WATCH_XGMI:
+      return "XGMI system: ";
+    case RDC_HEALTH_WATCH_MEM:
+      return "Memory system: ";
+    case RDC_HEALTH_WATCH_INFOROM:
+      return "Inforom system: ";
+    case RDC_HEALTH_WATCH_THERMAL:
+      // Trailing space added; this used to print as "Thermal system:Pass".
+      return "Thermal system: ";
+    case RDC_HEALTH_WATCH_POWER:
+      return "Power system: ";
+
+    default:
+      // Any value that is not exactly one of the watch flags above.
+      return "Unknown system: ";
+  }
+}
+
+// Word-wraps `input` on whitespace and prints it to stdout as table rows: an
+// empty 20-column cell followed by a 60-column text cell. The current row is
+// flushed when appending the next word would bring it to 60 characters; a
+// single word of 60 or more characters gets a row to itself.
+// Empty or all-whitespace input prints nothing.
+void output_errstr(const std::string& input) {
+  const unsigned int width = 60;
+  std::istringstream iss(input);
+  std::string word, line_str;
+  const auto emit_row = [&](const std::string& text) {
+    std::cout << "|" << std::setw(20) << std::left << " " << "| "
+              << std::setw(width) << std::left << text << "|\n";
+  };
+
+  while (iss >> word) {
+    if (line_str.empty()) {
+      // First word of a row is always accepted. The previous code flushed the
+      // still-empty row when this word alone was >= width, printing a blank row.
+      line_str = word;
+    } else if (line_str.size() + word.size() >= width) {
+      emit_row(line_str);
+      line_str = word;
+    } else {
+      line_str += " ";
+      line_str += word;
+    }
+  }
+  if (!line_str.empty()) emit_row(line_str);
+}
+
+// Consumes the run of consecutive incidents, beginning at start_index, that
+// belong to the given gpu_index and component. component_health is raised to
+// the largest health value found in the run, and one " - <msg>" string per
+// incident is appended to err_str.
+// Returns the length of the run (0 if the incident at start_index differs).
+unsigned int handle_one_component(rdc_health_response_t &response,
+                                  unsigned int start_index,
+                                  uint32_t gpu_index,
+                                  rdc_health_system_t component,
+                                  rdc_health_result_t &component_health,
+                                  std::vector<std::string> &err_str) {
+  unsigned int pos = start_index;
+  while (pos < response.incidents_count) {
+    const rdc_health_incidents_t &entry = response.incidents[pos];
+    // The run stops at the first incident of another GPU or component.
+    const bool same_gpu = (entry.gpu_index == gpu_index);
+    const bool same_component = (entry.component == component);
+    if (!(same_gpu && same_component)) {
+      break;
+    }
+
+    if (entry.health > component_health) {
+      component_health = entry.health;
+    }
+
+    err_str.push_back(std::string(" - ") + entry.error.msg);
+    ++pos;
+  }
+
+  return pos - start_index;
+}
+
+// Prints the report section for one GPU. Starting at start_index, consumes the
+// consecutive incidents whose gpu_index matches, groups them by component, and
+// writes to stdout: a "GPU ID" row with the GPU's health, then each
+// component's health with its word-wrapped messages, then a closing border.
+// Returns the number of incidents consumed so the caller can skip past them.
+unsigned int handle_one_gpu(rdc_health_response_t &response,
+                            unsigned int start_index,
+                            uint32_t gpu_index) {
+  struct ComponentDetail {
+    rdc_health_result_t component_health = RDC_HEALTH_RESULT_PASS;
+    std::vector<std::string> err_str;
+  };
+  std::map<rdc_health_system_t, ComponentDetail> component_detail_map;
+  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;
+  unsigned int count = 0;
+
+  while (start_index + count < response.incidents_count) {
+    const unsigned int i = start_index + count;
+    const rdc_health_incidents_t &incident = response.incidents[i];
+
+    if (incident.gpu_index != gpu_index) break;  // incidents of the next GPU start here
+
+    // operator[] instead of insert(): a component whose incidents are not
+    // contiguous now accumulates into one entry; insert() silently dropped
+    // every run after the first.
+    ComponentDetail &detail = component_detail_map[incident.component];
+    const unsigned int comp_count =
+        handle_one_component(response, i, gpu_index, incident.component,
+                             detail.component_health, detail.err_str);
+
+    // Fold in the health of the whole run. Looking only at the run's first
+    // incident under-reported the GPU whenever a later incident was worse.
+    if (detail.component_health > gpu_health) {
+      gpu_health = detail.component_health;
+    }
+
+    // comp_count is at least 1: the incident at i always matches itself.
+    count += comp_count;
+  }
+
+  // GPU headline row followed by an empty spacer row.
+  std::cout << "|" << std::setw(20) << std::left << " GPU ID: " + std::to_string(gpu_index)
+            << "| " << std::setw(60) << std::left << health_string(gpu_health) << "|\n";
+  std::cout << "|" << std::setw(20) << " " << "| "
+            << std::setw(60) << " " << "|\n";
+
+  for (const auto &entry : component_detail_map) {
+    const ComponentDetail &detail = entry.second;
+    std::cout << "|" << std::setw(20) << " " << "| " << std::setw(60) << std::left
+              << component_string(entry.first) + health_string(detail.component_health)
+              << "|\n";
+
+    for (const std::string &msg : detail.err_str) output_errstr(msg);
+
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(60) << " " << "|\n";
+  }
+  std::cout << "+--------------------+-"
+            << "------------------------------------------------------------+\n";
+
+  return count;
+}
+
+// Example client for the RDC health API: connects to the rdcd at
+// 127.0.0.1:50051, creates a group holding GPU 0, enables the PCIe, XGMI and
+// Memory health watches, waits 5 seconds, prints the health report, then
+// clears the watches and destroys the group. Exits with the first failing
+// rdc_status_t, or RDC_ST_OK on success.
+int main(int, char**) {
+  rdc_status_t result;
+  rdc_status_t destroy_result;
+  rdc_handle_t rdc_handle;
+  bool connected = false;  // rdc_handle is only usable once rdc_connect succeeds
+  char hostIpAddress[] = {"127.0.0.1:50051"};
+  char group_name[] = {"healthgroup1"};
+
+  std::cout << "Start rdci in Standalone mode\n";
+
+  // Init the rdc
+  result = rdc_init(0);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
+    goto cleanup;
+  }
+  std::cout << "RDC Initialized.\n";
+
+  result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto cleanup;
+  }
+  connected = true;
+
+  // Now we can use the same API for standalone
+  // (1) create group and add GPUs
+  rdc_gpu_group_t group_id;
+  result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error creating group. Return: " << rdc_status_string(result) << std::endl;
+    goto cleanup;
+  }
+  std::cout << "Created the GPU group " << group_id << std::endl;
+
+  result = rdc_group_gpu_add(rdc_handle, group_id, 0);  // Add GPU 0
+  if (result != RDC_ST_OK) {
+    std::cout << "Error adding group. Return: " << rdc_status_string(result) << std::endl;
+    goto destroygroup;
+  }
+
+  rdc_device_attributes_t attribute;
+  result = rdc_device_get_attributes(rdc_handle, 0, &attribute);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result) << std::endl;
+    goto destroygroup;
+  }
+  std::cout << "Add GPU 0: " << attribute.device_name << " to group " << group_id << std::endl;
+
+  // (2) get current health watches before setting
+  result = get_watches(rdc_handle, group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+
+  // (3) set health watches.
+  unsigned int components;
+  components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM;
+  result = rdc_health_set(rdc_handle, group_id, components);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error setting health watches. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+  std::cout << "Set health watches to all." << std::endl;
+
+  // (4) get current health watches after setting
+  result = get_watches(rdc_handle, group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
+              << std::endl;
+    goto destroygroup;
+  }
+
+  std::cout << "Start to health monitor group:" << group_id << std::endl;
+  std::cout << "Sleep a few seconds before retreive the data ...\n";
+  // In standalone mode the daemon updates and caches the samples, so the client
+  // just waits. sleep() replaces usleep(5000000): POSIX limits usleep to < 1,000,000.
+  sleep(5);
+
+  // (5) Get the health stats
+  rdc_health_response_t response;
+  result = rdc_health_check(rdc_handle, group_id, &response);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error health check. Return: " << rdc_status_string(result) << std::endl;
+    goto destroygroup;
+  } else {
+    // output headline
+    std::string overall_str = health_string(response.overall_health);
+    std::cout << "Health monitor report:" << std::endl;
+    std::cout << "+--------------------+-" //"-" width :20
+              << "------------------------------------------------------------+\n"; //-" width :60
+    std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id) << "| "
+              << std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
+    std::cout << "+====================+=" //"=" width :20
+              << "============================================================+\n"; //"=" width :60
+
+    // output the health of each GPU; handle_one_gpu returns how many incidents it consumed
+    unsigned int index = 0;
+    while (index < response.incidents_count) {
+      uint32_t gpu_index = response.incidents[index].gpu_index;
+      index += handle_one_gpu(response, index, gpu_index);
+    }
+  }
+
+  // (6) Clear the health
+  result = rdc_health_clear(rdc_handle, group_id);
+  if (result != RDC_ST_OK) {
+    std::cout << "Error clear health. Return: " << rdc_status_string(result) << std::endl;
+    goto destroygroup;
+  }
+  std::cout << "Clear Group " << group_id << " all health monitor systems." << std::endl;
+
+destroygroup:
+  // Delete the GPU group. Its status goes into destroy_result so that an
+  // earlier failure held in result is not overwritten by a successful destroy.
+  destroy_result = rdc_group_gpu_destroy(rdc_handle, group_id);
+  if (destroy_result != RDC_ST_OK) {
+    std::cout << "Error delete GPU group. Return: " << rdc_status_string(destroy_result)
+              << std::endl;
+    if (result == RDC_ST_OK) result = destroy_result;
+    goto cleanup;
+  }
+  std::cout << "Deleted the GPU group " << group_id << std::endl;
+
+cleanup:
+  std::cout << "Cleaning up.\n";
+  if (connected) rdc_disconnect(rdc_handle);
+  rdc_shutdown();
+  return result;
+}
@@ -335,6 +335,18 @@ typedef enum {
  RDC_EVNT_NOTIF_RING_HANG,         //!< GPU ring hang just occurred

  RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG,
+
+  /**
+   * @brief RDC health related fields
+   */
+  RDC_HEALTH_XGMI_ERROR = 3000,       //!< XGMI one or more errors detected
+  RDC_HEALTH_PCIE_REPLAY_COUNT,       //!< Total PCIE replay count
+  RDC_HEALTH_RETIRED_PAGE_NUM,        //!< Retired page number
+  RDC_HEALTH_PENDING_PAGE_NUM,        //!< Pending page number
+  RDC_HEALTH_RETIRED_PAGE_LIMIT,      //!< The threshold of retired page
+  RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,//!< The threshold of uncorrectable page
+  RDC_HEALTH_POWER_THROTTLE_TIME,     //!< Power throttle status counter
+  RDC_HEALTH_THERMAL_THROTTLE_TIME,   //!< Total time in thermal throttle status (microseconds)
 } rdc_field_t;

 // even and odd numbers are used for correctable and uncorrectable errors
@@ -589,6 +601,81 @@ typedef struct {
  rdc_policy_action_t action; //!< Action to take
 } rdc_policy_t;

+/**
+ * @brief type of health watches
+ */
+typedef enum {
+    RDC_HEALTH_WATCH_PCIE       = 0x1,   //!< PCIe system watches
+    RDC_HEALTH_WATCH_XGMI       = 0x2,   //!< XGMI system watches
+    RDC_HEALTH_WATCH_MEM        = 0x4,   //!< Memory watches
+    RDC_HEALTH_WATCH_INFOROM    = 0x8,   //!< Inforom watches
+    RDC_HEALTH_WATCH_THERMAL    = 0x10,  //!< Temperature watches
+    RDC_HEALTH_WATCH_POWER      = 0x20,  //!< Power watches
+} rdc_health_system_t;
+
+/**
+ * @brief type of health result
+ */
+typedef enum {
+  RDC_HEALTH_RESULT_PASS,  //!< The health test pass
+  RDC_HEALTH_RESULT_WARN,  //!< The health test has warnings
+  RDC_HEALTH_RESULT_FAIL   //!< The health test fail
+} rdc_health_result_t;
+
+/**
+ * @brief The maximum length of the health messages
+ */
+#define MAX_HEALTH_MSG_LENGTH 4096
+
+/**
+ * 8 replays per minute is the maximum recommended
+ */
+#define PCIE_MAX_REPLAYS_PERMIN 8
+
+// The error code set at rdc_health_incidents_t.error.code
+typedef enum {
+  RDC_FR_PCI_REPLAY_RATE = 1000,
+  RDC_FR_ECC_UNCORRECTABLE_DETECTED = 1001,
+  RDC_FR_PENDING_PAGE_RETIREMENTS = 1002,
+  RDC_FR_RETIRED_PAGES_LIMIT = 1003,
+  RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT = 1004,
+  RDC_FR_CLOCKS_THROTTLE_THERMAL = 1005,
+  RDC_FR_CLOCKS_THROTTLE_POWER = 1006,
+  RDC_FR_XGMI_SINGLE_ERROR = 1007,
+  RDC_FR_XGMI_MULTIPLE_ERROR = 1008,
+  RDC_FR_CORRUPT_INFOROM = 1009
+} rdc_health_error_code_t;
+
+/**
+ * @brief details of the health errors
+ */
+typedef struct {
+  char      msg[MAX_HEALTH_MSG_LENGTH];  //!< The test result details
+  uint32_t  code;                        //!< The low level error code
+} rdc_health_detail_t;
+
+/**
+ * @brief details of a single health incident
+ */
+typedef struct {
+    uint32_t              gpu_index;  //!< which GPU in this group has the issue
+    rdc_health_system_t   component;  //!< which component has the issue
+    rdc_health_result_t   health;     //!< health diagnosis of this incident
+    rdc_health_detail_t   error;      //!< The details of the error; error.code is an rdc_health_error_code_t
+} rdc_health_incidents_t;
+
+
+#define HEALTH_MAX_ERROR_ITEMS 64  //!< Capacity of rdc_health_response_t::incidents
+
+/**
+ * @brief The health responses for test cases
+ */
+typedef struct {
+    rdc_health_result_t       overall_health;                     //!< The overall health of this entire host
+    unsigned int              incidents_count;                    //!< The number of health incidents reported in this struct
+    rdc_health_incidents_t    incidents[HEALTH_MAX_ERROR_ITEMS];  //!< Report of the errors detected
+} rdc_health_response_t;
+
 /**
 *  @brief Initialize ROCm RDC.
 *
@@ -1274,6 +1361,72 @@ rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t grou
 */
 rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);

+/**
+ *  @brief enable the health check for a group
+ *
+ *  @details For each group, only one parameter can be set. If you want to
+ *  clear the setting for a group, set components == 0x0.
+ *
+ *  @param[in] p_rdc_handle The RDC handler.
+ *
+ *  @param[in] group_id The GPU group id.
+ *
+ *  @param[in] components  The list of components that should be enabled for health check
+ *  for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
+ *
+ *  @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                            unsigned int components);
+
+/**
+ *  @brief get the health check settings of a group
+ *
+ *  @details Get the components that have health checks enabled for a group.
+ *
+ *  @param[in] p_rdc_handle The RDC handler.
+ *
+ *  @param[in] group_id The GPU group id.
+ *
+ *  @param[out] components  The list of components that are enabled for health check,
+ *  for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER.
+ *  If it is 0x0, then the health check is not set for the group yet.
+ *
+ *  @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                            unsigned int* components);
+
+/**
+ *  @brief Check health watch results
+ *
+ *  @details Reports the health incidents, if any, found for the group.
+ *  For each incident, check the component and error message.
+ *
+ *  @param[in] p_rdc_handle The RDC handler.
+ *
+ *  @param[in] group_id The GPU group id.
+ *
+ *  @param[inout] response  The detail results of the health.
+ *
+ *  @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                              rdc_health_response_t* response);
+
+/**
+ *  @brief clear the health watch
+ *
+ *  @details For each group, clear the setting.
+ *
+ *  @param[in] p_rdc_handle The RDC handler.
+ *
+ *  @param[in] group_id The GPU group id.
+ *
+ *  @retval ::RDC_ST_OK is returned upon successful call.
+ */
+rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
+
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
@@ -59,6 +59,21 @@ class RdcCacheManager {
  virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
  virtual rdc_status_t rdc_job_remove_all() = 0;

+  virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
+                                      uint32_t gpu_index,
+                                      const rdc_field_value& value) = 0;
+  virtual rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
+                                             uint32_t gpu_index,
+                                             rdc_field_t field_id,
+                                             uint64_t start_timestamp,
+                                             uint64_t end_timestamp,
+                                             rdc_field_value* start_value,
+                                             rdc_field_value* end_value) = 0;
+  virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
+  virtual rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
+                                               uint32_t gpu_index,
+                                               const rdc_field_value& value) = 0;
+
  virtual ~RdcCacheManager() {}
 };

@@ -106,6 +106,12 @@ class RdcHandler {

  virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;

+  // Health API
+  virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) = 0;
+  virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) = 0;
+  virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) = 0;
+  virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
+
  virtual ~RdcHandler() {}
 };

@@ -50,6 +50,14 @@ class RdcWatchTable {
  virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
                                         rdc_field_grp_t field_group_id) = 0;

+  virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
+                                      unsigned int components) = 0;
+  virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id,
+                                      unsigned int* components) = 0;
+  virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id,
+                                        rdc_health_response_t *response) = 0;
+  virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
+
  virtual ~RdcWatchTable() {}
 };

@@ -32,6 +32,8 @@ THE SOFTWARE.
 #include "rdc_lib/RdcCacheManager.h"
 #include "rdc_lib/rdc_common.h"

+#define HEALTH_MAX_KEEP_SAMPLES 300
+
 namespace amd {
 namespace rdc {

@@ -81,6 +83,9 @@ struct RdcJobStatsCacheEntry {
 // <job_id, job_stats>
 typedef std::map<std::string, RdcJobStatsCacheEntry> RdcJobStatsCache;

+// <group_id, health_samples>
+typedef std::map<rdc_gpu_group_t, RdcCacheSamples> RdcHealthStatsCache;
+
 class RdcCacheManagerImpl : public RdcCacheManager {
 public:
  rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
@@ -105,6 +110,21 @@ class RdcCacheManagerImpl : public RdcCacheManager {
  rdc_status_t rdc_job_remove(const char job_id[64]) override;
  rdc_status_t rdc_job_remove_all() override;

+  rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
+                              uint32_t gpu_index,
+                              const rdc_field_value& value) override;
+  rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
+                                     uint32_t gpu_index,
+                                     rdc_field_t field_id,
+                                     uint64_t start_timestamp,
+                                     uint64_t end_timestamp,
+                                     rdc_field_value* start_value,
+                                     rdc_field_value* end_value) override;
+  rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
+  rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
+                                       uint32_t gpu_index,
+                                       const rdc_field_value& value) override;
+
 private:
  void set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu,
                   rdc_stats_summary_t& summary,  // NOLINT
@@ -113,6 +133,7 @@ class RdcCacheManagerImpl : public RdcCacheManager {
                           uint32_t num_gpus);  // NOLINT
  RdcCacheSamples cache_samples_;
  RdcJobStatsCache cache_jobs_;
+  RdcHealthStatsCache cache_health_;
  std::mutex cache_mutex_;
 };

@@ -108,6 +108,12 @@ class RdcEmbeddedHandler final : public RdcHandler {

  rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;

+  // Health API
+  rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
+  rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
+  rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
+  rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
+
  explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
  ~RdcEmbeddedHandler() final;

@@ -105,6 +105,12 @@ class RdcStandaloneHandler : public RdcHandler {

  rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;

+  // Health API
+  rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
+  rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
+  rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
+  rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
+
  explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
                                const char* client_cert, const char* client_key);

@@ -55,6 +55,12 @@ struct JobWatchTableEntry {
  std::vector<RdcFieldKey> fields;  //< store fields for faster query
 };

+struct HealthWatchTableEntry {
+  unsigned int components;  //< presumably the RDC_HEALTH_WATCH_* bitmask set for the group -- TODO confirm
+  rdc_field_grp_t field_group_id;  //< presumably the group made by create_health_field_group() -- TODO confirm
+  std::vector<RdcFieldKey> fields;  //< store fields for faster query
+};
+
 class RdcWatchTableImpl : public RdcWatchTable {
 public:
  rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64],
@@ -74,6 +80,11 @@ class RdcWatchTableImpl : public RdcWatchTable {
  //!< is reached, which will be handled in the clean_up() function.
  rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;

+  rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
+  rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
+  rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
+  rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
+
  //!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will
  //!< call this function periodically. Instead of providing other APIs to
  //!< cleanup the cache, this function will update and cleanup the cache.
@@ -85,7 +96,8 @@ class RdcWatchTableImpl : public RdcWatchTable {
  rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override;

  RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr,
-                    const RdcModuleMgrPtr& module_mgr, const RdcNotificationPtr& notif);
+                    const RdcMetricFetcherPtr& metric_fetcher, const RdcModuleMgrPtr& module_mgr,
+                    const RdcNotificationPtr& notif);

 private:
  //!< Helper function to Update the fields_in_table when unwatch tables
@@ -104,13 +116,39 @@ class RdcWatchTableImpl : public RdcWatchTable {
  bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
                          std::string& job_id) const;  // NOLINT

+  bool is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id,
+                             rdc_gpu_group_t& group_id) const;
+
  rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t* events, uint32_t num_events);
  //!< The function will be pass as the callback for bulk fetch
  static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
                                    void* user_data);

+  rdc_status_t create_health_field_group(unsigned int components,
+                                         rdc_field_grp_t* field_group_id);
+  //!< output: Whether health incidents are full
+  bool add_health_incident(uint32_t gpu_index,
+                           rdc_health_system_t component,
+                           rdc_health_result_t  health,
+                           uint32_t err_code,
+                           std::string err_msg,
+                           rdc_health_incidents_t* incident,
+                           rdc_health_response_t* response);
+  rdc_status_t get_start_end_values(rdc_gpu_group_t group_id,
+                                    uint32_t gpu_index,
+                                    rdc_field_t field,
+                                    rdc_field_value *start_value,
+                                    rdc_field_value *end_value);
+  rdc_status_t pcie_check(rdc_gpu_group_t group_id,
+                          uint32_t gpu_index, rdc_health_response_t* response);
+  rdc_status_t xgmi_check(rdc_gpu_group_t group_id,
+                          uint32_t gpu_index, rdc_health_response_t* response);
+  rdc_status_t memory_check(rdc_gpu_group_t group_id,
+                          uint32_t gpu_index, rdc_health_response_t* response);
+
  RdcGroupSettingsPtr group_settings_;
  RdcCacheManagerPtr cache_mgr_;
+  RdcMetricFetcherPtr metric_fetcher_;
  RdcModuleMgrPtr rdc_module_mgr_;
  RdcNotificationPtr notifications_;

@@ -126,6 +164,9 @@ class RdcWatchTableImpl : public RdcWatchTable {
  //!< Those settings will only be updated when watching or unwatching.
  std::map<RdcFieldKey, FieldSettings> fields_to_watch_;

+  //!< The health watch table to store the health settings.
+  std::map<uint32_t, HealthWatchTableEntry> health_watch_table_;
+
  //!< The last clean up time
  std::atomic<uint64_t> last_cleanup_time_;
  std::mutex watch_mutex_;
@@ -190,6 +190,18 @@ service RdcAPI {
  //  rdc_policy_condition_t condition);
  rpc UnRegisterPolicy(UnRegisterPolicyRequest) returns (UnRegisterPolicyResponse) {}

+  // Health API
+  // rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components);
+  rpc SetHealth(SetHealthRequest) returns (SetHealthResponse) {}
+
+  // rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components);
+  rpc GetHealth(GetHealthRequest) returns (GetHealthResponse) {}
+
+  // rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response);
+  rpc CheckHealth(CheckHealthRequest) returns (CheckHealthResponse) {}
+
+  // rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id);
+  rpc ClearHealth(ClearHealthRequest) returns (ClearHealthResponse) {}
 }

 message Empty {
@@ -620,3 +632,55 @@ message UnRegisterPolicyRequest {
 message UnRegisterPolicyResponse {
  uint32 status = 1;
 }
+
+message SetHealthRequest {  // Request of the SetHealth rpc (rdc_health_set)
+  uint32 group_id = 1;      // GPU group the health watches apply to
+  uint32 components = 2;    // Health components to watch; the C++ side tests this value as a bitmask
+}
+
+message SetHealthResponse {  // Reply of the SetHealth rpc
+  uint32 status = 1;         // Result code; the client feeds it to error_handle() to obtain an rdc_status_t
+}
+message GetHealthRequest {  // Request of the GetHealth rpc (rdc_health_get)
+  uint32 group_id = 1;      // GPU group whose watched components are queried
+}
+
+message GetHealthResponse {  // Reply of the GetHealth rpc
+  uint32 status = 1;         // Result code; the client feeds it to error_handle() to obtain an rdc_status_t
+  uint32 components = 2;     // Components currently watched for the group; copied to *components by the client
+}
+
+message CheckHealthRequest {  // Request of the CheckHealth rpc (rdc_health_check)
+  uint32 group_id = 1;        // GPU group to run the health check on
+}
+
+message HealthDetail {  // Error detail attached to one health incident
+  string msg = 1;       // Human-readable description; presumably error.msg of rdc_health_incidents_t - confirm in the server conversion
+  uint32 code = 2;      // Numeric error code; presumably error.code of rdc_health_incidents_t - confirm in the server conversion
+}
+
+message HealthIncidents {  // One detected health incident
+  uint32 gpu_index = 1;    // GPU the incident was detected on
+  uint32 component = 2;    // Health component that raised it; presumably an rdc_health_system_t value - confirm
+  uint32 health = 3;       // Severity of the incident; presumably an rdc_health_result_t value - confirm
+  HealthDetail error = 4;  // Error code and message of the incident
+}
+
+message HealthResponse {               // Result of a health check; presumably mirrors rdc_health_response_t - confirm
+  uint32 overall_health = 1;           // Overall result over all incidents
+  uint32 incidents_count = 2;          // Number of reported incidents
+  repeated HealthIncidents incidents = 3;  // The reported incidents
+}
+
+message CheckHealthResponse {  // Reply of the CheckHealth rpc
+  uint32 status = 1;           // Result code of the call
+  HealthResponse response = 2;  // Health check result
+}
+
+message ClearHealthRequest {  // Request of the ClearHealth rpc (rdc_health_clear)
+  uint32 group_id = 1;        // GPU group whose health watches are removed
+}
+
+message ClearHealthResponse {  // Reply of the ClearHealth rpc
+  uint32 status = 1;           // Result code of the call
+}
@@ -163,6 +163,14 @@ class rdc_field_t(c_int):
     RDC_EVNT_NOTIF_PRE_RESET = 2002
     RDC_EVNT_NOTIF_POST_RESET = 2003
     RDC_EVNT_NOTIF_RING_HANG = 2004
+     RDC_HEALTH_XGMI_ERROR = 3000
+     RDC_HEALTH_PCIE_REPLAY_COUNT = 3001
+     RDC_HEALTH_RETIRED_PAGE_NUM = 3002
+     RDC_HEALTH_PENDING_PAGE_NUM = 3003
+     RDC_HEALTH_RETIRED_PAGE_LIMIT = 3004
+     RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT = 3005
+     RDC_HEALTH_POWER_THROTTLE_TIME = 3006
+     RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007

 rdc_handle_t = c_void_p
 rdc_gpu_group_t = c_uint32
@@ -401,6 +401,45 @@ rdc_field_t get_field_id_from_name(const char* name) {
  return RDC_FI_INVALID;
 }

+// C API entry point: forwards a health-set request (GPU group plus component
+// mask) to the handler behind p_rdc_handle.
+// Returns RDC_ST_INVALID_HANDLER when the handle is null, otherwise whatever
+// the handler returns.
+rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                            unsigned int components) {
+  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
+  if (handler == nullptr) {
+    return RDC_ST_INVALID_HANDLER;
+  }
+  return handler->rdc_health_set(group_id, components);
+}
+
+// C API entry point: asks the handler behind p_rdc_handle which health
+// components are set for the GPU group; the answer is written to *components
+// by the handler.
+// Returns RDC_ST_INVALID_HANDLER when the handle is null, otherwise whatever
+// the handler returns.
+rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                            unsigned int* components) {
+  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
+  if (handler == nullptr) {
+    return RDC_ST_INVALID_HANDLER;
+  }
+  return handler->rdc_health_get(group_id, components);
+}
+
+// C API entry point: runs a health check of the GPU group through the handler
+// behind p_rdc_handle; the handler fills *response.
+// Returns RDC_ST_INVALID_HANDLER when the handle is null, otherwise whatever
+// the handler returns.
+rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
+                              rdc_health_response_t *response) {
+  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
+  if (handler == nullptr) {
+    return RDC_ST_INVALID_HANDLER;
+  }
+  return handler->rdc_health_check(group_id, response);
+}
+
+// C API entry point: forwards a health-clear request for the GPU group to the
+// handler behind p_rdc_handle.
+// Returns RDC_ST_INVALID_HANDLER when the handle is null, otherwise whatever
+// the handler returns.
+rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
+  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
+  if (handler == nullptr) {
+    return RDC_ST_INVALID_HANDLER;
+  }
+  return handler->rdc_health_clear(group_id);
+}
+
 char* strncpy_with_null(char* dest, const char* src, size_t n) {
  if (n == 0) {
    return dest;
@@ -449,5 +449,142 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(const char job_id[64],
  return RDC_ST_OK;
 }

+// Appends one health sample for (gpu_index, value.field_id) to the cache of
+// the given GPU group. The per-group map and the per-field sample list are
+// created on first use. Always returns RDC_ST_OK.
+rdc_status_t RdcCacheManagerImpl::rdc_health_set(rdc_gpu_group_t group_id,
+                                                 uint32_t gpu_index,
+                                                 const rdc_field_value& value) {
+  std::lock_guard<std::mutex> guard(cache_mutex_);
+
+  // The sample to store.
+  RdcCacheEntry sample;
+  sample.last_time = value.ts;
+  sample.value = value.value;
+  sample.type = value.type;
+
+  const RdcFieldKey key{gpu_index, value.field_id};
+
+  // Locate the samples of this group, creating an empty map if needed.
+  auto group_pos = cache_health_.find(group_id);
+  if (group_pos == cache_health_.end()) {
+    group_pos = cache_health_.insert({group_id, RdcCacheSamples()}).first;
+  }
+  auto& group_samples = group_pos->second;
+
+  // Locate the sample list of this field, creating an empty list if needed.
+  auto field_pos = group_samples.find(key);
+  if (field_pos == group_samples.end()) {
+    field_pos = group_samples.insert({key, std::vector<RdcCacheEntry>()}).first;
+  }
+
+  field_pos->second.push_back(sample);
+  return RDC_ST_OK;
+}
+
+// Looks up cached health samples of one field of one GPU in a GPU group.
+//
+// start_value (optional): receives the first stored sample, in insertion
+//   order, whose timestamp is >= start_timestamp.
+// end_value (optional): receives the last stored sample, in insertion order,
+//   whose timestamp is <= end_timestamp.
+// At least one of the two outputs must be non-null.
+//
+// Returns RDC_ST_BAD_PARAMETER when both outputs are null, RDC_ST_NOT_FOUND
+// when the group/field has no samples or no sample satisfies a requested
+// bound, RDC_ST_OK otherwise. When the start lookup fails, the end lookup is
+// not attempted.
+rdc_status_t RdcCacheManagerImpl::rdc_health_get_values(rdc_gpu_group_t group_id,
+                                                        uint32_t gpu_index,
+                                                        rdc_field_t field_id,
+                                                        uint64_t start_timestamp,
+                                                        uint64_t end_timestamp,
+                                                        rdc_field_value* start_value,
+                                                        rdc_field_value* end_value) {
+  if (!start_value && !end_value)
+    return RDC_ST_BAD_PARAMETER;
+
+  std::lock_guard<std::mutex> guard(cache_mutex_);
+  auto health_ite = cache_health_.find(group_id);
+  if (health_ite == cache_health_.end())
+    return RDC_ST_NOT_FOUND;
+
+  RdcFieldKey field{gpu_index, field_id};
+  auto samples_ite = health_ite->second.find(field);
+  if (samples_ite == health_ite->second.end() ||
+      samples_ite->second.empty())
+    return RDC_ST_NOT_FOUND;
+
+  // The lock is held until return, so the samples can be read in place;
+  // copying the whole vector here is unnecessary work under the mutex.
+  const auto& cache_values = samples_ite->second;
+
+  // Copies one cached sample into a caller-provided field value.
+  const auto export_entry = [field_id](const RdcCacheEntry& entry, rdc_field_value* out) {
+    out->field_id = field_id;
+    out->ts = entry.last_time;
+    out->type = entry.type;
+    if (entry.type == STRING)
+      strncpy_with_null(out->value.str, entry.value.str, RDC_MAX_STR_LENGTH);
+    else
+      out->value.l_int = entry.value.l_int;
+  };
+
+  rdc_status_t result = RDC_ST_OK;
+  if (start_value != nullptr) {
+    // get start value: scan forward for the first sample inside the window
+    result = RDC_ST_NOT_FOUND;
+    for (const auto& entry : cache_values) {
+      if (entry.last_time >= start_timestamp) {
+        export_entry(entry, start_value);
+        result = RDC_ST_OK;
+        break;
+      }
+    }
+  }
+
+  if ((RDC_ST_OK == result) && (end_value != nullptr)) {
+    // get end value: scan backward for the last sample inside the window
+    result = RDC_ST_NOT_FOUND;
+    for (auto entry = cache_values.rbegin(); entry != cache_values.rend(); ++entry) {
+      if (entry->last_time <= end_timestamp) {
+        export_entry(*entry, end_value);
+        result = RDC_ST_OK;
+        break;
+      }
+    }
+  }
+
+  return result;
+}
+
+// Drops every cached health sample stored for the given GPU group. Clearing a
+// group that has no samples is not an error; the result is always RDC_ST_OK.
+rdc_status_t RdcCacheManagerImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
+  {
+    std::lock_guard<std::mutex> guard(cache_mutex_);
+    cache_health_.erase(group_id);
+  }
+  return RDC_ST_OK;
+}
+
+// Appends a new health sample for (gpu_index, value.field_id) to the cache of
+// the given GPU group, first dropping the oldest samples so that at most
+// HEALTH_MAX_KEEP_SAMPLES remain after the insert.
+//
+// Only fields that already have a sample list (created by rdc_health_set) are
+// updated; RDC_ST_NOT_FOUND is returned for an unknown group or field.
+rdc_status_t RdcCacheManagerImpl::rdc_update_health_stats(rdc_gpu_group_t group_id,
+                                                          uint32_t gpu_index,
+                                                          const rdc_field_value& value) {
+  std::lock_guard<std::mutex> guard(cache_mutex_);
+  auto health_ite = cache_health_.find(group_id);
+  if (health_ite == cache_health_.end()) {
+    return RDC_ST_NOT_FOUND;
+  }
+
+  RdcFieldKey field{gpu_index, value.field_id};
+  auto samples_ite = health_ite->second.find(field);
+  if (samples_ite == health_ite->second.end()) {
+    return RDC_ST_NOT_FOUND;
+  }
+
+  // Check HEALTH_MAX_KEEP_SAMPLES. The count is computed in size_t without
+  // relying on unsigned wraparound, and never exceeds the current size.
+  auto& cache_values = samples_ite->second;
+  const size_t max_keep = static_cast<size_t>(HEALTH_MAX_KEEP_SAMPLES);
+  if (cache_values.size() >= max_keep) {
+    // Leave room for the sample that is appended below.
+    const size_t item_remove =
+        (max_keep == 0) ? cache_values.size() : cache_values.size() - max_keep + 1;
+    cache_values.erase(
+        cache_values.begin(),
+        cache_values.begin() +
+            static_cast<std::vector<RdcCacheEntry>::difference_type>(item_remove));
+  }
+
+  RdcCacheEntry entry;
+  entry.last_time = value.ts;
+  entry.value = value.value;
+  entry.type = value.type;
+
+  cache_values.push_back(entry);
+
+  return RDC_ST_OK;
+}
+
 }  // namespace rdc
 }  // namespace amd
@@ -79,7 +79,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
      metric_fetcher_(new RdcMetricFetcherImpl()),
      rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
      rdc_notif_(new RdcNotificationImpl()),
-      watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)),
+      watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_, rdc_module_mgr_, rdc_notif_)),
      metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)),
      policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)) {
  if (mode == RDC_OPERATION_MODE_AUTO) {
@@ -460,5 +460,38 @@ rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id)
  return policy_->rdc_policy_unregister(group_id);
 }

+// Health API
+// Sets the health components to watch for a GPU group. An empty component
+// mask is rejected with RDC_ST_BAD_PARAMETER; everything else is delegated to
+// the watch table.
+rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id,
+                                                unsigned int components) {
+  const bool nothing_to_watch = (components == 0);
+  if (nothing_to_watch) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+  return watch_table_->rdc_health_set(group_id, components);
+}
+
+// Queries the health components set for a GPU group. A null output pointer is
+// rejected with RDC_ST_BAD_PARAMETER; everything else is delegated to the
+// watch table.
+rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id,
+                                                unsigned int *components) {
+  if (nullptr != components) {
+    return watch_table_->rdc_health_get(group_id, components);
+  }
+  return RDC_ST_BAD_PARAMETER;
+}
+
+// Runs a health check for a GPU group. A null response pointer is rejected
+// with RDC_ST_BAD_PARAMETER; everything else is delegated to the watch table.
+rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id,
+                                                  rdc_health_response_t *response) {
+  if (nullptr != response) {
+    return watch_table_->rdc_health_check(group_id, response);
+  }
+  return RDC_ST_BAD_PARAMETER;
+}
+
+// Clears the health settings of a GPU group by delegating to the watch table.
+rdc_status_t RdcEmbeddedHandler::rdc_health_clear(rdc_gpu_group_t group_id) {
+  const rdc_status_t status = watch_table_->rdc_health_clear(group_id);
+  return status;
+}
+
 }  // namespace rdc
 }  // namespace amd
@@ -804,6 +804,66 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
      read_gpu_metrics_uint64_t();
      break;

+    case RDC_HEALTH_XGMI_ERROR: {
+      amdsmi_xgmi_status_t status;
+      ret = amdsmi_gpu_xgmi_error_status(processor_handle, &status);
+      value->status = Smi2RdcError(ret);
+      value->type = INTEGER;
+      if (value->status == AMDSMI_STATUS_SUCCESS) {
+        value->value.l_int = static_cast<int64_t>(status);
+      }
+      break;
+    }
+
+    case RDC_HEALTH_PCIE_REPLAY_COUNT: {
+      amdsmi_pcie_info_t pcie_info;
+      ret = amdsmi_get_pcie_info(processor_handle, &pcie_info);
+      value->status = Smi2RdcError(ret);
+      value->type = INTEGER;
+      if (value->status == AMDSMI_STATUS_SUCCESS) {
+        value->value.l_int = static_cast<int64_t>(pcie_info.pcie_metric.pcie_replay_count);
+      }
+      break;
+    }
+
+    case RDC_HEALTH_RETIRED_PAGE_NUM:
+    case RDC_HEALTH_PENDING_PAGE_NUM: {
+      // The first call passes no buffer and only asks how many bad page
+      // records exist.
+      uint32_t num_pages = 0;
+      ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, nullptr);
+      value->status = Smi2RdcError(ret);
+      if (AMDSMI_STATUS_SUCCESS != ret) {
+        break;
+      }
+      value->type = INTEGER;
+
+      if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) {
+        value->value.l_int = static_cast<int64_t>(num_pages);
+        break;
+      }
+
+      // RDC_HEALTH_PENDING_PAGE_NUM: a GPU without any bad page record has
+      // zero pending pages. Report that explicitly instead of leaving the
+      // status, type and value of the field unwritten.
+      uint64_t pending_page_num = 0;
+      if (0 < num_pages) {
+        std::vector<amdsmi_retired_page_record_t> bad_page_info(num_pages);
+        ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages,
+                                           bad_page_info.data());
+        value->status = Smi2RdcError(ret);
+        if (AMDSMI_STATUS_SUCCESS != ret) {
+          break;
+        }
+        // num_pages is an in/out argument; never read past the buffer.
+        for (uint32_t i = 0; i < num_pages && i < bad_page_info.size(); i++) {
+          if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status)
+            pending_page_num++;
+        }
+      }
+
+      value->value.l_int = static_cast<int64_t>(pending_page_num);
+      break;
+    }
+
+    case RDC_HEALTH_RETIRED_PAGE_LIMIT:
+    case RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT:
+    case RDC_HEALTH_POWER_THROTTLE_TIME: //gpu_metrics 1.6
+    case RDC_HEALTH_THERMAL_THROTTLE_TIME: //gpu_metrics 1.6
    default:
      break;
  }
@@ -180,6 +180,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
      RDC_EVNT_XGMI_1_THRPUT,   RDC_EVNT_XGMI_2_THRPUT,   RDC_EVNT_XGMI_3_THRPUT,
      RDC_EVNT_XGMI_4_THRPUT,   RDC_EVNT_XGMI_5_THRPUT,   RDC_FI_OAM_ID,
      RDC_FI_GPU_MM_ENC_UTIL,   RDC_FI_GPU_MM_DEC_UTIL,   RDC_FI_GPU_MEMORY_ACTIVITY,
+      RDC_HEALTH_XGMI_ERROR,          RDC_HEALTH_PCIE_REPLAY_COUNT,   RDC_HEALTH_RETIRED_PAGE_NUM,
+      RDC_HEALTH_PENDING_PAGE_NUM,    RDC_HEALTH_RETIRED_PAGE_LIMIT,  RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,
+      RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
  };
  std::copy(fields.begin(), fields.end(), field_ids);
  *field_count = fields.size();
@@ -41,10 +41,12 @@ namespace rdc {

 RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
                                     const RdcCacheManagerPtr& cache_mgr,
+                                     const RdcMetricFetcherPtr& metric_fetcher,  // fetcher the health set/check code reads field values through
                                     const RdcModuleMgrPtr& module_mgr,
                                     const RdcNotificationPtr& notif)
    : group_settings_(group_settings),
      cache_mgr_(cache_mgr),
+      metric_fetcher_(metric_fetcher),  // kept so the health code can call fetch_smi_field() directly
      rdc_module_mgr_(module_mgr),
      notifications_(notif),
      last_cleanup_time_(0) {}
@@ -373,6 +375,423 @@ rdc_status_t RdcWatchTableImpl::rdc_field_unwatch(rdc_gpu_group_t group_id,
  return update_field_in_table_when_unwatch(ite->first);
 }

+// Creates the field group that backs a health watch.
+//
+// components is a bitmask of RDC_HEALTH_WATCH_* flags; every selected
+// component contributes the fields it needs. The InfoROM component currently
+// contributes no field. Returns RDC_ST_BAD_PARAMETER when the mask selects no
+// field at all, otherwise the result of rdc_group_field_create(), which also
+// fills *field_group_id.
+rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int components,
+                                                          rdc_field_grp_t* field_group_id) {
+  const auto selected = [components](unsigned int flag) { return (components & flag) != 0; };
+
+  // Collect the field ids; the order below is the order in the field group.
+  std::vector<rdc_field_t> wanted;
+  if (selected(RDC_HEALTH_WATCH_PCIE)) {
+    wanted.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT);
+  }
+  if (selected(RDC_HEALTH_WATCH_XGMI)) {
+    wanted.push_back(RDC_HEALTH_XGMI_ERROR);
+  }
+  if (selected(RDC_HEALTH_WATCH_MEM)) {
+    const rdc_field_t memory_fields[] = {
+        RDC_FI_ECC_UNCORRECT_TOTAL,    RDC_HEALTH_RETIRED_PAGE_NUM,
+        RDC_HEALTH_PENDING_PAGE_NUM,   RDC_HEALTH_RETIRED_PAGE_LIMIT,
+        RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,
+    };
+    for (const rdc_field_t id : memory_fields) {
+      wanted.push_back(id);
+    }
+  }
+  // RDC_HEALTH_WATCH_INFOROM: no field is associated with it yet.
+  if (selected(RDC_HEALTH_WATCH_THERMAL)) {
+    wanted.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME);
+  }
+  if (selected(RDC_HEALTH_WATCH_POWER)) {
+    wanted.push_back(RDC_HEALTH_POWER_THROTTLE_TIME);
+  }
+
+  if (wanted.empty()) {
+    RDC_LOG(RDC_ERROR, "Fail to health set. The components must contain at least one watch.");
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  const std::string group_name("health-field-group");
+  return group_settings_->rdc_group_field_create(wanted.size(), wanted.data(),
+                                                 group_name.c_str(), field_group_id);
+}
+
+// Starts health monitoring of the given components for a GPU group.
+//
+// Any previous health configuration of the group is cleared first. A field
+// group matching the components is created, the initial value of every
+// watched field is stored in the health cache, and the fields are then
+// watched with a one second update interval.
+//
+// Returns the first error of field-group creation, field expansion or the
+// final rdc_field_watch(). On those errors everything registered by this call
+// is released again. A field whose initial value cannot be fetched is skipped
+// (best effort) so that it does not block the remaining fields.
+rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
+                                               unsigned int components) {
+  // remove old health for same group_id; "not found" is expected on first use
+  rdc_health_clear(group_id);
+
+  // create a field group base on the components
+  rdc_field_grp_t field_group_id;
+  rdc_status_t result = create_health_field_group(components, &field_group_id);
+  if (result != RDC_ST_OK) {
+    return result;
+  }
+
+  // get field key
+  std::vector<RdcFieldKey> fields_in_watch;
+  result = get_fields_from_group(group_id, field_group_id, fields_in_watch);
+  if (result != RDC_ST_OK) {
+    // do not leak the field group created above
+    group_settings_->rdc_group_field_destroy(field_group_id);
+    return result;
+  }
+
+  // add to the health watch table
+  {
+    std::lock_guard<std::mutex> guard(watch_mutex_);
+    HealthWatchTableEntry hentry{components, field_group_id, fields_in_watch};
+    auto inserted = health_watch_table_.insert({group_id, hentry});
+    if (!inserted.second) {
+      // A stale entry survived the clear above; insert() alone would keep it.
+      inserted.first->second = hentry;
+    }
+  }
+
+  for (const auto& key : fields_in_watch) {
+    // get initial values
+    rdc_field_value value;
+    result = metric_fetcher_->fetch_smi_field(key.first, key.second, &value);
+    if (result != RDC_ST_OK) {
+      RDC_LOG(RDC_DEBUG, "Skip initial health value of gpu: " << key.first
+                             << " field: " << key.second << ". Return: " << result);
+      continue;
+    }
+
+    // set initial values to cache
+    result = cache_mgr_->rdc_health_set(group_id, key.first, value);
+    if (result != RDC_ST_OK) {
+      RDC_LOG(RDC_DEBUG, "Fail to cache initial health value of gpu: " << key.first
+                             << " field: " << key.second << ". Return: " << result);
+    }
+  }
+
+  // Start to watch the fields and update fields per 1 second.
+  result = rdc_field_watch(group_id, field_group_id, 1000000, 0, 0);
+  if (result != RDC_ST_OK) {
+    // Roll back: without an active watch rdc_health_clear() could never
+    // remove this entry because its unwatch step would fail first.
+    {
+      std::lock_guard<std::mutex> guard(watch_mutex_);
+      health_watch_table_.erase(group_id);
+    }
+    cache_mgr_->rdc_health_clear(group_id);
+    group_settings_->rdc_group_field_destroy(field_group_id);
+  }
+  return result;
+}
+
+// Reports which health components are set for a GPU group.
+// *components receives the stored component mask, or 0 when the group has no
+// entry in the health watch table. Returns RDC_ST_BAD_PARAMETER for a null
+// output pointer, RDC_ST_OK otherwise.
+rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id,
+                                               unsigned int *components) {
+  if (components == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  std::lock_guard<std::mutex> guard(watch_mutex_);
+
+  // A group that was never set simply watches nothing.
+  *components = 0;
+  const auto entry = health_watch_table_.find(group_id);
+  if (entry != health_watch_table_.end()) {
+    *components = entry->second.components;
+  }
+
+  return RDC_ST_OK;
+}
+
+// Fills *incident with the given details and accounts for it in *response:
+// the overall health is raised to the incident's health when that is higher,
+// and the incident counter is incremented.
+// Returns true when the counter has reached HEALTH_MAX_ERROR_ITEMS, i.e. no
+// further incident can be added.
+bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
+                                            rdc_health_system_t component,
+                                            rdc_health_result_t  health,
+                                            uint32_t err_code,
+                                            std::string err_msg,
+                                            rdc_health_incidents_t* incident,
+                                            rdc_health_response_t* response) {
+  // Describe the incident.
+  incident->gpu_index = gpu_index;
+  incident->component = component;
+  incident->health = health;
+  incident->error.code = err_code;
+  strncpy_with_null(incident->error.msg, err_msg.c_str(), MAX_HEALTH_MSG_LENGTH);
+
+  // Fold it into the response.
+  if (response->overall_health < incident->health) {
+    response->overall_health = incident->health;
+  }
+  response->incidents_count++;
+
+  const bool is_full = (response->incidents_count >= HEALTH_MAX_ERROR_ITEMS);
+  if (is_full) {
+    RDC_LOG(RDC_INFO, "Health incidents are full!");
+  }
+  return is_full;
+}
+
+// Retrieves the two values a health check compares for one field of one GPU:
+//   *start_value - the first cached health sample not older than 60 seconds,
+//   *end_value   - the value fetched from the metric fetcher right now.
+// Both outputs are mandatory (RDC_ST_BAD_PARAMETER otherwise). A failure of
+// the cache lookup or of the fetch is logged and returned.
+rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
+                                                     uint32_t gpu_index,
+                                                     rdc_field_t field,
+                                                     rdc_field_value *start_value,
+                                                     rdc_field_value *end_value) {
+  if (start_value == nullptr || end_value == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  // Beginning of the one-minute history window, in milliseconds.
+  const uint64_t window_start = static_cast<uint64_t>(time(nullptr) - 60) * 1000;
+
+  // Only the start value comes from the cache; no end value is requested.
+  const rdc_status_t history_status = cache_mgr_->rdc_health_get_values(
+      group_id, gpu_index, field, window_start, 0, start_value, nullptr);
+  if (history_status != RDC_ST_OK) {
+    RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << history_status);
+    return history_status;
+  }
+
+  // The end value is the current reading.
+  const rdc_status_t current_status = metric_fetcher_->fetch_smi_field(gpu_index, field, end_value);
+  if (current_status == RDC_ST_OK) {
+    return RDC_ST_OK;
+  }
+
+  RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " current data. Return: " << current_status);
+  return current_status;
+}
+
+// PCIe health check of one GPU: compares the PCIe replay counter at the start
+// of the last minute with its current value and records a WARN incident when
+// the increase exceeds PCIE_MAX_REPLAYS_PERMIN.
+//
+// Returns the error of get_start_end_values() when the values cannot be
+// obtained, RDC_ST_MAX_LIMIT when the added incident filled the response, and
+// RDC_ST_OK otherwise.
+rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
+                                           uint32_t gpu_index,
+                                           rdc_health_response_t* response) {
+  //get field start/end values
+  rdc_field_value start = {}, end = {};
+  rdc_status_t result = get_start_end_values(group_id,
+                                             gpu_index,
+                                             RDC_HEALTH_PCIE_REPLAY_COUNT,
+                                             &start,
+                                             &end);
+  if (result != RDC_ST_OK)
+    return result;
+
+  // Compute the increase as a signed value first. If the counter went
+  // backwards (end < start), an unsigned subtraction would wrap to a huge
+  // number and raise a bogus incident; treat it as "no new replays".
+  const int64_t replay_delta = end.value.l_int - start.value.l_int;
+  const uint64_t pcie_replay_count =
+      (replay_delta > 0) ? static_cast<uint64_t>(replay_delta) : 0;
+  if (pcie_replay_count > PCIE_MAX_REPLAYS_PERMIN) {
+    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+    std::string err_msg = "Detected ";
+    err_msg += std::to_string(pcie_replay_count);
+    err_msg += " PCIe replays per minute exceeding the max limit ";
+    err_msg += std::to_string(PCIE_MAX_REPLAYS_PERMIN);
+    err_msg += ".";
+
+    //add incident
+    if (add_health_incident(gpu_index,
+                            RDC_HEALTH_WATCH_PCIE,
+                            RDC_HEALTH_RESULT_WARN,
+                            RDC_FR_PCI_REPLAY_RATE,
+                            err_msg,
+                            incident,
+                            response))
+      return RDC_ST_MAX_LIMIT;
+  }
+
+  return RDC_ST_OK;
+}
+
+rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
+                                           uint32_t gpu_index,
+                                           rdc_health_response_t* response) {
+  //get field start/end values
+  rdc_field_value start = {}, end = {};
+  rdc_status_t result = get_start_end_values(group_id,
+                                             gpu_index,
+                                             RDC_HEALTH_XGMI_ERROR,
+                                             &start,
+                                             &end);
+  if (result != RDC_ST_OK)
+    return result;
+
+  amdsmi_xgmi_status_t status = static_cast<amdsmi_xgmi_status_t>(end.value.l_int);
+  if (AMDSMI_XGMI_STATUS_NO_ERRORS != status) {
+    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+    uint32_t err_code;
+    std::string err_msg = "Detected ";
+    if (AMDSMI_XGMI_STATUS_ERROR == status) {
+      err_msg += " a single XGMI error";
+      err_code = RDC_FR_XGMI_SINGLE_ERROR;
+    } else {
+      err_msg += " multiple XGMI errors";
+      err_code = RDC_FR_XGMI_MULTIPLE_ERROR;
+    }
+    err_msg += ".";
+
+    //add incident
+    if (add_health_incident(gpu_index,
+                            RDC_HEALTH_WATCH_XGMI,
+                            RDC_HEALTH_RESULT_FAIL,
+                            err_code,
+                            err_msg,
+                            incident,
+                            response))
+      return RDC_ST_MAX_LIMIT;
+  }
+
+  return RDC_ST_OK;
+}
+
+// Memory health check of one GPU. Two conditions are evaluated over the last
+// minute, each comparing the cached start value with the current value:
+//   - an increase of RDC_FI_ECC_UNCORRECT_TOTAL records a FAIL incident,
+//   - an increase of RDC_HEALTH_PENDING_PAGE_NUM records a WARN incident.
+//
+// Returns the error of get_start_end_values() when values cannot be obtained
+// (the second condition is then not evaluated if the first lookup failed),
+// RDC_ST_MAX_LIMIT when an added incident filled the response, and RDC_ST_OK
+// otherwise.
+rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
+                                             uint32_t gpu_index,
+                                             rdc_health_response_t* response) {
+  //get field start/end values
+  rdc_field_value start = {}, end = {};
+  rdc_status_t result = get_start_end_values(group_id,
+                                             gpu_index,
+                                             RDC_FI_ECC_UNCORRECT_TOTAL,
+                                             &start,
+                                             &end);
+  if (result != RDC_ST_OK)
+    return result;
+
+  // Signed delta first: a value that decreased (end < start) must not wrap
+  // into a huge unsigned count and trigger a bogus incident.
+  const int64_t ecc_delta = end.value.l_int - start.value.l_int;
+  const uint64_t ecc_uncorrectable_count =
+      (ecc_delta > 0) ? static_cast<uint64_t>(ecc_delta) : 0;
+  if (ecc_uncorrectable_count > 0) {
+    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+    std::string err_msg = "Detected ";
+    err_msg += std::to_string(ecc_uncorrectable_count);
+    err_msg += " uncorrectable ECC error(s) in the last minute.";
+
+    //add incident
+    if (add_health_incident(gpu_index,
+                            RDC_HEALTH_WATCH_MEM,
+                            RDC_HEALTH_RESULT_FAIL,
+                            RDC_FR_ECC_UNCORRECTABLE_DETECTED,
+                            err_msg,
+                            incident,
+                            response))
+      return RDC_ST_MAX_LIMIT;
+  }
+
+  result = get_start_end_values(group_id,
+                                gpu_index,
+                                RDC_HEALTH_PENDING_PAGE_NUM,
+                                &start,
+                                &end);
+  if (result != RDC_ST_OK)
+    return result;
+
+  // Same protection as above: the number of pending pages can shrink.
+  const int64_t pending_delta = end.value.l_int - start.value.l_int;
+  const uint64_t num_pages =
+      (pending_delta > 0) ? static_cast<uint64_t>(pending_delta) : 0;
+  if (num_pages > 0) {
+    rdc_health_incidents_t *incident = &response->incidents[response->incidents_count];
+
+    std::string err_msg = "Detected ";
+    err_msg += std::to_string(num_pages);
+    err_msg += " pending retired page(s).";
+
+    //add incident
+    if (add_health_incident(gpu_index,
+                            RDC_HEALTH_WATCH_MEM,
+                            RDC_HEALTH_RESULT_WARN,
+                            RDC_FR_PENDING_PAGE_RETIREMENTS,
+                            err_msg,
+                            incident,
+                            response))
+      return RDC_ST_MAX_LIMIT;
+  }
+
+  //To do: RDC_FR_RETIRED_PAGES_LIMIT
+  //To do: RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT
+
+  return RDC_ST_OK;
+}
+
+// Runs the health checks configured for a GPU group and fills *response.
+//
+// The response is reset to "pass, no incidents" before anything else, so it
+// is well defined even when the function returns an error. The current value
+// of every watched field is then pushed into the health cache (best effort
+// per field), and the PCIe, XGMI and memory checks are run for every GPU of
+// the group.
+//
+// Returns RDC_ST_BAD_PARAMETER for a null response, RDC_ST_NOT_FOUND when the
+// group has no health configuration, the error of rdc_group_gpu_get_info(),
+// RDC_ST_MAX_LIMIT as soon as the incident list is full, and
+// RDC_ST_NOT_SUPPORTED when InfoROM, thermal or power checks are requested
+// (not implemented yet). Other errors of an individual check are ignored.
+rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
+                                                 rdc_health_response_t *response) {
+  if (nullptr == response)
+    return RDC_ST_BAD_PARAMETER;
+
+  //init response up front so that error returns never expose stale data
+  response->overall_health = RDC_HEALTH_RESULT_PASS;
+  response->incidents_count = 0;
+
+  unsigned int components = 0;
+  std::vector<RdcFieldKey> fields_in_watch;
+  {  //< lock guard for thread safe
+    std::lock_guard<std::mutex> guard(watch_mutex_);
+    auto health = health_watch_table_.find(group_id);
+    if (health == health_watch_table_.end())
+      return RDC_ST_NOT_FOUND;
+    components = health->second.components;
+    fields_in_watch = health->second.fields;
+  }
+
+  rdc_group_info_t ginfo;
+  rdc_status_t result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
+  if (result != RDC_ST_OK)
+    return result;
+
+  for (const auto& key : fields_in_watch) {
+    // get current values; a field that cannot be read must not prevent the
+    // remaining fields from being refreshed
+    rdc_field_value value;
+    result = metric_fetcher_->fetch_smi_field(key.first, key.second, &value);
+    if (result != RDC_ST_OK) {
+      RDC_LOG(RDC_DEBUG, "Skip health value of gpu: " << key.first
+                             << " field: " << key.second << ". Return: " << result);
+      continue;
+    }
+
+    // set current values to cache
+    result = cache_mgr_->rdc_update_health_stats(group_id, key.first, value);
+    if (result != RDC_ST_OK) {
+      RDC_LOG(RDC_DEBUG, "Fail to cache health value of gpu: " << key.first
+                             << " field: " << key.second << ". Return: " << result);
+    }
+  }
+
+  for (uint32_t gindex = 0; gindex < ginfo.count; gindex++) {
+    //PCIe
+    if (components & RDC_HEALTH_WATCH_PCIE) {
+      result = pcie_check(group_id, ginfo.entity_ids[gindex], response);
+      if (result == RDC_ST_MAX_LIMIT)
+        return result;
+    }
+
+    //XGMI
+    if (components & RDC_HEALTH_WATCH_XGMI) {
+      result = xgmi_check(group_id, ginfo.entity_ids[gindex], response);
+      if (result == RDC_ST_MAX_LIMIT)
+        return result;
+    }
+
+    //Memory
+    if (components & RDC_HEALTH_WATCH_MEM) {
+      result = memory_check(group_id, ginfo.entity_ids[gindex], response);
+      if (result == RDC_ST_MAX_LIMIT)
+        return result;
+    }
+
+    //InfoROM
+    if (components & RDC_HEALTH_WATCH_INFOROM) {
+      //To do:
+      return RDC_ST_NOT_SUPPORTED;
+    }
+
+    //Thermal
+    if (components & RDC_HEALTH_WATCH_THERMAL) {
+      //To do:
+      return RDC_ST_NOT_SUPPORTED;
+    }
+
+    //Power
+    if (components & RDC_HEALTH_WATCH_POWER) {
+      //To do:
+      return RDC_ST_NOT_SUPPORTED;
+    }
+  } //end of for gindex
+
+  return RDC_ST_OK;
+}
+
+// Stops health monitoring of a GPU group: unwatches the health fields,
+// destroys the health field group, removes the group from the health watch
+// table and clears its cached health samples.
+//
+// Returns RDC_ST_NOT_FOUND when the group has no health configuration, the
+// error of rdc_field_unwatch() when unwatching fails (nothing is removed in
+// that case), and otherwise the status of clearing the health cache.
+rdc_status_t RdcWatchTableImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
+  rdc_field_grp_t field_group_id;
+
+  { //< lock guard for thread safe
+    std::lock_guard<std::mutex> guard(watch_mutex_);
+    auto health = health_watch_table_.find(group_id);
+    if (health == health_watch_table_.end()) {
+      return RDC_ST_NOT_FOUND;
+    }
+    field_group_id = health->second.field_group_id;
+  }
+
+  // at first, unwatch the old fields.
+  rdc_status_t result = rdc_field_unwatch(group_id, field_group_id);
+  if (result != RDC_ST_OK) {
+    return result;
+  }
+
+  // destroy the old field group (best effort, its status is not reported)
+  group_settings_->rdc_group_field_destroy(field_group_id);
+
+  {  //< lock guard for thread safe
+    std::lock_guard<std::mutex> guard(watch_mutex_);
+    health_watch_table_.erase(group_id);
+  }
+
+  // Report the outcome of the cache cleanup instead of discarding it.
+  return cache_mgr_->rdc_health_clear(group_id);
+}
+
 bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
                                           std::string& job_id) const {
  RdcFieldKey key{gpu_index, field_id};
@@ -388,6 +807,21 @@ bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field
  return false;
 }

+// Tells whether (gpu_index, field_id) is one of the fields of any entry in
+// the health watch table. On a match, group_id receives the key of the first
+// matching entry and true is returned; otherwise group_id is left untouched
+// and false is returned.
+bool RdcWatchTableImpl::is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id,
+                                              rdc_gpu_group_t& group_id) const {
+  const RdcFieldKey wanted{gpu_index, field_id};
+
+  for (const auto& table_entry : health_watch_table_) {
+    const auto& watched = table_entry.second.fields;
+    if (std::find(watched.begin(), watched.end(), wanted) == watched.end()) {
+      continue;
+    }
+    group_id = table_entry.first;
+    return true;
+  }
+
+  return false;
+}
+
 rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
                                              void* user_data) {
  if (values == nullptr || user_data == nullptr) {
@@ -421,6 +855,12 @@ rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uin
    if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) {
      watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, job_id, values[i].field_value);
    }
+
+    // Update the health stats cache
+    rdc_gpu_group_t group_id;
+    if (watchTable->is_health_watch_field(gpu_index, field_id, group_id)) {
+      watchTable->cache_mgr_->rdc_update_health_stats(group_id, gpu_index, values[i].field_value);
+    }
  }
  return RDC_ST_OK;
 }
@@ -492,6 +932,12 @@ rdc_status_t RdcWatchTableImpl::rdc_notif_update_cache(rdc_evnt_notification_t*
    if (is_job_watch_field(gpu_index, field_id, job_id)) {
      cache_mgr_->rdc_update_job_stats(gpu_index, job_id, events[i].field);
    }
+
+    // Update the health stats cache
+    rdc_gpu_group_t group_id;
+    if (is_health_watch_field(gpu_index, field_id, group_id)) {
+      cache_mgr_->rdc_update_health_stats(group_id, gpu_index, events[i].field);
+    }
  }
  return RDC_ST_OK;
 }
@@ -549,6 +995,7 @@ void RdcWatchTableImpl::debug_status() {
  RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size()
                                         << " watch_table_:" << watch_table_.size()
                                         << " job_watch_table_:" << job_watch_table_.size()
+                                         << " health_watch_table_:" << health_watch_table_.size()
                                         << " cache stats:" << cache_mgr_->get_cache_stats());

  if (watch_table_.size() > 0) {
@@ -575,6 +1022,18 @@ void RdcWatchTableImpl::debug_status() {
            jite->first << ": " << jite->second.group_id << " fields : " << strstream.str());
  }

+  if (health_watch_table_.size() > 0) {
+    RDC_LOG(RDC_DEBUG, "health watch table details: ");
+  }
+  for (auto hite = health_watch_table_.begin(); hite != health_watch_table_.end(); hite++) {
+    std::stringstream strstream;
+    for (const auto& p : hite->second.fields) {
+      strstream << "<" << p.first << "," << p.second << "> ";
+    }
+    RDC_LOG(RDC_DEBUG,
+            "group id : " << hite->first << " components : " << hite->second.components << " fields : " << strstream.str());
+  }
+
  if (fields_to_watch_.size() > 0) {
    RDC_LOG(RDC_DEBUG, "fields to watch details:");
  }
@@ -870,5 +870,87 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_unregister(rdc_gpu_group_t group_i
  return error_handle(status, reply.status());
 }

+// Health RdcAPI
+// Forward a "set health watches" request to the server through the SetHealth
+// RPC.
+//
+// group_id   - stored in the request's group_id field.
+// components - stored unchanged in the request's components field.
+//
+// Returns the value error_handle() computes from the gRPC status and the
+// status carried in the reply.
+rdc_status_t RdcStandaloneHandler::rdc_health_set(rdc_gpu_group_t group_id,
+                                                  unsigned int components) {
+  ::rdc::SetHealthRequest req;
+  ::rdc::SetHealthResponse resp;
+  ::grpc::ClientContext ctx;
+
+  req.set_group_id(group_id);
+  req.set_components(components);
+
+  ::grpc::Status rpc_status = stub_->SetHealth(&ctx, req, &resp);
+  return error_handle(rpc_status, resp.status());
+}
+
+// Ask the server, through the GetHealth RPC, which components are configured
+// for a group.
+//
+// group_id   - stored in the request's group_id field.
+// components - out parameter; written only when the call succeeds.
+//
+// Returns RDC_ST_BAD_PARAMETER when components is null, otherwise the value
+// error_handle() computes from the gRPC status and the reply status.
+rdc_status_t RdcStandaloneHandler::rdc_health_get(rdc_gpu_group_t group_id,
+                                                  unsigned int* components) {
+  if (components == nullptr) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  ::rdc::GetHealthRequest req;
+  ::rdc::GetHealthResponse resp;
+  ::grpc::ClientContext ctx;
+
+  req.set_group_id(group_id);
+  ::grpc::Status rpc_status = stub_->GetHealth(&ctx, req, &resp);
+
+  const rdc_status_t outcome = error_handle(rpc_status, resp.status());
+  if (outcome == RDC_ST_OK) {
+    *components = resp.components();
+  }
+  return outcome;
+}
+
+// Run a health check for a group through the CheckHealth RPC and copy the
+// reply into the caller's C structure.
+//
+// group_id - stored in the request's group_id field.
+// response - out parameter; filled only when the call succeeds.
+//
+// Returns RDC_ST_BAD_PARAMETER when response is null, the value produced by
+// error_handle() when the RPC or the server reports a failure, and RDC_ST_OK
+// otherwise.
+rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id,
+                                                    rdc_health_response_t *response) {
+  if (!response) {
+    return RDC_ST_BAD_PARAMETER;
+  }
+
+  ::rdc::CheckHealthRequest request;
+  ::rdc::CheckHealthResponse reply;
+  ::grpc::ClientContext context;
+
+  request.set_group_id(group_id);
+  ::grpc::Status status = stub_->CheckHealth(&context, request, &reply);
+  rdc_status_t err_status = error_handle(status, reply.status());
+  if (err_status != RDC_ST_OK) return err_status;
+
+  // Bind by reference: the nested message does not need to be copied.
+  const auto& res = reply.response();
+  response->overall_health = static_cast<rdc_health_result_t>(res.overall_health());
+
+  // The reply comes from the network, so never trust its size: copy at most as
+  // many incidents as the destination can hold.
+  // NOTE(review): assumes rdc_health_response_t::incidents is a fixed-size
+  // array member (not a pointer) - confirm against rdc.h.
+  const int capacity =
+      static_cast<int>(sizeof(response->incidents) / sizeof(response->incidents[0]));
+  const int copy_count = (res.incidents_size() < capacity) ? res.incidents_size() : capacity;
+
+  // Consumers index incidents[] up to incidents_count, so the reported count
+  // must not exceed the number of entries actually copied below.
+  const unsigned int reported = res.incidents_count();
+  response->incidents_count =
+      (reported < static_cast<unsigned int>(copy_count)) ? reported
+                                                         : static_cast<unsigned int>(copy_count);
+
+  for (int i = 0; i < copy_count; i++) {
+    const ::rdc::HealthIncidents& result = res.incidents(i);
+    rdc_health_incidents_t& to_result = response->incidents[i];
+
+    to_result.gpu_index = result.gpu_index();
+    to_result.component = static_cast<rdc_health_system_t>(result.component());
+    to_result.health = static_cast<rdc_health_result_t>(result.health());
+
+    // set error
+    to_result.error.code = result.error().code();
+    strncpy_with_null(to_result.error.msg, result.error().msg().c_str(), MAX_HEALTH_MSG_LENGTH);
+  }
+
+  return RDC_ST_OK;
+}
+
+// Forward a "clear health watches" request for a group through the
+// ClearHealth RPC.
+//
+// Returns the value error_handle() computes from the gRPC status and the
+// reply status (RDC_ST_OK on success).
+rdc_status_t RdcStandaloneHandler::rdc_health_clear(rdc_gpu_group_t group_id) {
+  ::rdc::ClearHealthRequest req;
+  ::rdc::ClearHealthResponse resp;
+  ::grpc::ClientContext ctx;
+
+  req.set_group_id(group_id);
+  ::grpc::Status rpc_status = stub_->ClearHealth(&ctx, req, &resp);
+
+  const rdc_status_t outcome = error_handle(rpc_status, resp.status());
+  if (outcome == RDC_ST_OK) {
+    return RDC_ST_OK;
+  }
+  return outcome;
+}
+
 }  // namespace rdc
 }  // namespace amd
@@ -68,6 +68,7 @@ set(RDCI_SRC_LIST
    "${SRC_DIR}/RdciGroupSubSystem.cc"
    "${SRC_DIR}/RdciStatsSubSystem.cc"
    "${SRC_DIR}/RdciPolicySubSystem.cc"
+    "${SRC_DIR}/RdciHealthSubSystem.cc"
    "${SRC_DIR}/RdciSubSystem.cc"
    "${SRC_DIR}/rdci.cc")
 message("RDCI_SRC_LIST=${RDCI_SRC_LIST}")
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
+#define RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
+#include <signal.h>
+
+#include <string>
+
+#include "RdciSubSystem.h"
+
+namespace amd {
+namespace rdc {
+
+// rdci subsystem behind the "health" sub-command: parses the health command
+// line and dispatches to one of the set / fetch / check / clear operations.
+// NOTE(review): std::vector is used in a declaration below but <vector> is not
+// included in this header; presumably it arrives via RdciSubSystem.h - confirm.
+class RdciHealthSubSystem : public RdciSubSystem {
+ public:
+  RdciHealthSubSystem();
+  ~RdciHealthSubSystem();
+  // Parse the command line and record the requested operation and arguments.
+  void parse_cmd_opts(int argc, char** argv) override;
+  // Execute the operation recorded by parse_cmd_opts().
+  void process() override;
+
+ private:
+  // Print the usage text.
+  void show_help() const;
+
+  // One handler per value of OPERATIONS (fetch / set / check / clear).
+  void get_watches() const;
+  void set_watches() const;
+  void health_check() const;
+  void health_clear() const;
+
+  // Display text for a health result.
+  std::string health_string(rdc_health_result_t health) const;
+  // Display label for a health component.
+  std::string component_string(rdc_health_system_t component) const;
+  // Print one message word-wrapped into the report table.
+  void output_errstr(const std::string& input) const;
+  // Collect the incidents starting at start_index that share gpu_index and
+  // component; returns how many incidents were consumed.
+  unsigned int handle_one_component(rdc_health_response_t &response,
+                                    unsigned int start_index,
+                                    uint32_t gpu_index,
+                                    rdc_health_system_t component,
+                                    rdc_health_result_t &component_health,
+                                    std::vector<std::string> &err_str) const;
+  // Print the report section for one GPU starting at start_index; returns how
+  // many incidents were consumed.
+  unsigned int handle_one_gpu(rdc_health_response_t &response,
+                              unsigned int start_index,
+                              uint32_t gpu_index) const;
+
+  // Operation selected on the command line.
+  enum OPERATIONS {
+    HEALTH_UNKNOWN = 0,
+    HEALTH_HELP,
+    HEALTH_FETCH,
+    HEALTH_SET,
+    HEALTH_CHECK,
+    HEALTH_CLEAR,
+  } health_ops_;
+
+  rdc_gpu_group_t      group_id_;    // set from -g/--group
+  unsigned int         components_;  // component mask built from -s/--set
+};
+
+}  // namespace rdc
+}  // namespace amd
+
+#endif  // RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
@@ -0,0 +1,557 @@
+/*
+Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "RdciHealthSubSystem.h"
+
+#include <getopt.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <ctime>
+#include <iomanip>
+#include <limits>
+
+#include "common/rdc_utils.h"
+#include "rdc/rdc.h"
+#include "rdc_lib/RdcException.h"
+#include "rdc_lib/rdc_common.h"
+
+namespace amd {
+namespace rdc {
+
+// Start from a well-defined state. process() switches on health_ops_, which
+// parse_cmd_opts() only assigns when an operation option is given, so it must
+// not be left uninitialized.
+RdciHealthSubSystem::RdciHealthSubSystem()
+    : health_ops_(HEALTH_UNKNOWN), group_id_(0), components_(0) {}
+
+// Nothing to release explicitly.
+RdciHealthSubSystem::~RdciHealthSubSystem() = default;
+
+namespace {
+
+// Decode the argument of -s/--set into a mask of RDC_HEALTH_WATCH_* bits.
+// Throws RdcException for the not yet supported letters ('i', 't') and for
+// any unknown letter.
+unsigned int health_mask_from_flags(const std::string& letters) {
+  unsigned int mask = 0;
+
+  for (const char letter : letters) {
+    if (letter == 'a') {
+      mask |= RDC_HEALTH_WATCH_PCIE;
+      mask |= RDC_HEALTH_WATCH_XGMI;
+      mask |= RDC_HEALTH_WATCH_MEM;
+      // To do: also enable RDC_HEALTH_WATCH_INFOROM, RDC_HEALTH_WATCH_THERMAL
+      // and RDC_HEALTH_WATCH_POWER.
+    } else if (letter == 'p') {
+      mask |= RDC_HEALTH_WATCH_PCIE;
+    } else if (letter == 'm') {
+      mask |= RDC_HEALTH_WATCH_MEM;
+    } else if (letter == 'x') {
+      mask |= RDC_HEALTH_WATCH_XGMI;
+    } else if (letter == 'i' || letter == 't') {
+      // To do: 'i' -> RDC_HEALTH_WATCH_INFOROM,
+      //        't' -> RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER.
+      throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
+    } else {
+      throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid flags");
+    }
+  }
+
+  return mask;
+}
+
+}  // namespace
+
+// Parse the "rdci health" command line.
+//
+// Records the requested operation in health_ops_, the group in group_id_ and,
+// for -s/--set, the component mask in components_. -h/--help returns
+// immediately without requiring a group id. Throws RdcException on unknown
+// options, a non-numeric group id, invalid/unsupported/empty flags, or a
+// missing group id.
+void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
+  const int kHostOpt = 1000;
+  const int kJsonOpt = 1001;
+  const int kClearOpt = 1002;
+  const struct option long_options[] = {{"host", required_argument, nullptr, kHostOpt},
+                                        {"unauth", optional_argument, nullptr, 'u'},
+                                        {"help", optional_argument, nullptr, 'h'},
+                                        {"json", optional_argument, nullptr, kJsonOpt},
+                                        {"clear", optional_argument, nullptr, kClearOpt},
+                                        {"group", required_argument, nullptr, 'g'},
+                                        {"fetch", optional_argument, nullptr, 'f'},
+                                        {"set", required_argument, nullptr, 's'},
+                                        {"check", optional_argument, nullptr, 'c'},
+                                        {nullptr, 0, nullptr, 0}};
+
+  bool have_group = false;
+  int long_index = 0;
+  // Accumulated over every -s occurrence on the command line.
+  unsigned int requested = 0;
+
+  while (true) {
+    const int opt = getopt_long(argc, argv, "uhg:fs:c", long_options, &long_index);
+    if (opt == -1) {
+      break;
+    }
+
+    if (opt == kHostOpt) {
+      ip_port_ = optarg;
+    } else if (opt == kJsonOpt) {
+      set_json_output(true);
+    } else if (opt == kClearOpt) {
+      health_ops_ = HEALTH_CLEAR;
+    } else if (opt == 'u') {
+      use_auth_ = false;
+    } else if (opt == 'h') {
+      // Help short-circuits the rest of the parsing.
+      health_ops_ = HEALTH_HELP;
+      return;
+    } else if (opt == 'g') {
+      if (!IsNumber(optarg)) {
+        show_help();
+        throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number");
+      }
+      group_id_ = std::stoi(optarg);
+      have_group = true;
+    } else if (opt == 'f') {
+      health_ops_ = HEALTH_FETCH;
+    } else if (opt == 's') {
+      health_ops_ = HEALTH_SET;
+      requested |= health_mask_from_flags(optarg);
+      if (requested == 0) {
+        throw RdcException(RDC_ST_BAD_PARAMETER, "No flags");
+      }
+      components_ = requested;
+    } else if (opt == 'c') {
+      health_ops_ = HEALTH_CHECK;
+    } else {
+      show_help();
+      throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
+    }
+  }
+
+  if (!have_group) {
+    show_help();
+    throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id");
+  }
+}
+
+// Print the usage text of the health subsystem to stdout.
+// Prints nothing when JSON output is selected.
+void RdciHealthSubSystem::show_help() const {
+  if (is_json_output()) return;
+  std::cout << " health -- Used to manage the health watches of a group. \n"
+            << " The health of the GPUs in a group can then be monitored"
+            << " during a process.\n\n";
+  std::cout << "Usage\n";
+  std::cout << "    rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -s <flags>\n";
+  std::cout << "    rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -c\n";
+  std::cout << "    rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -f\n";
+  // Fixed typo: this line used to read "<IP/FQDN>:prot".
+  std::cout << "    rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> --clear\n";
+  std::cout << "\nFlags:\n";
+  show_common_usage();
+  std::cout << "  --json                         Output using json.\n";
+  std::cout << "  --clear                        Disable all watches being monitored.\n";
+  std::cout << "  -g  --group    groupId         The GPU group to query "
+            << "on the specified host.\n";
+  std::cout << "  -f  --fetch                    Fetch the current watch status.\n";
+  std::cout << "  -s  --set      flags           The list of components can be watched. "
+            << "[default = pm]\n";
+  std::cout << "                                  a - watch all components\n";
+  std::cout << "                                  p - watch PCIe\n";
+  std::cout << "                                  m - watch Memory\n";
+  // To do: document "i - watch infoROM" and "t - watch power and thermal"
+  // once parse_cmd_opts() accepts those flags.
+  std::cout << "                                  x - watch XGMI\n";
+  std::cout << "  -c  --check                    Check to see if any errors or warnings have "
+            << "occurred in the currently monitored watches.\n";
+}
+
+// Fetch the component mask configured for group_id_ with rdc_health_get() and
+// print the On/Off state of the PCIe, XGMI and Memory watches, either as JSON
+// or as a text table.
+//
+// Throws RdcException when rdc_health_get() fails.
+void RdciHealthSubSystem::get_watches() const {
+  rdc_status_t result;
+  unsigned int components = 0;
+
+  result = rdc_health_get(rdc_handle_, group_id_, &components);
+  if (result != RDC_ST_OK) {
+    std::string error_msg = rdc_status_string(result);
+    if (result == RDC_ST_NOT_FOUND) {
+      error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(result, error_msg.c_str());
+  }
+
+  // "On" when the given watch bit is present in the fetched mask.
+  auto state = [components](unsigned int bit) -> const char* {
+    return (components & bit) ? "On" : "Off";
+  };
+
+  if (is_json_output()) {
+    std::cout << "\"heading\" : \"Health monitor systems status\", ";
+    std::cout << "\"body\" : [";
+    std::cout << "{\"Component\" : \"PCIe\", \"Status\" : \"" << state(RDC_HEALTH_WATCH_PCIE) << "\"},";
+    std::cout << "{\"Component\" : \"XGMI\", \"Status\" : \"" << state(RDC_HEALTH_WATCH_XGMI) << "\"},";
+    // Last element of the array: no trailing comma, otherwise the output is
+    // not valid JSON.
+    std::cout << "{\"Component\" : \"Memory\", \"Status\" : \"" << state(RDC_HEALTH_WATCH_MEM) << "\"}";
+    // To do: add InfoROM / Thermal / Power entries (RDC_HEALTH_WATCH_INFOROM,
+    // RDC_HEALTH_WATCH_THERMAL, RDC_HEALTH_WATCH_POWER) once they are supported.
+    std::cout << "]";
+  } else {
+    // One table row: 20-column label cell, 50-column status cell.
+    auto print_row = [&state](const char* label, unsigned int bit) {
+      std::cout << "|" << std::setw(20) << std::left << label << "| "
+                << std::setw(50) << std::left << state(bit) << "|\n";
+    };
+
+    std::cout << "Health monitor systems status:" << std::endl;
+    std::cout << "+--------------------+" //"-" width :20
+              << "---------------------------------------------------+\n"; //-" width :51
+    print_row(" PCIe", RDC_HEALTH_WATCH_PCIE);
+    print_row(" XGMI", RDC_HEALTH_WATCH_XGMI);
+    print_row(" Memory", RDC_HEALTH_WATCH_MEM);
+    // To do: add " InfoROM", " Thermal" and " Power" rows once supported.
+    std::cout << "+--------------------+" //"-" width :20
+              << "---------------------------------------------------+\n"; //-" width :51
+  }
+}
+
+// Apply components_ to group_id_ with rdc_health_set() and print a
+// confirmation line.
+//
+// Throws RdcException when rdc_health_set() fails.
+void RdciHealthSubSystem::set_watches() const {
+  const rdc_status_t status = rdc_health_set(rdc_handle_, group_id_, components_);
+
+  if (status != RDC_ST_OK) {
+    std::string reason = rdc_status_string(status);
+    if (status == RDC_ST_NOT_FOUND) {
+      // More helpful text than the generic status string.
+      reason = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(status, reason.c_str());
+  }
+
+  std::cout << "Group " << group_id_ << " health monitor systems set successfully." << std::endl;
+}
+
+// Map a health result to its display text; any value other than
+// PASS / WARN / FAIL yields "Unknown".
+std::string RdciHealthSubSystem::health_string(rdc_health_result_t health) const {
+  if (health == RDC_HEALTH_RESULT_PASS) {
+    return "Pass";
+  }
+  if (health == RDC_HEALTH_RESULT_WARN) {
+    return "Warning";
+  }
+  if (health == RDC_HEALTH_RESULT_FAIL) {
+    return "Fail";
+  }
+  return "Unknown";
+}
+
+// Map a health component to the label printed in front of its health text.
+// Every known label ends with ": " because callers append the health string
+// directly after it; unrecognized values yield "Unknown".
+std::string RdciHealthSubSystem::component_string(rdc_health_system_t component) const {
+    switch (component) {
+      case RDC_HEALTH_WATCH_PCIE:
+        return "PCIe system: ";
+
+      case RDC_HEALTH_WATCH_XGMI:
+        return "XGMI system: ";
+
+      case RDC_HEALTH_WATCH_MEM:
+        return "Memory system: ";
+
+      case RDC_HEALTH_WATCH_INFOROM:
+        return "Inforom system: ";
+
+      case RDC_HEALTH_WATCH_THERMAL:
+        // Trailing space added so the label matches the others.
+        return "Thermal system: ";
+
+      case RDC_HEALTH_WATCH_POWER:
+        return "Power system: ";
+
+      default:
+        return "Unknown";
+    }
+}
+
+// Print a message inside the report table, word-wrapped to the 60-column
+// message cell. The first (20-column) cell of each emitted row is left blank.
+//
+// input - message text; it is split on whitespace and re-joined with single
+//         spaces. A single word of 60 or more characters is printed on a row
+//         of its own (and overflows the cell).
+void RdciHealthSubSystem::output_errstr(const std::string& input) const {
+  const unsigned int width = 60;
+  std::istringstream iss(input);
+  std::string word, line_str;
+
+  // Emit one table row holding the given text.
+  auto emit_row = [&](const std::string& text) {
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(width) << std::left << text << "|\n";
+  };
+
+  while (iss >> word) {
+    if (line_str.empty()) {
+      // First word of a row: never flush an empty row, even when the word
+      // alone is wider than the cell.
+      line_str = word;
+    } else if (line_str.size() + word.size() >= width) {
+      // The word does not fit any more: flush the row and start a new one.
+      emit_row(line_str);
+      line_str = word;
+    } else {
+      line_str += " ";
+      line_str += word;
+    }
+  }
+
+  if (!line_str.empty()) {
+    emit_row(line_str);
+  }
+}
+
+// Walk the incidents of `response` from start_index while they belong to the
+// given gpu_index and component.
+//
+// component_health - in/out; raised to the highest health value seen.
+// err_str          - receives one " - <message>" entry per visited incident.
+//
+// Returns the number of consecutive matching incidents.
+unsigned int RdciHealthSubSystem::handle_one_component(rdc_health_response_t &response,
+                                                       unsigned int start_index,
+                                                       uint32_t gpu_index,
+                                                       rdc_health_system_t component,
+                                                       rdc_health_result_t &component_health,
+                                                       std::vector<std::string> &err_str) const {
+  unsigned int matched = 0;
+
+  for (unsigned int pos = start_index; pos < response.incidents_count; ++pos) {
+    const rdc_health_incidents_t &entry = response.incidents[pos];
+
+    // The run ends at the first incident of another GPU or another component.
+    const bool same_run = (entry.gpu_index == gpu_index) && (entry.component == component);
+    if (!same_run) {
+      break;
+    }
+
+    // Keep the highest health value of the run.
+    if (entry.health > component_health) {
+      component_health = entry.health;
+    }
+
+    std::string line = " - ";
+    line += entry.error.msg;
+    err_str.push_back(line);
+
+    ++matched;
+  }
+
+  return matched;
+}
+
+// Print the report section of one GPU.
+//
+// Consumes the consecutive incidents of `response`, starting at start_index,
+// that belong to gpu_index, groups them per component and prints the GPU
+// health plus the per-component health and messages (JSON object or text
+// table rows, depending on is_json_output()).
+//
+// Returns the number of incidents consumed.
+unsigned int RdciHealthSubSystem::handle_one_gpu(rdc_health_response_t &response,
+                                                 unsigned int start_index,
+                                                 uint32_t gpu_index) const {
+  struct component_detail_t {
+    rdc_health_result_t component_health;
+    std::vector<std::string> err_str;
+  };
+  std::map<rdc_health_system_t, component_detail_t> component_detail_map;
+
+  unsigned int count = 0;
+  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;
+
+  for (unsigned int i = start_index; i < response.incidents_count; i++) {
+    const rdc_health_incidents_t &incident = response.incidents[i];
+
+    // same GPU Index
+    if (incident.gpu_index != gpu_index)
+      break;
+
+    // handle the run of incidents of the same component
+    component_detail_t detail;
+    detail.component_health = RDC_HEALTH_RESULT_PASS;
+
+    const unsigned int comp_count = handle_one_component(
+        response, i, gpu_index, incident.component, detail.component_health, detail.err_str);
+    // The incident at i always matches itself, so comp_count >= 1; skip the
+    // rest of the run (the loop increment accounts for the last step).
+    i += comp_count - 1;
+    count += comp_count;
+
+    // The GPU health is the highest health of all its incidents, not only of
+    // the first incident of each run.
+    if (detail.component_health > gpu_health)
+      gpu_health = detail.component_health;
+
+    // Add to the component detail map; if the component was already seen for
+    // this GPU (non-contiguous incidents), merge instead of dropping the data.
+    auto inserted = component_detail_map.insert({incident.component, detail});
+    if (!inserted.second) {
+      component_detail_t &existing = inserted.first->second;
+      if (detail.component_health > existing.component_health)
+        existing.component_health = detail.component_health;
+      existing.err_str.insert(existing.err_str.end(), detail.err_str.begin(),
+                              detail.err_str.end());
+    }
+  }
+
+  // output gpu_index health result
+  const std::string gpu_health_str = health_string(gpu_health);
+
+  if (is_json_output()) {
+    std::cout << "{\"Index\" : \"" << std::to_string(gpu_index) << "\", ";
+    std::cout <<  "\"Health\" : \"" << gpu_health_str << "\", ";
+    std::cout <<  "\"Error\" : [";
+
+    unsigned int comp_idx = 0;
+    for (const auto &entry : component_detail_map) {
+      const component_detail_t &detail = entry.second;
+
+      std::cout << "{\"Component\" : \"" << component_string(entry.first) << "\", ";
+      std::cout << "\"Health\" : \"" << health_string(detail.component_health) << "\", ";
+
+      // NOTE(review): messages are emitted verbatim; a message containing a
+      // double quote or backslash would break the JSON.
+      std::cout << "\"Message\" : [";
+      unsigned int msg_idx = 0;
+      for (const auto &msg : detail.err_str) {
+        std::cout << "\"" << msg << "\"";
+        msg_idx++;
+        if (msg_idx < detail.err_str.size())
+          std::cout << ", ";
+      }
+      std::cout << "]}"; //end Message
+
+      comp_idx++;
+      if (comp_idx < component_detail_map.size()) {
+        std::cout << ", ";
+      }
+    }
+    std::cout <<  "]}"; //end Error
+  } else {
+    std::cout << "|" << std::setw(20) << " GPU ID: " + std::to_string(gpu_index) << "| "
+              << std::setw(60) << std::left << gpu_health_str << "|\n";
+    std::cout << "|" << std::setw(20) << " " << "| "
+              << std::setw(60) << " " << "|\n";
+
+    for (const auto &entry : component_detail_map) {
+      const component_detail_t &detail = entry.second;
+      std::cout << "|" << std::setw(20) << " " << "| "
+                << std::setw(60) << std::left
+                << component_string(entry.first) + health_string(detail.component_health)
+                << "|\n";
+
+      for (const auto &msg : detail.err_str)
+        output_errstr(msg);
+
+      std::cout << "|" << std::setw(20) << " " << "| "
+                << std::setw(60) << " " << "|\n";
+    }
+    std::cout << "+--------------------+-" //"-" width :20
+              << "------------------------------------------------------------+\n"; //-" width :60
+  }
+
+  return count;
+}
+
+// Run a health check for group_id_ and print the report.
+//
+// First verifies with rdc_health_get() that at least one watch is enabled,
+// then calls rdc_health_check() and prints the overall health followed by one
+// section per GPU (see handle_one_gpu()), as JSON or as a text table.
+//
+// Throws RdcException when either RDC call fails or no watch is enabled.
+void RdciHealthSubSystem::health_check() const {
+  unsigned int enabled = 0;
+  rdc_status_t status = rdc_health_get(rdc_handle_, group_id_, &enabled);
+  if (status != RDC_ST_OK) {
+    std::string reason = rdc_status_string(status);
+    if (status == RDC_ST_NOT_FOUND) {
+      reason = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(status, reason.c_str());
+  }
+
+  if (enabled == 0) {
+    const std::string reason = "Health watches not enable, please enable watches first.";
+    throw RdcException(RDC_ST_UNKNOWN_ERROR, reason.c_str());
+  }
+
+  rdc_health_response_t response;
+  status = rdc_health_check(rdc_handle_, group_id_, &response);
+  if (status != RDC_ST_OK) {
+    throw RdcException(status, rdc_status_string(status));
+  }
+
+  const bool as_json = is_json_output();
+  const std::string overall_str = health_string(response.overall_health);
+
+  // Report header.
+  if (as_json) {
+    std::cout << "\"heading\" : \"Health monitor report\", ";
+    std::cout << "\"body\" : ";
+    std::cout <<  "{\"Group\" : \"" << std::to_string(group_id_) << "\", ";
+    std::cout <<   "\"Overall Health\" : \"" << overall_str << "\", ";
+    std::cout <<   "\"GPU\" : [";
+  } else {
+    std::cout << "Health monitor report:" << std::endl;
+    std::cout << "+--------------------+-" //"-" width :20
+              << "------------------------------------------------------------+\n"; //-" width :60
+    std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id_) << "| "
+              << std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
+    std::cout << "+====================+=" //"=" width :20
+              << "============================================================+\n"; //"=" width :60
+  }
+
+  // One section per GPU; each call reports how many incidents it consumed.
+  for (unsigned int next = 0; next < response.incidents_count;) {
+    const uint32_t gpu_index = response.incidents[next].gpu_index;
+    const unsigned int consumed = handle_one_gpu(response, next, gpu_index);
+    next += consumed;
+
+    // Separate the GPU objects inside the JSON array.
+    if (is_json_output() && (next < response.incidents_count)) {
+      std::cout << ",";
+    }
+  }
+
+  if (is_json_output()) {
+    std::cout <<  "]}"; //end Group
+  }
+}
+
+// Clear the health watches of group_id_ with rdc_health_clear() and print a
+// confirmation line.
+//
+// Throws RdcException when rdc_health_clear() fails.
+void RdciHealthSubSystem::health_clear() const {
+  const rdc_status_t status = rdc_health_clear(rdc_handle_, group_id_);
+
+  if (status != RDC_ST_OK) {
+    std::string reason = rdc_status_string(status);
+    if (status == RDC_ST_NOT_FOUND) {
+      // More helpful text than the generic status string.
+      reason = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
+    }
+    throw RdcException(status, reason.c_str());
+  }
+
+  std::cout << "Clear Group " << group_id_ << " all health monitor systems." << std::endl;
+}
+
+// Execute the operation recorded in health_ops_. Help, unknown and any
+// unexpected value all fall back to printing the usage text.
+void RdciHealthSubSystem::process() {
+  switch (health_ops_) {
+    case HEALTH_FETCH:
+      get_watches();
+      return;
+
+    case HEALTH_SET:
+      set_watches();
+      return;
+
+    case HEALTH_CHECK:
+      health_check();
+      return;
+
+    case HEALTH_CLEAR:
+      health_clear();
+      return;
+
+    case HEALTH_HELP:
+    case HEALTH_UNKNOWN:
+    default:
+      show_help();
+      return;
+  }
+}
+
+}  // namespace rdc
+}  // namespace amd
@@ -31,6 +31,7 @@ THE SOFTWARE.
 #include "RdciGroupSubSystem.h"
 #include "RdciStatsSubSystem.h"
 #include "RdciPolicySubSystem.h"
+#include "RdciHealthSubSystem.h"
 #include "rdc/rdc.h"
 #include "rdc_lib/RdcException.h"
 #include "rdc_lib/rdc_common.h"
@@ -50,7 +51,7 @@ int main(int argc, char** argv) {
  const std::string usage_help =
      "Usage:\trdci <subsystem>|<options>\n"
      "subsystem: \n"
-      "          discovery, dmon, group, fieldgroup, stats, diag, policy\n"
+      "          discovery, dmon, group, fieldgroup, stats, diag, policy, health\n"
      "options: \n"
      "        -v(--version) : Print client version information only\n";

@@ -81,6 +82,8 @@ int main(int argc, char** argv) {
      subsystem.reset(new amd::rdc::RdciGroupSubSystem());
    } else if (subsystem_name == "fieldgroup") {
      subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem());
+    } else if (subsystem_name == "health") {
+      subsystem.reset(new amd::rdc::RdciHealthSubSystem());
    } else if (subsystem_name == "stats") {
      subsystem.reset(new amd::rdc::RdciStatsSubSystem());
    } else if (subsystem_name == "policy") {
@@ -153,6 +153,22 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
                                  const ::rdc::UnRegisterPolicyRequest* request,
                                  ::rdc::UnRegisterPolicyResponse* reply) override;

+  ::grpc::Status SetHealth(::grpc::ServerContext* context,
+                           const ::rdc::SetHealthRequest* request,
+                           ::rdc::SetHealthResponse* reply) override;
+
+  ::grpc::Status GetHealth(::grpc::ServerContext* context,
+                           const ::rdc::GetHealthRequest* request,
+                           ::rdc::GetHealthResponse* reply) override;
+
+  ::grpc::Status CheckHealth(::grpc::ServerContext* context,
+                             const ::rdc::CheckHealthRequest* request,
+                             ::rdc::CheckHealthResponse* reply) override;
+
+  ::grpc::Status ClearHealth(::grpc::ServerContext* context,
+                             const ::rdc::ClearHealthRequest* request,
+                             ::rdc::ClearHealthResponse* reply) override;
+
 private:
  bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
  rdc_handle_t rdc_handle_;
@@ -947,5 +947,93 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
  return ::grpc::Status::OK;
 }

+// gRPC handler for SetHealth.
+//
+// Passes the request's group id and components value to rdc_health_set() and
+// stores the returned RDC status in the reply. A null request or reply yields
+// an INTERNAL gRPC status; in every other case the RPC itself completes with
+// OK and any RDC-level failure is reported through the reply's status field.
+::grpc::Status RdcAPIServiceImpl::SetHealth(::grpc::ServerContext* context,
+                                            const ::rdc::SetHealthRequest* request,
+                                            ::rdc::SetHealthResponse* reply) {
+  (void)(context);  // the server context is not used by this handler
+  if (request == nullptr || reply == nullptr) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  const auto group = request->group_id();
+  const auto wanted = request->components();
+  reply->set_status(rdc_health_set(rdc_handle_, group, wanted));
+
+  return ::grpc::Status::OK;
+}
+
+// gRPC handler for GetHealth.
+//
+// Calls rdc_health_get() for the request's group id and stores the returned
+// RDC status in the reply. The components value is copied into the reply only
+// when that status is RDC_ST_OK. A null request or reply yields an INTERNAL
+// gRPC status; otherwise the RPC itself always completes with OK.
+::grpc::Status RdcAPIServiceImpl::GetHealth(::grpc::ServerContext* context,
+                                            const ::rdc::GetHealthRequest* request,
+                                            ::rdc::GetHealthResponse* reply) {
+  (void)(context);  // the server context is not used by this handler
+  if (!reply || !request) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  // Start from a defined value so nothing indeterminate is ever read, even if
+  // rdc_health_get() reports success without writing the out-parameter.
+  unsigned int components = 0;
+  rdc_status_t result = rdc_health_get(rdc_handle_, request->group_id(), &components);
+
+  reply->set_status(result);
+  if (result == RDC_ST_OK) {
+    reply->set_components(components);
+  }
+
+  return ::grpc::Status::OK;
+}
+
+// gRPC handler for CheckHealth.
+//
+// Calls rdc_health_check() for the request's group id and stores the returned
+// RDC status in the reply. When that status is RDC_ST_OK, the overall health,
+// the incident count and each incident (gpu index, component, health, error
+// code and message) are copied into the reply's response message. A null
+// request or reply yields an INTERNAL gRPC status; otherwise the RPC itself
+// always completes with OK.
+::grpc::Status RdcAPIServiceImpl::CheckHealth(::grpc::ServerContext* context,
+                                              const ::rdc::CheckHealthRequest* request,
+                                              ::rdc::CheckHealthResponse* reply) {
+  (void)(context);  // the server context is not used by this handler
+  if (!reply || !request) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  // Value-initialize the whole structure so the count, the incident entries
+  // and the message buffers all hold defined (zero) contents even if
+  // rdc_health_check() fills them only partially.
+  rdc_health_response_t response{};
+  rdc_status_t result = rdc_health_check(rdc_handle_, request->group_id(), &response);
+
+  reply->set_status(result);
+  if (result != RDC_ST_OK) {
+    return ::grpc::Status::OK;
+  }
+
+  ::rdc::HealthResponse* to_response = reply->mutable_response();
+  to_response->set_overall_health(response.overall_health);
+  to_response->set_incidents_count(response.incidents_count);
+
+  // NOTE(review): incidents_count is used as the loop bound without validation;
+  // presumably it never exceeds the capacity of response.incidents - confirm.
+  for (uint32_t i = 0; i < response.incidents_count; i++) {
+    const rdc_health_incidents_t& incident = response.incidents[i];
+    ::rdc::HealthIncidents* to_incidents = to_response->add_incidents();
+
+    to_incidents->set_gpu_index(incident.gpu_index);
+    to_incidents->set_component(incident.component);
+    to_incidents->set_health(incident.health);
+
+    // Error details attached to this incident.
+    auto to_error = to_incidents->mutable_error();
+    to_error->set_code(incident.error.code);
+    to_error->set_msg(incident.error.msg);
+  }
+
+  return ::grpc::Status::OK;
+}
+
+// gRPC handler for ClearHealth.
+//
+// Calls rdc_health_clear() for the request's group id and stores the returned
+// RDC status in the reply. A null request or reply yields an INTERNAL gRPC
+// status; in every other case the RPC itself completes with OK and any
+// RDC-level failure is reported through the reply's status field.
+::grpc::Status RdcAPIServiceImpl::ClearHealth(::grpc::ServerContext* context,
+                                              const ::rdc::ClearHealthRequest* request,
+                                              ::rdc::ClearHealthResponse* reply) {
+  (void)(context);  // the server context is not used by this handler
+  const bool missing_args = (request == nullptr) || (reply == nullptr);
+  if (missing_args) {
+    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
+  }
+
+  const rdc_status_t outcome = rdc_health_clear(rdc_handle_, request->group_id());
+  reply->set_status(outcome);
+  return ::grpc::Status::OK;
+}
+
 }  // namespace rdc
 }  // namespace amd