Background health check
Add the RdcSmiHealth module, which will call rocm_smi_lib.
It will support the following health checks:
- XGMI error detected
- PCIE replay count detected
- Memory check
- InfoROM check
- Power/Thermal check
The gRPC client-side and server-side health functions are added.
The health module is added to the rdci.
At present, XGMI/PCIE and a part of Memory have been implemented.
Others will be added as soon as possible.
Change-Id: I1bd99290bdc7dea733f21a41a8c4bcefb2138112
[ROCm/rdc commit: 853d3b0cc5]
Этот коммит содержится в:
коммит произвёл
Meng, Li (Jassmine)
родитель
39758d913c
Коммит
71e2727a8f
@@ -161,3 +161,12 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp",
|
||||
FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false)
|
||||
FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false)
|
||||
FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", "RING_HANG", false)
|
||||
|
||||
// RDC health related fields
|
||||
FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true)
|
||||
FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT, "Total PCIE replay count", "PCIE_REPLAY_COUNT", true)
|
||||
FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM, "Pending page number", "PENDING_PAGE_NUM", true)
|
||||
FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", false)
|
||||
FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit", "UNCORRECTABLE_PAGE_LIMIT", false)
|
||||
FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", false)
|
||||
FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", false)
|
||||
|
||||
@@ -120,6 +120,12 @@ set(POLICY_EXAMPLE_EXE "policy")
|
||||
add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
|
||||
target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap)
|
||||
|
||||
# Health monitoring example: builds the "health" executable from
# health_example.cc and links it like the other examples in this file.
set(HEALTH_EXAMPLE_EXE "health")
set(HEALTH_EXAMPLE_SRC_LIST "health_example.cc")
cmake_print_variables(HEALTH_EXAMPLE_SRC_LIST)

add_executable(${HEALTH_EXAMPLE_EXE} "${HEALTH_EXAMPLE_SRC_LIST}")
target_link_libraries(${HEALTH_EXAMPLE_EXE} pthread dl rdc_bootstrap)
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
message(" Finished Cmake Example ")
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
|
||||
@@ -0,0 +1,359 @@
|
||||
/*
|
||||
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>

#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

#include "rdc/rdc.h"
|
||||
|
||||
/**
 * @brief Print which health watches are enabled for a GPU group.
 *
 * Reads the watch bitmask with rdc_health_get() and, on success, prints an
 * On/Off table for the PCIe, XGMI and Memory watches to std::cout. Nothing is
 * printed when the query fails.
 *
 * @param[in] rdc_handle The RDC handle passed through to rdc_health_get().
 * @param[in] group_id   The GPU group to query.
 *
 * @return The status returned by rdc_health_get().
 */
rdc_status_t get_watches(rdc_handle_t rdc_handle, rdc_gpu_group_t group_id) {
  unsigned int components;
  const rdc_status_t status = rdc_health_get(rdc_handle, group_id, &components);
  if (status != RDC_ST_OK) {
    return status;
  }

  // Top and bottom border of the table: 20-wide label column, 51-wide state column.
  const char* const border =
      "+--------------------+"
      "---------------------------------------------------+\n";

  // One table row: left-aligned label followed by "On"/"Off" for the given watch bit.
  const auto print_row = [components](const char* label, unsigned int mask) {
    const char* state = (components & mask) ? "On" : "Off";
    std::cout << "|" << std::setw(20) << std::left << label << "| "
              << std::setw(50) << std::left << state << "|\n";
  };

  std::cout << "Health monitor systems status:" << std::endl;
  std::cout << border;
  print_row(" PCIe", RDC_HEALTH_WATCH_PCIE);
  print_row(" XGMI", RDC_HEALTH_WATCH_XGMI);
  print_row(" Memory", RDC_HEALTH_WATCH_MEM);
  std::cout << border;

  return status;
}
|
||||
|
||||
/**
 * @brief Convert a health result into its display text.
 *
 * @param[in] health The health result to describe.
 *
 * @return "Pass", "Warning" or "Fail" for the known results, "Unknown" for
 *         any other value.
 */
std::string health_string(rdc_health_result_t health) {
  if (health == RDC_HEALTH_RESULT_PASS) {
    return "Pass";
  }
  if (health == RDC_HEALTH_RESULT_WARN) {
    return "Warning";
  }
  if (health == RDC_HEALTH_RESULT_FAIL) {
    return "Fail";
  }
  return "Unknown";
}
|
||||
|
||||
/**
 * @brief Convert a health component into the label used in the report.
 *
 * Every known label ends with ": " because callers append the health text
 * directly after it (for example "Thermal system: Pass").
 *
 * @param[in] component The health system to describe.
 *
 * @return The label for the component, or "Unknown" for any other value.
 */
std::string component_string(rdc_health_system_t component) {
  switch (component) {
    case RDC_HEALTH_WATCH_PCIE:
      return "PCIe system: ";

    case RDC_HEALTH_WATCH_XGMI:
      return "XGMI system: ";

    case RDC_HEALTH_WATCH_MEM:
      return "Memory system: ";

    case RDC_HEALTH_WATCH_INFOROM:
      return "Inforom system: ";

    case RDC_HEALTH_WATCH_THERMAL:
      // Trailing space added so the label matches the others.
      return "Thermal system: ";

    case RDC_HEALTH_WATCH_POWER:
      return "Power system: ";

    default:
      return "Unknown";
  }
}
|
||||
|
||||
/**
 * @brief Print an error message word-wrapped into the report's detail column.
 *
 * The message is split on whitespace and re-flowed into rows of at most 60
 * characters. Each row is printed to std::cout as an empty 20-wide label
 * column followed by the left-aligned text padded to 60 characters.
 *
 * A row is only flushed when it already holds text, so a long first word does
 * not produce an empty row. A single word longer than the column is split
 * into column-sized pieces so it cannot push the right border out.
 *
 * @param[in] input The message to print. Empty or all-whitespace input
 *                  prints nothing.
 */
void output_errstr(const std::string& input) {
  static constexpr std::size_t kWidth = 60;

  const auto emit_row = [](const std::string& text) {
    std::cout << "|" << std::setw(20) << std::left << " " << "| "
              << std::setw(static_cast<int>(kWidth)) << std::left << text << "|\n";
  };

  std::istringstream iss(input);
  std::string word;
  std::string line_str;

  while (iss >> word) {
    // Flush the pending row when appending " " + word would exceed the column.
    if (!line_str.empty() && line_str.size() + 1 + word.size() > kWidth) {
      emit_row(line_str);
      line_str.clear();
    }

    // An over-long word always starts on an empty row here; emit it in
    // column-sized pieces and keep the remainder as the start of the next row.
    while (word.size() > kWidth) {
      emit_row(word.substr(0, kWidth));
      word.erase(0, kWidth);
    }

    if (!line_str.empty()) {
      line_str += ' ';
    }
    line_str += word;
  }  // end while

  if (!line_str.empty()) {
    emit_row(line_str);
  }
}
|
||||
|
||||
/**
 * @brief Collect the consecutive incidents of one component on one GPU.
 *
 * Walks response.incidents from start_index and stops at the first incident
 * whose GPU index or component differs, or at incidents_count. For every
 * matching incident the message is appended to err_str (prefixed with " - ")
 * and component_health is raised to the incident's health if that is higher.
 *
 * @param[in]     response         The health response to read incidents from.
 * @param[in]     start_index      Index of the first incident to examine.
 * @param[in]     gpu_index        GPU index the incidents must match.
 * @param[in]     component        Component the incidents must match.
 * @param[in,out] component_health Running maximum health for the component.
 * @param[in,out] err_str          Receives one formatted message per incident.
 *
 * @return The number of incidents consumed.
 */
unsigned int handle_one_component(rdc_health_response_t &response,
                                  unsigned int start_index,
                                  uint32_t gpu_index,
                                  rdc_health_system_t component,
                                  rdc_health_result_t &component_health,
                                  std::vector<std::string> &err_str) {
  unsigned int idx = start_index;

  while (idx < response.incidents_count) {
    rdc_health_incidents_t &entry = response.incidents[idx];

    // The run ends at the first incident for another GPU or another component.
    const bool same_run = (entry.gpu_index == gpu_index) && (entry.component == component);
    if (!same_run) {
      break;
    }

    // Track the highest health value seen in this run.
    if (component_health < entry.health) {
      component_health = entry.health;
    }

    err_str.push_back(std::string(" - ") + entry.error.msg);
    ++idx;
  }

  return idx - start_index;
}
|
||||
|
||||
/**
 * @brief Print the health report section for one GPU.
 *
 * Consumes the consecutive incidents in response.incidents, starting at
 * start_index, that belong to gpu_index. Incidents are grouped per component
 * through handle_one_component(). The GPU row shows the highest health value
 * found across all of its incidents, followed by one sub-section per
 * component with its health and word-wrapped error messages.
 *
 * Fixes over the previous version:
 *  - GPU health is taken from every incident of a component run, not only
 *    from the first one of the run.
 *  - A component that shows up in more than one run is merged instead of
 *    having its later incidents dropped.
 *  - Map entries are iterated by reference instead of being copied.
 *  - The GPU row sets std::left itself instead of relying on earlier output.
 *
 * @param[in] response    The health response to read incidents from.
 * @param[in] start_index Index of the first incident to examine.
 * @param[in] gpu_index   GPU index the incidents must match.
 *
 * @return The number of incidents consumed for this GPU.
 */
unsigned int handle_one_gpu(rdc_health_response_t &response,
                            unsigned int start_index,
                            uint32_t gpu_index) {
  struct component_detail_t {
    rdc_health_result_t component_health = RDC_HEALTH_RESULT_PASS;
    std::vector<std::string> err_str;
  };
  std::map<rdc_health_system_t, component_detail_t> component_detail_map;

  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;
  unsigned int count = 0;
  unsigned int i = start_index;

  while (i < response.incidents_count) {
    const rdc_health_incidents_t &incident = response.incidents[i];

    // same GPU Index
    if (incident.gpu_index != gpu_index) {
      break;
    }

    // handle same component; operator[] creates the entry on first sight and
    // reuses it when the component shows up again later for this GPU.
    component_detail_t &detail = component_detail_map[incident.component];
    const unsigned int comp_count =
        handle_one_component(response, i, gpu_index, incident.component,
                             detail.component_health, detail.err_str);
    if (comp_count == 0) {
      break;  // defensive: never loop without making progress
    }

    // set gpu health from the whole component run
    if (detail.component_health > gpu_health) {
      gpu_health = detail.component_health;
    }

    i += comp_count;
    count += comp_count;
  }

  // output gpu_index health result
  std::cout << "|" << std::setw(20) << std::left << " GPU ID: " + std::to_string(gpu_index) << "| "
            << std::setw(60) << std::left << health_string(gpu_health) << "|\n";
  std::cout << "|" << std::setw(20) << " " << "| "
            << std::setw(60) << " " << "|\n";

  for (const auto &entry : component_detail_map) {
    const std::string component_str = component_string(entry.first);
    const std::string health_str = health_string(entry.second.component_health);
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(60) << std::left << component_str + health_str << "|\n";

    for (const auto &msg : entry.second.err_str) {
      output_errstr(msg);
    }

    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(60) << " " << "|\n";
  }
  std::cout << "+--------------------+-"  //"-" width :20
            << "------------------------------------------------------------+\n";  //-" width :60

  return count;
}
|
||||
|
||||
int main(int, char**) {
|
||||
rdc_status_t result;
|
||||
rdc_handle_t rdc_handle;
|
||||
char hostIpAddress[] = {"127.0.0.1:50051"};
|
||||
char group_name[] = {"healthgroup1"};
|
||||
|
||||
std::cout << "Start rdci in Standalone mode\n";
|
||||
|
||||
// Init the rdc
|
||||
result = rdc_init(0);
|
||||
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
|
||||
goto cleanup;
|
||||
} else {
|
||||
std::cout << "RDC Initialized.\n";
|
||||
}
|
||||
|
||||
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Now we can use the same API for standalone
|
||||
// (1) create group and add GPUs
|
||||
rdc_gpu_group_t group_id;
|
||||
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error creating group. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto cleanup;
|
||||
}
|
||||
std::cout << "Created the GPU group " << group_id << std::endl;
|
||||
|
||||
result = rdc_group_gpu_add(rdc_handle, group_id, 0); // Add GPU 0
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error adding group. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto destroygroup;
|
||||
}
|
||||
|
||||
rdc_device_attributes_t attribute;
|
||||
result = rdc_device_get_attributes(rdc_handle, 0, &attribute);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
|
||||
goto destroygroup;
|
||||
}
|
||||
std::cout << "Add GPU 0: " << attribute.device_name << " to group "
|
||||
<< group_id << std::endl;
|
||||
|
||||
// (2) get heath current watches before setting
|
||||
result = get_watches(rdc_handle, group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto destroygroup;
|
||||
}
|
||||
|
||||
// (3) set health watches.
|
||||
unsigned int components;
|
||||
components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM;
|
||||
result = rdc_health_set(rdc_handle, group_id, components);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error setting health watches. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto destroygroup;
|
||||
}
|
||||
std::cout << "Set health watches to all." << std::endl;
|
||||
|
||||
// (4) get heath current watches after setting
|
||||
result = get_watches(rdc_handle, group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto destroygroup;
|
||||
}
|
||||
|
||||
std::cout << "Start to health monitor group:" << group_id
|
||||
<< std::endl;
|
||||
std::cout << "Sleep a few seconds before retreive the data ...\n";
|
||||
// For standalone mode, the daemon will update and cache the samples
|
||||
// take samples, standalone mode, do nothing
|
||||
usleep(5000000); // sleep 5 seconds before fetch the stats
|
||||
|
||||
// (5) Get the health stats
|
||||
rdc_health_response_t response;
|
||||
result = rdc_health_check(rdc_handle, group_id, &response);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error health check. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto destroygroup;
|
||||
} else {
|
||||
//output headline
|
||||
std::string overall_str = health_string(response.overall_health);
|
||||
std::cout << "Health monitor report:" << std::endl;
|
||||
std::cout << "+--------------------+-" //"-" width :20
|
||||
<< "------------------------------------------------------------+\n"; //-" width :60
|
||||
std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id) << "| "
|
||||
<< std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
|
||||
std::cout << "+====================+=" //"=" width :20
|
||||
<< "============================================================+\n"; //"=" width :60
|
||||
|
||||
//output health of per GPU
|
||||
unsigned int index = 0;
|
||||
while (index < response.incidents_count) {
|
||||
uint32_t gpu_index = response.incidents[index].gpu_index;
|
||||
|
||||
unsigned int count = handle_one_gpu(response, index, gpu_index);
|
||||
index += count;
|
||||
}
|
||||
}
|
||||
|
||||
// (6) Clear the health
|
||||
result = rdc_health_clear(rdc_handle, group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error clear health. Return: " << rdc_status_string(result)
|
||||
<< std::endl;
|
||||
goto destroygroup;
|
||||
}
|
||||
std::cout << "Clear Group " << group_id << " all health monitor systems." << std::endl;
|
||||
|
||||
destroygroup:
|
||||
// Delete the GPU group
|
||||
result = rdc_group_gpu_destroy(rdc_handle, group_id);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::cout << "Error delete GPU group. Return: " << rdc_status_string(result);
|
||||
goto cleanup;
|
||||
}
|
||||
std::cout << "Deleted the GPU group " << group_id << std::endl;
|
||||
|
||||
// Cleanup consists of shutting down RDC.
|
||||
cleanup:
|
||||
std::cout << "Cleaning up.\n";
|
||||
rdc_disconnect(rdc_handle);
|
||||
rdc_shutdown();
|
||||
return result;
|
||||
}
|
||||
@@ -335,6 +335,18 @@ typedef enum {
|
||||
RDC_EVNT_NOTIF_RING_HANG, //!< GPU ring hang just occurred
|
||||
|
||||
RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG,
|
||||
|
||||
/**
|
||||
* @brief RDC health related fields
|
||||
*/
|
||||
RDC_HEALTH_XGMI_ERROR = 3000, //!< XGMI one or more errors detected
|
||||
RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count
|
||||
RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number
|
||||
RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number
|
||||
RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page
|
||||
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,//!< The threshold of uncorrectable page
|
||||
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
|
||||
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
|
||||
} rdc_field_t;
|
||||
|
||||
// even and odd numbers are used for correctable and uncorrectable errors
|
||||
@@ -589,6 +601,81 @@ typedef struct {
|
||||
rdc_policy_action_t action; //!< Action to take
|
||||
} rdc_policy_t;
|
||||
|
||||
/**
|
||||
* @brief type of health watches
|
||||
*/
|
||||
typedef enum {
  RDC_HEALTH_WATCH_PCIE = 1 << 0,     //!< PCIe system watches
  RDC_HEALTH_WATCH_XGMI = 1 << 1,     //!< XGMI system watches
  RDC_HEALTH_WATCH_MEM = 1 << 2,      //!< Memory watches
  RDC_HEALTH_WATCH_INFOROM = 1 << 3,  //!< Inforom watches
  RDC_HEALTH_WATCH_THERMAL = 1 << 4,  //!< Temperature watches
  RDC_HEALTH_WATCH_POWER = 1 << 5,    //!< Power watches
} rdc_health_system_t;
|
||||
|
||||
/**
|
||||
* @brief type of health result
|
||||
*/
|
||||
typedef enum {
  RDC_HEALTH_RESULT_PASS = 0,  //!< The health test passed
  RDC_HEALTH_RESULT_WARN = 1,  //!< The health test has warnings
  RDC_HEALTH_RESULT_FAIL = 2   //!< The health test failed
} rdc_health_result_t;
|
||||
|
||||
/**
|
||||
* @brief The maximum length of the health messages
|
||||
*/
|
||||
#define MAX_HEALTH_MSG_LENGTH 4096
|
||||
|
||||
/**
|
||||
* 8 replays per minute is the maximum recommended
|
||||
*/
|
||||
#define PCIE_MAX_REPLAYS_PERMIN 8
|
||||
|
||||
// The error code set at rdc_health_incidents_t.error.code
|
||||
typedef enum {
  /* Values are assigned explicitly; keep them stable when adding new codes. */
  RDC_FR_PCI_REPLAY_RATE                   = 1000,
  RDC_FR_ECC_UNCORRECTABLE_DETECTED        = 1001,
  RDC_FR_PENDING_PAGE_RETIREMENTS          = 1002,
  RDC_FR_RETIRED_PAGES_LIMIT               = 1003,
  RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT = 1004,
  RDC_FR_CLOCKS_THROTTLE_THERMAL           = 1005,
  RDC_FR_CLOCKS_THROTTLE_POWER             = 1006,
  RDC_FR_XGMI_SINGLE_ERROR                 = 1007,
  RDC_FR_XGMI_MULTIPLE_ERROR               = 1008,
  RDC_FR_CORRUPT_INFOROM                   = 1009
} rdc_health_error_code_t;
|
||||
|
||||
/**
|
||||
* @brief details of the health errors
|
||||
*/
|
||||
typedef struct {
|
||||
char msg[MAX_HEALTH_MSG_LENGTH]; //!< The test result details
|
||||
uint32_t code; //!< The low level error code
|
||||
} rdc_health_detail_t;
|
||||
|
||||
/**
|
||||
* @brief details of the per health incidents
|
||||
*/
|
||||
typedef struct {
|
||||
uint32_t gpu_index; //!< which GPU in this group have the issue
|
||||
rdc_health_system_t component; //!< which components have the issue
|
||||
rdc_health_result_t health; //!< health diagnosis of this incident
|
||||
rdc_health_detail_t error; //!< The details of the error, rdc_health_error_code_t
|
||||
} rdc_health_incidents_t;
|
||||
|
||||
|
||||
#define HEALTH_MAX_ERROR_ITEMS 64
|
||||
|
||||
/**
|
||||
* @brief The health responses for test cases
|
||||
*/
|
||||
typedef struct {
|
||||
rdc_health_result_t overall_health; //!< The overall health of this entire host
|
||||
unsigned int incidents_count; //!< The number of health incidents reported in this struct
|
||||
rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]; //!< Report of the errors detected
|
||||
} rdc_health_response_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize ROCm RDC.
|
||||
*
|
||||
@@ -1274,6 +1361,72 @@ rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t grou
|
||||
*/
|
||||
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
||||
|
||||
/**
|
||||
* @brief enable the health check for a group
|
||||
*
|
||||
* @details For each group, only one parameter can be set. If you want to
|
||||
* clear the setting for a group, set component == 0x0
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[in] components The list of components that should be enabled for health check
|
||||
* for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
unsigned int components);
|
||||
|
||||
/**
|
||||
* @brief get the health check settings of a group
|
||||
*
|
||||
* @details Get the health check settings of a group.
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[out] components The components that are enabled for health check,
|
||||
* for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
|
||||
* If it is 0x0, the health check is not set for the group yet.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
unsigned int* components);
|
||||
|
||||
/**
|
||||
* @brief Check health watch results
|
||||
*
|
||||
* @details The response reports any health incidents that were detected.
|
||||
* For each incident, check the component and error message.
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[inout] response The detail results of the health.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
||||
rdc_health_response_t* response);
|
||||
|
||||
/**
|
||||
* @brief clear the health watch
|
||||
*
|
||||
* @details For each group, clear the setting.
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif // __cplusplus
|
||||
|
||||
@@ -59,6 +59,21 @@ class RdcCacheManager {
|
||||
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
|
||||
virtual rdc_status_t rdc_job_remove_all() = 0;
|
||||
|
||||
virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
const rdc_field_value& value) = 0;
|
||||
virtual rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_field_t field_id,
|
||||
uint64_t start_timestamp,
|
||||
uint64_t end_timestamp,
|
||||
rdc_field_value* start_value,
|
||||
rdc_field_value* end_value) = 0;
|
||||
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
|
||||
virtual rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
const rdc_field_value& value) = 0;
|
||||
|
||||
virtual ~RdcCacheManager() {}
|
||||
};
|
||||
|
||||
|
||||
@@ -106,6 +106,12 @@ class RdcHandler {
|
||||
|
||||
virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;
|
||||
|
||||
// Health API
|
||||
virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) = 0;
|
||||
virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) = 0;
|
||||
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) = 0;
|
||||
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
|
||||
|
||||
virtual ~RdcHandler() {}
|
||||
};
|
||||
|
||||
|
||||
@@ -50,6 +50,14 @@ class RdcWatchTable {
|
||||
virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
rdc_field_grp_t field_group_id) = 0;
|
||||
|
||||
virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
|
||||
unsigned int components) = 0;
|
||||
virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id,
|
||||
unsigned int* components) = 0;
|
||||
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id,
|
||||
rdc_health_response_t *response) = 0;
|
||||
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
|
||||
|
||||
virtual ~RdcWatchTable() {}
|
||||
};
|
||||
|
||||
|
||||
@@ -32,6 +32,8 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcCacheManager.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
#define HEALTH_MAX_KEEP_SAMPLES 300
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
@@ -81,6 +83,9 @@ struct RdcJobStatsCacheEntry {
|
||||
// <job_id, job_stats>
|
||||
typedef std::map<std::string, RdcJobStatsCacheEntry> RdcJobStatsCache;
|
||||
|
||||
// <group_id, health_samples>
|
||||
typedef std::map<rdc_gpu_group_t, RdcCacheSamples> RdcHealthStatsCache;
|
||||
|
||||
class RdcCacheManagerImpl : public RdcCacheManager {
|
||||
public:
|
||||
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
|
||||
@@ -105,6 +110,21 @@ class RdcCacheManagerImpl : public RdcCacheManager {
|
||||
rdc_status_t rdc_job_remove(const char job_id[64]) override;
|
||||
rdc_status_t rdc_job_remove_all() override;
|
||||
|
||||
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
const rdc_field_value& value) override;
|
||||
rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_field_t field_id,
|
||||
uint64_t start_timestamp,
|
||||
uint64_t end_timestamp,
|
||||
rdc_field_value* start_value,
|
||||
rdc_field_value* end_value) override;
|
||||
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
|
||||
rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
const rdc_field_value& value) override;
|
||||
|
||||
private:
|
||||
void set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu,
|
||||
rdc_stats_summary_t& summary, // NOLINT
|
||||
@@ -113,6 +133,7 @@ class RdcCacheManagerImpl : public RdcCacheManager {
|
||||
uint32_t num_gpus); // NOLINT
|
||||
RdcCacheSamples cache_samples_;
|
||||
RdcJobStatsCache cache_jobs_;
|
||||
RdcHealthStatsCache cache_health_;
|
||||
std::mutex cache_mutex_;
|
||||
};
|
||||
|
||||
|
||||
@@ -108,6 +108,12 @@ class RdcEmbeddedHandler final : public RdcHandler {
|
||||
|
||||
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
|
||||
|
||||
// Health API
|
||||
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
|
||||
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
|
||||
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
|
||||
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
|
||||
|
||||
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
|
||||
~RdcEmbeddedHandler() final;
|
||||
|
||||
|
||||
@@ -105,6 +105,12 @@ class RdcStandaloneHandler : public RdcHandler {
|
||||
|
||||
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
|
||||
|
||||
// Health API
|
||||
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
|
||||
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
|
||||
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
|
||||
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
|
||||
|
||||
explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
|
||||
const char* client_cert, const char* client_key);
|
||||
|
||||
|
||||
@@ -55,6 +55,12 @@ struct JobWatchTableEntry {
|
||||
std::vector<RdcFieldKey> fields; //< store fields for faster query
|
||||
};
|
||||
|
||||
struct HealthWatchTableEntry {
|
||||
unsigned int components;
|
||||
rdc_field_grp_t field_group_id;
|
||||
std::vector<RdcFieldKey> fields; //< store fields for faster query
|
||||
};
|
||||
|
||||
class RdcWatchTableImpl : public RdcWatchTable {
|
||||
public:
|
||||
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64],
|
||||
@@ -74,6 +80,11 @@ class RdcWatchTableImpl : public RdcWatchTable {
|
||||
//!< is reached, which will be handled in the clean_up() function.
|
||||
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
|
||||
|
||||
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
|
||||
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
|
||||
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
|
||||
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
|
||||
|
||||
//!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will
|
||||
//!< call this function periodically. Instead of providing other APIs to
|
||||
//!< cleanup the cache, this function will update and cleanup the cache.
|
||||
@@ -85,7 +96,8 @@ class RdcWatchTableImpl : public RdcWatchTable {
|
||||
rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override;
|
||||
|
||||
RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr,
|
||||
const RdcModuleMgrPtr& module_mgr, const RdcNotificationPtr& notif);
|
||||
const RdcMetricFetcherPtr& metric_fetcher, const RdcModuleMgrPtr& module_mgr,
|
||||
const RdcNotificationPtr& notif);
|
||||
|
||||
private:
|
||||
//!< Helper function to Update the fields_in_table when unwatch tables
|
||||
@@ -104,13 +116,39 @@ class RdcWatchTableImpl : public RdcWatchTable {
|
||||
bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
|
||||
std::string& job_id) const; // NOLINT
|
||||
|
||||
bool is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id,
|
||||
rdc_gpu_group_t& group_id) const;
|
||||
|
||||
rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t* events, uint32_t num_events);
|
||||
//!< The function will be pass as the callback for bulk fetch
|
||||
static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
|
||||
void* user_data);
|
||||
|
||||
rdc_status_t create_health_field_group(unsigned int components,
|
||||
rdc_field_grp_t* field_group_id);
|
||||
//!< output: Whether health incidents are full
|
||||
bool add_health_incident(uint32_t gpu_index,
|
||||
rdc_health_system_t component,
|
||||
rdc_health_result_t health,
|
||||
uint32_t err_code,
|
||||
std::string err_msg,
|
||||
rdc_health_incidents_t* incident,
|
||||
rdc_health_response_t* response);
|
||||
rdc_status_t get_start_end_values(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index,
|
||||
rdc_field_t field,
|
||||
rdc_field_value *start_value,
|
||||
rdc_field_value *end_value);
|
||||
rdc_status_t pcie_check(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index, rdc_health_response_t* response);
|
||||
rdc_status_t xgmi_check(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index, rdc_health_response_t* response);
|
||||
rdc_status_t memory_check(rdc_gpu_group_t group_id,
|
||||
uint32_t gpu_index, rdc_health_response_t* response);
|
||||
|
||||
RdcGroupSettingsPtr group_settings_;
|
||||
RdcCacheManagerPtr cache_mgr_;
|
||||
RdcMetricFetcherPtr metric_fetcher_;
|
||||
RdcModuleMgrPtr rdc_module_mgr_;
|
||||
RdcNotificationPtr notifications_;
|
||||
|
||||
@@ -126,6 +164,9 @@ class RdcWatchTableImpl : public RdcWatchTable {
|
||||
//!< Those settings will only be updated when watching or unwatching.
|
||||
std::map<RdcFieldKey, FieldSettings> fields_to_watch_;
|
||||
|
||||
//!< The health watch table to store the health settings.
|
||||
std::map<uint32_t, HealthWatchTableEntry> health_watch_table_;
|
||||
|
||||
//!< The last clean up time
|
||||
std::atomic<uint64_t> last_cleanup_time_;
|
||||
std::mutex watch_mutex_;
|
||||
|
||||
@@ -190,6 +190,18 @@ service RdcAPI {
|
||||
// rdc_policy_condition_t condition);
|
||||
rpc UnRegisterPolicy(UnRegisterPolicyRequest) returns (UnRegisterPolicyResponse) {}
|
||||
|
||||
// Health API
|
||||
// rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components);
|
||||
rpc SetHealth(SetHealthRequest) returns (SetHealthResponse) {}
|
||||
|
||||
// rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components);
|
||||
rpc GetHealth(GetHealthRequest) returns (GetHealthResponse) {}
|
||||
|
||||
// rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response);
|
||||
rpc CheckHealth(CheckHealthRequest) returns (CheckHealthResponse) {}
|
||||
|
||||
// rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id);
|
||||
rpc ClearHealth(ClearHealthRequest) returns (ClearHealthResponse) {}
|
||||
}
|
||||
|
||||
message Empty {
|
||||
@@ -620,3 +632,55 @@ message UnRegisterPolicyRequest {
|
||||
// Reply of the UnRegisterPolicy RPC.
message UnRegisterPolicyResponse {
|
||||
// Result code of the call; the client feeds it to error_handle().
uint32 status = 1;
|
||||
}
|
||||
|
||||
// Request of the SetHealth RPC (wire form of rdc_health_set).
message SetHealthRequest {
|
||||
// GPU group whose health is to be watched.
uint32 group_id = 1;
|
||||
// Bitmask of health components to watch (tested against the
// RDC_HEALTH_WATCH_* bits by the watch table).
uint32 components = 2;
|
||||
}
|
||||
|
||||
// Reply of the SetHealth RPC.
message SetHealthResponse {
|
||||
// Result code of the call; the client feeds it to error_handle() to
// obtain the rdc_status_t it returns.
uint32 status = 1;
|
||||
}
|
||||
// Request of the GetHealth RPC (wire form of rdc_health_get).
message GetHealthRequest {
|
||||
// GPU group whose watched components are queried.
uint32 group_id = 1;
|
||||
}
|
||||
|
||||
// Reply of the GetHealth RPC.
message GetHealthResponse {
|
||||
// Result code of the call; the client feeds it to error_handle().
uint32 status = 1;
|
||||
// Component bitmask currently watched for the group.
// NOTE(review): the embedded watch table reports 0 when the group has no
// health watch; presumably the server forwards that value - confirm.
uint32 components = 2;
|
||||
}
|
||||
|
||||
// Request of the CheckHealth RPC (wire form of rdc_health_check).
message CheckHealthRequest {
|
||||
// GPU group to run the health check on.
uint32 group_id = 1;
|
||||
}
|
||||
|
||||
// Error description attached to a single health incident.
message HealthDetail {
|
||||
// Human-readable description of the problem.
string msg = 1;
|
||||
// Numeric error code.
// NOTE(review): presumably one of the RDC_FR_* codes the watch table
// passes to add_health_incident() - confirm against the server code.
uint32 code = 2;
|
||||
}
|
||||
|
||||
// One health incident; field names mirror the members that
// add_health_incident() fills in rdc_health_incidents_t.
message HealthIncidents {
|
||||
// Index of the GPU the incident was detected on.
uint32 gpu_index = 1;
|
||||
// Health component that raised the incident.
// NOTE(review): presumably a single RDC_HEALTH_WATCH_* value - confirm.
uint32 component = 2;
|
||||
// Severity of the incident.
// NOTE(review): presumably an rdc_health_result_t value - confirm.
uint32 health = 3;
|
||||
// Error code and message describing the incident.
HealthDetail error = 4;
|
||||
}
|
||||
|
||||
// Aggregated outcome of a health check; field names mirror
// rdc_health_response_t.
message HealthResponse {
|
||||
// Overall health of the group (the watch table keeps the highest
// severity among all recorded incidents here).
uint32 overall_health = 1;
|
||||
// Number of incidents recorded.
// NOTE(review): presumably equal to the length of `incidents` - confirm.
uint32 incidents_count = 2;
|
||||
// The individual incidents.
repeated HealthIncidents incidents = 3;
|
||||
}
|
||||
|
||||
// Reply of the CheckHealth RPC.
message CheckHealthResponse {
|
||||
// Result code of the call.
// NOTE(review): presumably an rdc_status_t value, as for the other
// health replies - confirm.
uint32 status = 1;
|
||||
// Health check outcome for the requested group.
HealthResponse response = 2;
|
||||
}
|
||||
|
||||
// Request of the ClearHealth RPC (wire form of rdc_health_clear).
message ClearHealthRequest {
|
||||
// GPU group whose health watch is to be removed.
uint32 group_id = 1;
|
||||
}
|
||||
|
||||
// Reply of the ClearHealth RPC.
message ClearHealthResponse {
|
||||
// Result code of the call.
// NOTE(review): presumably an rdc_status_t value - confirm.
uint32 status = 1;
|
||||
}
|
||||
|
||||
@@ -163,6 +163,14 @@ class rdc_field_t(c_int):
|
||||
RDC_EVNT_NOTIF_PRE_RESET = 2002
|
||||
RDC_EVNT_NOTIF_POST_RESET = 2003
|
||||
RDC_EVNT_NOTIF_RING_HANG = 2004
|
||||
RDC_HEALTH_XGMI_ERROR = 3000
|
||||
RDC_HEALTH_PCIE_REPLAY_COUNT = 3001
|
||||
RDC_HEALTH_RETIRED_PAGE_NUM = 3002
|
||||
RDC_HEALTH_PENDING_PAGE_NUM = 3003
|
||||
RDC_HEALTH_RETIRED_PAGE_LIMIT = 3004
|
||||
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT = 3005
|
||||
RDC_HEALTH_POWER_THROTTLE_TIME = 3006
|
||||
RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007
|
||||
|
||||
rdc_handle_t = c_void_p
|
||||
rdc_gpu_group_t = c_uint32
|
||||
|
||||
@@ -401,6 +401,45 @@ rdc_field_t get_field_id_from_name(const char* name) {
|
||||
return RDC_FI_INVALID;
|
||||
}
|
||||
|
||||
/**
 * C API entry point: select which health components are watched for a group.
 *
 * @param p_rdc_handle Opaque RDC handle; must point at an amd::rdc::RdcHandler.
 * @param group_id     GPU group the watch applies to.
 * @param components   Bitmask of health components to watch.
 * @return RDC_ST_INVALID_HANDLER for a null handle, otherwise whatever the
 *         handler's rdc_health_set() returns.
 */
rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            unsigned int components) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_health_set(group_id, components);
}
|
||||
|
||||
/**
 * C API entry point: query which health components are watched for a group.
 *
 * @param p_rdc_handle Opaque RDC handle; must point at an amd::rdc::RdcHandler.
 * @param group_id     GPU group to query.
 * @param components   Output: receives the component bitmask.
 * @return RDC_ST_INVALID_HANDLER for a null handle, otherwise whatever the
 *         handler's rdc_health_get() returns.
 */
rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            unsigned int* components) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_health_get(group_id, components);
}
|
||||
|
||||
/**
 * C API entry point: run the health check of a group and collect incidents.
 *
 * @param p_rdc_handle Opaque RDC handle; must point at an amd::rdc::RdcHandler.
 * @param group_id     GPU group to check.
 * @param response     Output: receives overall health and the incident list.
 * @return RDC_ST_INVALID_HANDLER for a null handle, otherwise whatever the
 *         handler's rdc_health_check() returns.
 */
rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                              rdc_health_response_t* response) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_health_check(group_id, response);
}
|
||||
|
||||
/**
 * C API entry point: remove the health watch of a group.
 *
 * @param p_rdc_handle Opaque RDC handle; must point at an amd::rdc::RdcHandler.
 * @param group_id     GPU group whose health watch is removed.
 * @return RDC_ST_INVALID_HANDLER for a null handle, otherwise whatever the
 *         handler's rdc_health_clear() returns.
 */
rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
  if (p_rdc_handle == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* const handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_health_clear(group_id);
}
|
||||
|
||||
char* strncpy_with_null(char* dest, const char* src, size_t n) {
|
||||
if (n == 0) {
|
||||
return dest;
|
||||
|
||||
@@ -449,5 +449,142 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(const char job_id[64],
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
/**
 * Append a sample to the health cache of a group, creating the per-group and
 * per-field containers on first use.
 *
 * @param group_id  GPU group the sample belongs to.
 * @param gpu_index GPU the sample was read from.
 * @param value     Sample to store; its field_id selects the cache slot.
 * @return Always RDC_ST_OK.
 */
rdc_status_t RdcCacheManagerImpl::rdc_health_set(rdc_gpu_group_t group_id, uint32_t gpu_index,
                                                 const rdc_field_value& value) {
  std::lock_guard<std::mutex> guard(cache_mutex_);

  RdcCacheEntry sample;
  sample.last_time = value.ts;
  sample.value = value.value;
  sample.type = value.type;

  // operator[] default-constructs the missing group map / sample vector, which
  // replaces the explicit "find, else insert" ladders.
  const RdcFieldKey key{gpu_index, value.field_id};
  cache_health_[group_id][key].push_back(sample);

  return RDC_ST_OK;
}
|
||||
|
||||
/**
 * Look up cached health samples of one field on one GPU.
 *
 * @param group_id        GPU group whose health cache is searched.
 * @param gpu_index       GPU the samples belong to.
 * @param field_id        Field to look up.
 * @param start_timestamp Lower time bound for the start sample.
 * @param end_timestamp   Upper time bound for the end sample.
 * @param start_value     Optional output: the oldest sample whose timestamp is
 *                        >= start_timestamp.
 * @param end_value       Optional output: the newest sample whose timestamp is
 *                        <= end_timestamp. Only searched when the start lookup
 *                        succeeded or was not requested.
 * @return RDC_ST_BAD_PARAMETER when both outputs are null, RDC_ST_NOT_FOUND
 *         when the group/field has no samples or no sample matches a requested
 *         bound, RDC_ST_OK otherwise.
 */
rdc_status_t RdcCacheManagerImpl::rdc_health_get_values(rdc_gpu_group_t group_id,
                                                        uint32_t gpu_index,
                                                        rdc_field_t field_id,
                                                        uint64_t start_timestamp,
                                                        uint64_t end_timestamp,
                                                        rdc_field_value* start_value,
                                                        rdc_field_value* end_value) {
  if (!start_value && !end_value) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(cache_mutex_);
  auto health_ite = cache_health_.find(group_id);
  if (health_ite == cache_health_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  RdcFieldKey field{gpu_index, field_id};
  auto samples_ite = health_ite->second.find(field);
  if (samples_ite == health_ite->second.end() || samples_ite->second.empty()) {
    return RDC_ST_NOT_FOUND;
  }

  // Bind by reference: the samples are only read, so copying the whole vector
  // (string payloads included) while holding the cache mutex is pure overhead.
  const auto& cache_values = samples_ite->second;

  // Copies one cached sample into a caller-provided rdc_field_value.
  const auto copy_sample = [field_id](const RdcCacheEntry& sample, rdc_field_value* out) {
    out->field_id = field_id;
    out->ts = sample.last_time;
    out->type = sample.type;
    if (sample.type == STRING) {
      strncpy_with_null(out->value.str, sample.value.str, RDC_MAX_STR_LENGTH);
    } else {
      out->value.l_int = sample.value.l_int;
    }
  };

  rdc_status_t result = RDC_ST_OK;
  if (start_value != nullptr) {
    // Oldest sample inside the window.
    result = RDC_ST_NOT_FOUND;
    for (auto entry = cache_values.begin(); entry != cache_values.end(); ++entry) {
      if (entry->last_time >= start_timestamp) {
        copy_sample(*entry, start_value);
        result = RDC_ST_OK;
        break;
      }
    }
  }

  if ((RDC_ST_OK == result) && (end_value != nullptr)) {
    // Newest sample inside the window, hence the reverse scan.
    result = RDC_ST_NOT_FOUND;
    for (auto entry = cache_values.rbegin(); entry != cache_values.rend(); ++entry) {
      if (entry->last_time <= end_timestamp) {
        copy_sample(*entry, end_value);
        result = RDC_ST_OK;
        break;
      }
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Drop every cached health sample of a group.
 *
 * @param group_id GPU group whose health cache is removed; an unknown group
 *                 is not an error.
 * @return Always RDC_ST_OK.
 */
rdc_status_t RdcCacheManagerImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
  {
    std::lock_guard<std::mutex> guard(cache_mutex_);
    cache_health_.erase(group_id);
  }
  return RDC_ST_OK;
}
|
||||
|
||||
/**
 * Append a new sample to an existing health cache slot, discarding the oldest
 * samples so that at most HEALTH_MAX_KEEP_SAMPLES remain after the append.
 *
 * Unlike rdc_health_set(), this never creates a slot: the group and field must
 * already be present in the health cache.
 *
 * @param group_id  GPU group the sample belongs to.
 * @param gpu_index GPU the sample was read from.
 * @param value     Sample to store; its field_id selects the cache slot.
 * @return RDC_ST_NOT_FOUND when the group or field has no cache slot,
 *         RDC_ST_OK otherwise.
 */
rdc_status_t RdcCacheManagerImpl::rdc_update_health_stats(rdc_gpu_group_t group_id,
                                                          uint32_t gpu_index,
                                                          const rdc_field_value& value) {
  std::lock_guard<std::mutex> guard(cache_mutex_);
  auto health_ite = cache_health_.find(group_id);
  if (health_ite == cache_health_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  RdcFieldKey field{gpu_index, value.field_id};
  auto samples_ite = health_ite->second.find(field);
  if (samples_ite == health_ite->second.end()) {
    return RDC_ST_NOT_FOUND;
  }

  auto& cache_values = samples_ite->second;

  // Leave room for the sample appended below. The arithmetic stays in size_t:
  // the previous "size - limit + 1" narrowed an unsigned wrap-around into an
  // int, and a zero limit would have erased one element past end().
  const size_t keep = (HEALTH_MAX_KEEP_SAMPLES > 0)
                          ? static_cast<size_t>(HEALTH_MAX_KEEP_SAMPLES) - 1
                          : 0;
  if (cache_values.size() > keep) {
    using diff_t = std::vector<RdcCacheEntry>::difference_type;
    const auto item_remove = static_cast<diff_t>(cache_values.size() - keep);
    cache_values.erase(cache_values.begin(), cache_values.begin() + item_remove);
  }

  RdcCacheEntry entry;
  entry.last_time = value.ts;
  entry.value = value.value;
  entry.type = value.type;
  cache_values.push_back(entry);

  return RDC_ST_OK;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -79,7 +79,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
|
||||
metric_fetcher_(new RdcMetricFetcherImpl()),
|
||||
rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
|
||||
rdc_notif_(new RdcNotificationImpl()),
|
||||
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)),
|
||||
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_, rdc_module_mgr_, rdc_notif_)),
|
||||
metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)),
|
||||
policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)) {
|
||||
if (mode == RDC_OPERATION_MODE_AUTO) {
|
||||
@@ -460,5 +460,38 @@ rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id)
|
||||
return policy_->rdc_policy_unregister(group_id);
|
||||
}
|
||||
|
||||
// Health API
|
||||
/**
 * Embedded-mode health set: validates the mask and forwards to the watch table.
 *
 * @param group_id   GPU group the watch applies to.
 * @param components Bitmask of health components; must not be 0.
 * @return RDC_ST_BAD_PARAMETER for an empty mask, otherwise the watch table's
 *         result.
 */
rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id,
                                                unsigned int components) {
  const bool has_components = (components != 0);
  return has_components ? watch_table_->rdc_health_set(group_id, components)
                        : RDC_ST_BAD_PARAMETER;
}
|
||||
|
||||
/**
 * Embedded-mode health get: validates the output pointer and forwards to the
 * watch table.
 *
 * @param group_id   GPU group to query.
 * @param components Output: receives the component bitmask; must not be null.
 * @return RDC_ST_BAD_PARAMETER for a null output, otherwise the watch table's
 *         result.
 */
rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id,
                                                unsigned int* components) {
  return (nullptr == components) ? RDC_ST_BAD_PARAMETER
                                 : watch_table_->rdc_health_get(group_id, components);
}
|
||||
|
||||
/**
 * Embedded-mode health check: validates the output pointer and forwards to
 * the watch table.
 *
 * @param group_id GPU group to check.
 * @param response Output: receives the check outcome; must not be null.
 * @return RDC_ST_BAD_PARAMETER for a null output, otherwise the watch table's
 *         result.
 */
rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id,
                                                  rdc_health_response_t* response) {
  return (nullptr == response) ? RDC_ST_BAD_PARAMETER
                               : watch_table_->rdc_health_check(group_id, response);
}
|
||||
|
||||
/**
 * Embedded-mode health clear: forwards straight to the watch table.
 *
 * @param group_id GPU group whose health watch is removed.
 * @return The watch table's result.
 */
rdc_status_t RdcEmbeddedHandler::rdc_health_clear(rdc_gpu_group_t group_id) {
  const rdc_status_t status = watch_table_->rdc_health_clear(group_id);
  return status;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -804,6 +804,66 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
read_gpu_metrics_uint64_t();
|
||||
break;
|
||||
|
||||
case RDC_HEALTH_XGMI_ERROR: {
|
||||
amdsmi_xgmi_status_t status;
|
||||
ret = amdsmi_gpu_xgmi_error_status(processor_handle, &status);
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(status);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case RDC_HEALTH_PCIE_REPLAY_COUNT: {
|
||||
amdsmi_pcie_info_t pcie_info;
|
||||
ret = amdsmi_get_pcie_info(processor_handle, &pcie_info);
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(pcie_info.pcie_metric.pcie_replay_count);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case RDC_HEALTH_RETIRED_PAGE_NUM:
|
||||
case RDC_HEALTH_PENDING_PAGE_NUM: {
|
||||
uint32_t num_pages = 0;
|
||||
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, nullptr);
|
||||
if (AMDSMI_STATUS_SUCCESS == ret) {
|
||||
if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) {
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
value->value.l_int = static_cast<int64_t>(num_pages);
|
||||
break;
|
||||
}
|
||||
|
||||
if ((0 < num_pages) &&
|
||||
(RDC_HEALTH_PENDING_PAGE_NUM == field_id)) {
|
||||
std::vector<amdsmi_retired_page_record_t> bad_page_info(num_pages);
|
||||
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages,
|
||||
bad_page_info.data());
|
||||
value->status = Smi2RdcError(ret);
|
||||
value->type = INTEGER;
|
||||
if (AMDSMI_STATUS_SUCCESS == ret) {
|
||||
uint64_t pending_page_num = 0;
|
||||
for (uint32_t i=0; i < num_pages; i++) {
|
||||
if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status)
|
||||
pending_page_num++;
|
||||
}
|
||||
|
||||
value->value.l_int = static_cast<int64_t>(pending_page_num);
|
||||
}
|
||||
}
|
||||
} else
|
||||
value->status = Smi2RdcError(ret);
|
||||
break;
|
||||
}
|
||||
|
||||
case RDC_HEALTH_RETIRED_PAGE_LIMIT:
|
||||
case RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT:
|
||||
case RDC_HEALTH_POWER_THROTTLE_TIME: //gpu_metrics 1.6
|
||||
case RDC_HEALTH_THERMAL_THROTTLE_TIME: //gpu_metrics 1.6
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -180,6 +180,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
|
||||
RDC_EVNT_XGMI_1_THRPUT, RDC_EVNT_XGMI_2_THRPUT, RDC_EVNT_XGMI_3_THRPUT,
|
||||
RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID,
|
||||
RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY,
|
||||
RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM,
|
||||
RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,
|
||||
RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
|
||||
};
|
||||
std::copy(fields.begin(), fields.end(), field_ids);
|
||||
*field_count = fields.size();
|
||||
|
||||
@@ -41,10 +41,12 @@ namespace rdc {
|
||||
|
||||
RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
|
||||
const RdcCacheManagerPtr& cache_mgr,
|
||||
const RdcMetricFetcherPtr& metric_fetcher,
|
||||
const RdcModuleMgrPtr& module_mgr,
|
||||
const RdcNotificationPtr& notif)
|
||||
: group_settings_(group_settings),
|
||||
cache_mgr_(cache_mgr),
|
||||
metric_fetcher_(metric_fetcher),
|
||||
rdc_module_mgr_(module_mgr),
|
||||
notifications_(notif),
|
||||
last_cleanup_time_(0) {}
|
||||
@@ -373,6 +375,423 @@ rdc_status_t RdcWatchTableImpl::rdc_field_unwatch(rdc_gpu_group_t group_id,
|
||||
return update_field_in_table_when_unwatch(ite->first);
|
||||
}
|
||||
|
||||
/**
 * Create the field group that backs a health watch.
 *
 * Each requested component contributes its fields in a fixed order (PCIe,
 * XGMI, memory, thermal, power). The InfoROM component currently contributes
 * no field.
 *
 * @param components     Bitmask of RDC_HEALTH_WATCH_* components.
 * @param field_group_id Output: id of the created field group.
 * @return RDC_ST_BAD_PARAMETER when the mask selects no field, otherwise the
 *         result of rdc_group_field_create().
 */
rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int components,
                                                          rdc_field_grp_t* field_group_id) {
  struct ComponentFields {
    unsigned int mask;
    std::vector<rdc_field_t> fields;
  };

  // Component -> fields, listed in the order the fields enter the group.
  const ComponentFields component_fields[] = {
      {RDC_HEALTH_WATCH_PCIE, {RDC_HEALTH_PCIE_REPLAY_COUNT}},
      {RDC_HEALTH_WATCH_XGMI, {RDC_HEALTH_XGMI_ERROR}},
      {RDC_HEALTH_WATCH_MEM,
       {RDC_FI_ECC_UNCORRECT_TOTAL, RDC_HEALTH_RETIRED_PAGE_NUM, RDC_HEALTH_PENDING_PAGE_NUM,
        RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT}},
      {RDC_HEALTH_WATCH_THERMAL, {RDC_HEALTH_THERMAL_THROTTLE_TIME}},
      {RDC_HEALTH_WATCH_POWER, {RDC_HEALTH_POWER_THROTTLE_TIME}},
  };

  std::vector<rdc_field_t> field_ids;
  for (const auto& entry : component_fields) {
    if ((components & entry.mask) == 0) {
      continue;
    }
    for (const rdc_field_t id : entry.fields) {
      field_ids.push_back(id);
    }
  }

  if (field_ids.empty()) {
    RDC_LOG(RDC_ERROR, "Fail to health set. The components must contain at least one watch.");
    return RDC_ST_BAD_PARAMETER;
  }

  const std::string field_group_name("health-field-group");
  return group_settings_->rdc_group_field_create(field_ids.size(), field_ids.data(),
                                                 field_group_name.c_str(), field_group_id);
}
|
||||
|
||||
/**
 * Start (or restart) the health watch of a group.
 *
 * Steps: drop the previous watch of the group, create a field group for the
 * requested components, register it in the health watch table, seed the health
 * cache with the current field values and finally watch the fields once per
 * second.
 *
 * @param group_id   GPU group to watch.
 * @param components Bitmask of RDC_HEALTH_WATCH_* components.
 * @return The first failure of field-group creation, field resolution or
 *         rdc_field_watch(); RDC_ST_OK otherwise. A failure while seeding the
 *         cache is logged but does not fail the call.
 */
rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
                                               unsigned int components) {
  // Remove the old health watch of the same group. The status is ignored on
  // purpose: RDC_ST_NOT_FOUND is the normal outcome of a first-time set.
  rdc_health_clear(group_id);

  // Create a field group based on the components.
  rdc_field_grp_t field_group_id;
  rdc_status_t result = create_health_field_group(components, &field_group_id);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Resolve the (gpu, field) keys of the new group.
  std::vector<RdcFieldKey> fields_in_watch;
  result = get_fields_from_group(group_id, field_group_id, fields_in_watch);
  if (result != RDC_ST_OK) {
    // Do not leak the field group that was just created.
    group_settings_->rdc_group_field_destroy(field_group_id);
    return result;
  }

  // Register in the health watch table.
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    HealthWatchTableEntry hentry{components, field_group_id, fields_in_watch};
    // erase + insert so that an entry left behind by a failed clear cannot
    // shadow the new settings (insert alone is a no-op on an existing key).
    health_watch_table_.erase(group_id);
    health_watch_table_.insert({group_id, hentry});
  }

  // Seed the health cache with the current values; stops at the first failure.
  for (const auto& field : fields_in_watch) {
    rdc_field_value value = {};
    rdc_status_t seed = metric_fetcher_->fetch_smi_field(field.first, field.second, &value);
    if (seed == RDC_ST_OK) {
      seed = cache_mgr_->rdc_health_set(group_id, field.first, value);
    }
    if (seed != RDC_ST_OK) {
      RDC_LOG(RDC_ERROR, "Fail to set initial health value of gpu: "
                             << field.first << " field: " << field.second
                             << ". Return: " << seed);
      break;
    }
  }

  // Start to watch the fields and update them once per second.
  result = rdc_field_watch(group_id, field_group_id, 1000000, 0, 0);
  if (result != RDC_ST_OK) {
    // Roll back: a later rdc_health_clear() could not unwatch fields that were
    // never watched, so the table entry, cache and field group would linger.
    {
      std::lock_guard<std::mutex> guard(watch_mutex_);
      health_watch_table_.erase(group_id);
    }
    cache_mgr_->rdc_health_clear(group_id);
    group_settings_->rdc_group_field_destroy(field_group_id);
  }

  return result;
}
|
||||
|
||||
/**
 * Report which health components are watched for a group.
 *
 * @param group_id   GPU group to query.
 * @param components Output: the watched component bitmask, or 0 when the
 *                   group has no health watch.
 * @return RDC_ST_BAD_PARAMETER for a null output, RDC_ST_OK otherwise.
 */
rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id,
                                               unsigned int* components) {
  if (components == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(watch_mutex_);
  const auto entry = health_watch_table_.find(group_id);
  const bool watched = (entry != health_watch_table_.end());
  *components = watched ? entry->second.components : 0;

  return RDC_ST_OK;
}
|
||||
|
||||
bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
|
||||
rdc_health_system_t component,
|
||||
rdc_health_result_t health,
|
||||
uint32_t err_code,
|
||||
std::string err_msg,
|
||||
rdc_health_incidents_t* incident,
|
||||
rdc_health_response_t* response) {
|
||||
bool result = false;
|
||||
|
||||
incident->gpu_index = gpu_index;
|
||||
incident->component = component;
|
||||
incident->health = health;
|
||||
incident->error.code = err_code;
|
||||
strncpy_with_null(incident->error.msg, err_msg.c_str(), MAX_HEALTH_MSG_LENGTH);
|
||||
|
||||
if (incident->health > response->overall_health)
|
||||
response->overall_health = incident->health;
|
||||
response->incidents_count++;
|
||||
if (response->incidents_count >= HEALTH_MAX_ERROR_ITEMS) {
|
||||
RDC_LOG(RDC_INFO, "Health incidents are full!");
|
||||
result = true;
|
||||
}
|
||||
|
||||
return (result);
|
||||
}
|
||||
|
||||
/**
 * Fetch the two values a health check compares for one field on one GPU.
 *
 * The start value is the oldest cached health sample that is at most one
 * minute old; the end value is read live through the metric fetcher.
 *
 * @param group_id    GPU group whose health cache is consulted.
 * @param gpu_index   GPU to read.
 * @param field       Field to read.
 * @param start_value Output: cached value at the start of the window.
 * @param end_value   Output: current value.
 * @return RDC_ST_BAD_PARAMETER when an output is null, otherwise the first
 *         failing status of the cache lookup or the live fetch, or RDC_ST_OK.
 */
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
                                                     uint32_t gpu_index,
                                                     rdc_field_t field,
                                                     rdc_field_value* start_value,
                                                     rdc_field_value* end_value) {
  if (start_value == nullptr || end_value == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Window start: one minute ago, scaled by 1000 from the time() seconds.
  const uint64_t start_timestamp = static_cast<uint64_t>(time(nullptr) - 60) * 1000;

  // Only the start value comes from the cache, so no end bound is requested.
  rdc_status_t status = cache_mgr_->rdc_health_get_values(group_id, gpu_index, field,
                                                          start_timestamp, 0,
                                                          start_value, nullptr);
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << status);
    return status;
  }

  // The end value is the live reading.
  status = metric_fetcher_->fetch_smi_field(gpu_index, field, end_value);
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " current data. Return: " << status);
  }

  return status;
}
|
||||
|
||||
/**
 * PCIe health check of one GPU: warns when the PCIe replay counter grew by
 * more than PCIE_MAX_REPLAYS_PERMIN over the last minute.
 *
 * @param group_id  GPU group the GPU is watched under.
 * @param gpu_index GPU to check.
 * @param response  Response that receives the incident, if any.
 * @return The failure of get_start_end_values() if it fails, RDC_ST_MAX_LIMIT
 *         when the recorded incident filled the response, RDC_ST_OK otherwise.
 */
rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
                                           uint32_t gpu_index,
                                           rdc_health_response_t* response) {
  // Get the field's start/end values.
  rdc_field_value start = {}, end = {};
  rdc_status_t result = get_start_end_values(group_id, gpu_index,
                                             RDC_HEALTH_PCIE_REPLAY_COUNT,
                                             &start, &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Compute the growth in signed arithmetic and clamp it at zero: if the
  // counter went backwards (e.g. it was reset), a direct unsigned subtraction
  // would wrap to a huge value and raise a bogus incident.
  const int64_t delta = end.value.l_int - start.value.l_int;
  const uint64_t pcie_replay_count = (delta > 0) ? static_cast<uint64_t>(delta) : 0;

  if (pcie_replay_count > PCIE_MAX_REPLAYS_PERMIN) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    std::string err_msg = "Detected ";
    err_msg += std::to_string(pcie_replay_count);
    err_msg += " PCIe replays per minute exceeding the max limit ";
    err_msg += std::to_string(PCIE_MAX_REPLAYS_PERMIN);
    err_msg += ".";

    // Add the incident; stop when the response is full.
    if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_PCIE, RDC_HEALTH_RESULT_WARN,
                            RDC_FR_PCI_REPLAY_RATE, err_msg, incident, response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  return RDC_ST_OK;
}
|
||||
|
||||
/**
 * XGMI health check of one GPU: fails when the current XGMI status is anything
 * other than AMDSMI_XGMI_STATUS_NO_ERRORS.
 *
 * Only the live (end) value is evaluated; the cached start value is fetched
 * as part of get_start_end_values() but not used.
 *
 * @param group_id  GPU group the GPU is watched under.
 * @param gpu_index GPU to check.
 * @param response  Response that receives the incident, if any.
 * @return The failure of get_start_end_values() if it fails, RDC_ST_MAX_LIMIT
 *         when the recorded incident filled the response, RDC_ST_OK otherwise.
 */
rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
                                           uint32_t gpu_index,
                                           rdc_health_response_t* response) {
  // Get the field's start/end values.
  rdc_field_value start = {}, end = {};
  rdc_status_t result = get_start_end_values(group_id, gpu_index,
                                             RDC_HEALTH_XGMI_ERROR,
                                             &start, &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  const amdsmi_xgmi_status_t status = static_cast<amdsmi_xgmi_status_t>(end.value.l_int);
  if (AMDSMI_XGMI_STATUS_NO_ERRORS != status) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    // "Detected " already ends with a space; the suffixes must not start with
    // another one or the message contains a double space.
    uint32_t err_code = RDC_FR_XGMI_MULTIPLE_ERROR;
    std::string err_msg = "Detected ";
    if (AMDSMI_XGMI_STATUS_ERROR == status) {
      err_msg += "a single XGMI error";
      err_code = RDC_FR_XGMI_SINGLE_ERROR;
    } else {
      err_msg += "multiple XGMI errors";
    }
    err_msg += ".";

    // Add the incident; stop when the response is full.
    if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_XGMI, RDC_HEALTH_RESULT_FAIL,
                            err_code, err_msg, incident, response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  return RDC_ST_OK;
}
|
||||
|
||||
/**
 * Memory health check of one GPU.
 *
 * Two conditions are evaluated over the last minute:
 *  - growth of the uncorrectable ECC error count  -> FAIL incident;
 *  - growth of the pending retired page count     -> WARN incident.
 * The retired-page and uncorrectable-page limits are not checked yet.
 *
 * @param group_id  GPU group the GPU is watched under.
 * @param gpu_index GPU to check.
 * @param response  Response that receives the incidents, if any.
 * @return The failure of get_start_end_values() if it fails, RDC_ST_MAX_LIMIT
 *         when a recorded incident filled the response, RDC_ST_OK otherwise.
 */
rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
                                             uint32_t gpu_index,
                                             rdc_health_response_t* response) {
  // Growth between two samples, computed in signed arithmetic and clamped at
  // zero. A direct unsigned subtraction wraps to a huge value when the counter
  // shrinks (reset counter, pending pages that got retired) and would raise a
  // bogus incident.
  const auto growth = [](const rdc_field_value& first, const rdc_field_value& last) {
    const int64_t delta = last.value.l_int - first.value.l_int;
    return (delta > 0) ? static_cast<uint64_t>(delta) : static_cast<uint64_t>(0);
  };

  // Uncorrectable ECC errors.
  rdc_field_value start = {}, end = {};
  rdc_status_t result = get_start_end_values(group_id, gpu_index,
                                             RDC_FI_ECC_UNCORRECT_TOTAL,
                                             &start, &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  const uint64_t ecc_uncorrectable_count = growth(start, end);
  if (ecc_uncorrectable_count > 0) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    std::string err_msg = "Detected ";
    err_msg += std::to_string(ecc_uncorrectable_count);
    err_msg += " uncorrectable ECC error(s) in the last minute.";

    if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_FAIL,
                            RDC_FR_ECC_UNCORRECTABLE_DETECTED, err_msg, incident, response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  // Pending retired pages.
  result = get_start_end_values(group_id, gpu_index,
                                RDC_HEALTH_PENDING_PAGE_NUM,
                                &start, &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  const uint64_t num_pages = growth(start, end);
  if (num_pages > 0) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    std::string err_msg = "Detected ";
    err_msg += std::to_string(num_pages);
    err_msg += " pending retired page(s).";

    if (add_health_incident(gpu_index, RDC_HEALTH_WATCH_MEM, RDC_HEALTH_RESULT_WARN,
                            RDC_FR_PENDING_PAGE_RETIREMENTS, err_msg, incident, response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  // TODO: RDC_FR_RETIRED_PAGES_LIMIT
  // TODO: RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT

  return RDC_ST_OK;
}
|
||||
|
||||
/**
 * Run the health check of a group and collect the incidents.
 *
 * The watched fields are first refreshed in the health cache, then every GPU
 * of the group is checked for each watched component (PCIe, XGMI, memory).
 * InfoROM, thermal and power checks are not implemented and make the call
 * return RDC_ST_NOT_SUPPORTED when one of them is watched and the group has
 * at least one GPU.
 *
 * @param group_id GPU group to check.
 * @param response Output: overall health and incident list. It is reset to
 *                 "PASS, no incidents" before anything can fail, so it is well
 *                 defined on every return path.
 * @return RDC_ST_BAD_PARAMETER for a null response, RDC_ST_NOT_FOUND when the
 *         group has no health watch, the failure of rdc_group_gpu_get_info(),
 *         RDC_ST_MAX_LIMIT when the incident list is full,
 *         RDC_ST_NOT_SUPPORTED as described above, RDC_ST_OK otherwise.
 *         Other failures of the individual checks are not propagated.
 */
rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
                                                 rdc_health_response_t* response) {
  if (nullptr == response) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Reset the response up front: callers that read it after a failed call must
  // not see an uninitialized incident count.
  response->overall_health = RDC_HEALTH_RESULT_PASS;
  response->incidents_count = 0;

  // Snapshot the watch settings under the lock.
  unsigned int components = 0;
  std::vector<RdcFieldKey> fields_in_watch;
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    auto health = health_watch_table_.find(group_id);
    if (health == health_watch_table_.end()) {
      return RDC_ST_NOT_FOUND;
    }
    components = health->second.components;
    fields_in_watch = health->second.fields;
  }

  rdc_group_info_t ginfo;
  rdc_status_t result = group_settings_->rdc_group_gpu_get_info(group_id, &ginfo);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Refresh the cache with the current values; stops at the first failure,
  // which is not treated as fatal for the check itself.
  for (const auto& field : fields_in_watch) {
    rdc_field_value value = {};
    result = metric_fetcher_->fetch_smi_field(field.first, field.second, &value);
    if (result != RDC_ST_OK) {
      break;
    }

    result = cache_mgr_->rdc_update_health_stats(group_id, field.first, value);
    if (result != RDC_ST_OK) {
      break;
    }
  }

  for (uint32_t gindex = 0; gindex < ginfo.count; gindex++) {
    const uint32_t gpu_index = ginfo.entity_ids[gindex];

    // PCIe
    if (components & RDC_HEALTH_WATCH_PCIE) {
      result = pcie_check(group_id, gpu_index, response);
      if (result == RDC_ST_MAX_LIMIT) {
        return result;
      }
    }

    // XGMI
    if (components & RDC_HEALTH_WATCH_XGMI) {
      result = xgmi_check(group_id, gpu_index, response);
      if (result == RDC_ST_MAX_LIMIT) {
        return result;
      }
    }

    // Memory
    if (components & RDC_HEALTH_WATCH_MEM) {
      result = memory_check(group_id, gpu_index, response);
      if (result == RDC_ST_MAX_LIMIT) {
        return result;
      }
    }

    // InfoROM: not implemented yet.
    if (components & RDC_HEALTH_WATCH_INFOROM) {
      return RDC_ST_NOT_SUPPORTED;
    }

    // Thermal: not implemented yet.
    if (components & RDC_HEALTH_WATCH_THERMAL) {
      return RDC_ST_NOT_SUPPORTED;
    }

    // Power: not implemented yet.
    if (components & RDC_HEALTH_WATCH_POWER) {
      return RDC_ST_NOT_SUPPORTED;
    }
  }

  return RDC_ST_OK;
}
|
||||
|
||||
/**
 * Remove the health watch of a group: unwatch its fields, destroy its field
 * group, drop its watch-table entry and clear its health cache.
 *
 * @param group_id GPU group whose health watch is removed.
 * @return RDC_ST_NOT_FOUND when the group has no health watch, the failure of
 *         rdc_field_unwatch() if it fails (nothing else is torn down then),
 *         otherwise the status of clearing the health cache.
 */
rdc_status_t RdcWatchTableImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
  rdc_field_grp_t field_group_id;

  // Look up the field group under the lock; the lock is released before
  // calling rdc_field_unwatch().
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    auto health = health_watch_table_.find(group_id);
    if (health == health_watch_table_.end()) {
      return RDC_ST_NOT_FOUND;
    }
    field_group_id = health->second.field_group_id;
  }

  // First unwatch the old fields.
  rdc_status_t result = rdc_field_unwatch(group_id, field_group_id);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Destroy the old field group (best effort, status intentionally ignored).
  group_settings_->rdc_group_field_destroy(field_group_id);

  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    health_watch_table_.erase(group_id);
  }

  // Propagate the cache status instead of storing it and returning a
  // hard-coded RDC_ST_OK.
  return cache_mgr_->rdc_health_clear(group_id);
}
|
||||
|
||||
bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
|
||||
std::string& job_id) const {
|
||||
RdcFieldKey key{gpu_index, field_id};
|
||||
@@ -388,6 +807,21 @@ bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field
|
||||
return false;
|
||||
}
|
||||
|
||||
bool RdcWatchTableImpl::is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id,
|
||||
rdc_gpu_group_t& group_id) const {
|
||||
RdcFieldKey key{gpu_index, field_id};
|
||||
|
||||
for (auto ite = health_watch_table_.begin(); ite != health_watch_table_.end(); ite++) {
|
||||
auto& fields = ite->second.fields;
|
||||
if (std::find(fields.begin(), fields.end(), key) != fields.end()) {
|
||||
group_id = ite->first;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
|
||||
void* user_data) {
|
||||
if (values == nullptr || user_data == nullptr) {
|
||||
@@ -421,6 +855,12 @@ rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uin
|
||||
if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) {
|
||||
watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, job_id, values[i].field_value);
|
||||
}
|
||||
|
||||
// Update the health stats cache
|
||||
rdc_gpu_group_t group_id;
|
||||
if (watchTable->is_health_watch_field(gpu_index, field_id, group_id)) {
|
||||
watchTable->cache_mgr_->rdc_update_health_stats(group_id, gpu_index, values[i].field_value);
|
||||
}
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
@@ -492,6 +932,12 @@ rdc_status_t RdcWatchTableImpl::rdc_notif_update_cache(rdc_evnt_notification_t*
|
||||
if (is_job_watch_field(gpu_index, field_id, job_id)) {
|
||||
cache_mgr_->rdc_update_job_stats(gpu_index, job_id, events[i].field);
|
||||
}
|
||||
|
||||
// Update the health stats cache
|
||||
rdc_gpu_group_t group_id;
|
||||
if (is_health_watch_field(gpu_index, field_id, group_id)) {
|
||||
cache_mgr_->rdc_update_health_stats(group_id, gpu_index, events[i].field);
|
||||
}
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
@@ -549,6 +995,7 @@ void RdcWatchTableImpl::debug_status() {
|
||||
RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size()
|
||||
<< " watch_table_:" << watch_table_.size()
|
||||
<< " job_watch_table_:" << job_watch_table_.size()
|
||||
<< " health_watch_table_:" << health_watch_table_.size()
|
||||
<< " cache stats:" << cache_mgr_->get_cache_stats());
|
||||
|
||||
if (watch_table_.size() > 0) {
|
||||
@@ -575,6 +1022,18 @@ void RdcWatchTableImpl::debug_status() {
|
||||
jite->first << ": " << jite->second.group_id << " fields : " << strstream.str());
|
||||
}
|
||||
|
||||
if (health_watch_table_.size() > 0) {
|
||||
RDC_LOG(RDC_DEBUG, "health watch table details: ");
|
||||
}
|
||||
for (auto hite = health_watch_table_.begin(); hite != health_watch_table_.end(); hite++) {
|
||||
std::stringstream strstream;
|
||||
for (const auto& p : hite->second.fields) {
|
||||
strstream << "<" << p.first << "," << p.second << "> ";
|
||||
}
|
||||
RDC_LOG(RDC_DEBUG,
|
||||
"group id : " << hite->first << " components : " << hite->second.components << " fields : " << strstream.str());
|
||||
}
|
||||
|
||||
if (fields_to_watch_.size() > 0) {
|
||||
RDC_LOG(RDC_DEBUG, "fields to watch details:");
|
||||
}
|
||||
|
||||
@@ -870,5 +870,87 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_unregister(rdc_gpu_group_t group_i
|
||||
return error_handle(status, reply.status());
|
||||
}
|
||||
|
||||
// Health RdcAPI
|
||||
// Sends a SetHealth request for a GPU group through the gRPC stub.
//   group_id   - copied into the request's group_id field.
//   components - copied into the request's components field.
// Returns the status error_handle() derives from the gRPC call status and the
// status value carried in the reply.
rdc_status_t RdcStandaloneHandler::rdc_health_set(rdc_gpu_group_t group_id,
                                                  unsigned int components) {
  ::rdc::SetHealthRequest rpc_request;
  rpc_request.set_group_id(group_id);
  rpc_request.set_components(components);

  ::rdc::SetHealthResponse rpc_reply;
  ::grpc::ClientContext rpc_context;
  ::grpc::Status rpc_status = stub_->SetHealth(&rpc_context, rpc_request, &rpc_reply);

  return error_handle(rpc_status, rpc_reply.status());
}
|
||||
|
||||
// Sends a GetHealth request for a GPU group through the gRPC stub.
//   group_id   - copied into the request's group_id field.
//   components - out-parameter; receives the reply's components value on
//                success and is left untouched on failure.
// Returns RDC_ST_BAD_PARAMETER when components is null, otherwise the status
// computed by error_handle() from the gRPC status and the reply status.
rdc_status_t RdcStandaloneHandler::rdc_health_get(rdc_gpu_group_t group_id,
                                                  unsigned int* components) {
  if (components == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::GetHealthRequest rpc_request;
  rpc_request.set_group_id(group_id);

  ::rdc::GetHealthResponse rpc_reply;
  ::grpc::ClientContext rpc_context;
  ::grpc::Status rpc_status = stub_->GetHealth(&rpc_context, rpc_request, &rpc_reply);

  rdc_status_t outcome = error_handle(rpc_status, rpc_reply.status());
  if (outcome == RDC_ST_OK) {
    *components = rpc_reply.components();
  }
  return outcome;
}
|
||||
|
||||
// Sends a CheckHealth request for a GPU group through the gRPC stub and
// converts the reply into the caller-provided rdc_health_response_t.
//   group_id - copied into the request's group_id field.
//   response - out-parameter; filled only when the call succeeds.
// Returns RDC_ST_BAD_PARAMETER when response is null, the error computed by
// error_handle() when the call fails, and RDC_ST_OK otherwise.
rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id,
                                                    rdc_health_response_t* response) {
  if (!response) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::CheckHealthRequest request;
  ::rdc::CheckHealthResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);
  ::grpc::Status status = stub_->CheckHealth(&context, request, &reply);
  rdc_status_t err_status = error_handle(status, reply.status());
  if (err_status != RDC_ST_OK) return err_status;

  // Bind by reference: copying the message would duplicate every incident,
  // including its error text, only to read it once.
  const auto& res = reply.response();
  response->overall_health = static_cast<rdc_health_result_t>(res.overall_health());
  response->incidents_count = res.incidents_count();

  // NOTE(review): the loop trusts the reply to carry no more incidents than
  // response->incidents can hold; the capacity of that array is not visible
  // here - confirm and clamp if the peer is not trusted.
  for (int i = 0; i < res.incidents_size(); i++) {
    const ::rdc::HealthIncidents& result = res.incidents(i);
    rdc_health_incidents_t& to_result = response->incidents[i];

    to_result.gpu_index = result.gpu_index();
    to_result.component = static_cast<rdc_health_system_t>(result.component());
    to_result.health = static_cast<rdc_health_result_t>(result.health());

    // Copy the error code and its message text.
    to_result.error.code = result.error().code();
    strncpy_with_null(to_result.error.msg, result.error().msg().c_str(), MAX_HEALTH_MSG_LENGTH);
  }

  return RDC_ST_OK;
}
|
||||
|
||||
// Sends a ClearHealth request for a GPU group through the gRPC stub.
//   group_id - copied into the request's group_id field.
// Returns the status error_handle() derives from the gRPC call status and the
// status value carried in the reply.
rdc_status_t RdcStandaloneHandler::rdc_health_clear(rdc_gpu_group_t group_id) {
  ::rdc::ClearHealthRequest rpc_request;
  rpc_request.set_group_id(group_id);

  ::rdc::ClearHealthResponse rpc_reply;
  ::grpc::ClientContext rpc_context;
  ::grpc::Status rpc_status = stub_->ClearHealth(&rpc_context, rpc_request, &rpc_reply);

  // Success and failure are both reported by the translated status itself.
  return error_handle(rpc_status, rpc_reply.status());
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
@@ -68,6 +68,7 @@ set(RDCI_SRC_LIST
|
||||
"${SRC_DIR}/RdciGroupSubSystem.cc"
|
||||
"${SRC_DIR}/RdciStatsSubSystem.cc"
|
||||
"${SRC_DIR}/RdciPolicySubSystem.cc"
|
||||
"${SRC_DIR}/RdciHealthSubSystem.cc"
|
||||
"${SRC_DIR}/RdciSubSystem.cc"
|
||||
"${SRC_DIR}/rdci.cc")
|
||||
message("RDCI_SRC_LIST=${RDCI_SRC_LIST}")
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
|
||||
#define RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
|
||||
#include <signal.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "RdciSubSystem.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdciHealthSubSystem : public RdciSubSystem {
|
||||
public:
|
||||
RdciHealthSubSystem();
|
||||
~RdciHealthSubSystem();
|
||||
void parse_cmd_opts(int argc, char** argv) override;
|
||||
void process() override;
|
||||
|
||||
private:
|
||||
void show_help() const;
|
||||
|
||||
void get_watches() const;
|
||||
void set_watches() const;
|
||||
void health_check() const;
|
||||
void health_clear() const;
|
||||
|
||||
std::string health_string(rdc_health_result_t health) const;
|
||||
std::string component_string(rdc_health_system_t component) const;
|
||||
void output_errstr(const std::string& input) const;
|
||||
unsigned int handle_one_component(rdc_health_response_t &response,
|
||||
unsigned int start_index,
|
||||
uint32_t gpu_index,
|
||||
rdc_health_system_t component,
|
||||
rdc_health_result_t &component_health,
|
||||
std::vector<std::string> &err_str) const;
|
||||
unsigned int handle_one_gpu(rdc_health_response_t &response,
|
||||
unsigned int start_index,
|
||||
uint32_t gpu_index) const;
|
||||
|
||||
enum OPERATIONS {
|
||||
HEALTH_UNKNOWN = 0,
|
||||
HEALTH_HELP,
|
||||
HEALTH_FETCH,
|
||||
HEALTH_SET,
|
||||
HEALTH_CHECK,
|
||||
HEALTH_CLEAR,
|
||||
} health_ops_;
|
||||
|
||||
rdc_gpu_group_t group_id_;
|
||||
unsigned int components_;
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
|
||||
@@ -0,0 +1,557 @@
|
||||
/*
|
||||
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "RdciHealthSubSystem.h"
|
||||
|
||||
#include <getopt.h>
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <ctime>
|
||||
#include <iomanip>
|
||||
#include <limits>
|
||||
|
||||
#include "common/rdc_utils.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcException.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// The constructor performs no work of its own.
RdciHealthSubSystem::RdciHealthSubSystem() = default;
|
||||
|
||||
// The destructor performs no work of its own.
RdciHealthSubSystem::~RdciHealthSubSystem() = default;
|
||||
|
||||
void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
|
||||
const int HOST_OPTIONS = 1000;
|
||||
const int JSON_OPTIONS = 1001;
|
||||
const int CLEAR_OPTIONS = 1002;
|
||||
const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
|
||||
{"unauth", optional_argument, nullptr, 'u'},
|
||||
{"help", optional_argument, nullptr, 'h'},
|
||||
{"json", optional_argument, nullptr, JSON_OPTIONS},
|
||||
{"clear", optional_argument, nullptr, CLEAR_OPTIONS},
|
||||
{"group", required_argument, nullptr, 'g'},
|
||||
{"fetch", optional_argument, nullptr, 'f'},
|
||||
{"set", required_argument, nullptr, 's'},
|
||||
{"check", optional_argument, nullptr, 'c'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
bool group_id_set = false;
|
||||
int option_index = 0, opt = 0;
|
||||
std::string flags;
|
||||
unsigned int components = 0;
|
||||
|
||||
while ((opt = getopt_long(argc, argv, "uhg:fs:c", long_options, &option_index)) != -1) {
|
||||
switch (opt) {
|
||||
case HOST_OPTIONS:
|
||||
ip_port_ = optarg;
|
||||
break;
|
||||
|
||||
case JSON_OPTIONS:
|
||||
set_json_output(true);
|
||||
break;
|
||||
|
||||
case CLEAR_OPTIONS:
|
||||
health_ops_ = HEALTH_CLEAR;
|
||||
break;
|
||||
|
||||
case 'u':
|
||||
use_auth_ = false;
|
||||
break;
|
||||
|
||||
case 'h':
|
||||
health_ops_ = HEALTH_HELP;
|
||||
return;
|
||||
|
||||
case 'g':
|
||||
if (!IsNumber(optarg)) {
|
||||
show_help();
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number");
|
||||
}
|
||||
group_id_ = std::stoi(optarg);
|
||||
group_id_set = true;
|
||||
break;
|
||||
|
||||
case 'f':
|
||||
health_ops_ = HEALTH_FETCH;
|
||||
break;
|
||||
|
||||
case 's':
|
||||
health_ops_ = HEALTH_SET;
|
||||
|
||||
flags = optarg;
|
||||
for (unsigned int i = 0; i < flags.length(); i++) {
|
||||
switch (flags.at(i)) {
|
||||
case 'a':
|
||||
components |= RDC_HEALTH_WATCH_PCIE;
|
||||
components |= RDC_HEALTH_WATCH_XGMI;
|
||||
components |= RDC_HEALTH_WATCH_MEM;
|
||||
//To do:
|
||||
//components |= RDC_HEALTH_WATCH_INFOROM;
|
||||
//components |= RDC_HEALTH_WATCH_THERMAL;
|
||||
//components |= RDC_HEALTH_WATCH_POWER;
|
||||
break;
|
||||
|
||||
case 'p':
|
||||
components |= RDC_HEALTH_WATCH_PCIE;
|
||||
break;
|
||||
|
||||
case 'm':
|
||||
components |= RDC_HEALTH_WATCH_MEM;
|
||||
break;
|
||||
|
||||
case 'i':
|
||||
//To do:
|
||||
//components |= RDC_HEALTH_WATCH_INFOROM;
|
||||
throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
|
||||
break;
|
||||
|
||||
case 't':
|
||||
//To do:
|
||||
//components |= RDC_HEALTH_WATCH_THERMAL;
|
||||
//components |= RDC_HEALTH_WATCH_POWER;
|
||||
throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
|
||||
break;
|
||||
|
||||
case 'x':
|
||||
components |= RDC_HEALTH_WATCH_XGMI;
|
||||
break;
|
||||
|
||||
default:
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid flags");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == components) {
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "No flags");
|
||||
} else
|
||||
components_ = components;
|
||||
break;
|
||||
|
||||
case 'c':
|
||||
health_ops_ = HEALTH_CHECK;
|
||||
break;
|
||||
|
||||
default:
|
||||
show_help();
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
|
||||
}
|
||||
}
|
||||
|
||||
if (!group_id_set) {
|
||||
show_help();
|
||||
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id");
|
||||
}
|
||||
}
|
||||
|
||||
void RdciHealthSubSystem::show_help() const {
|
||||
if (is_json_output()) return;
|
||||
std::cout << " health -- Used to manage the health watches of a group. \n"
|
||||
<< " The health of the GPUs in a group can then be monitored"
|
||||
<< " during a process.\n\n";
|
||||
std::cout << "Usage\n";
|
||||
std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -s <flags>\n";
|
||||
std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -c\n";
|
||||
std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -f\n";
|
||||
std::cout << " rdci health [--host <IP/FQDN>:prot] [-u] [-j] -g <groupId> --clear\n";
|
||||
std::cout << "\nFlags:\n";
|
||||
show_common_usage();
|
||||
std::cout << " --json Output using json.\n";
|
||||
std::cout << " --clear Disable all watches being monitored.\n";
|
||||
std::cout << " -g --group groupId The GPU group to query "
|
||||
<< "on the specified host.\n";
|
||||
std::cout << " -f --fetch Fetch the current watch status.\n";
|
||||
std::cout << " -s --set flags The list of components can be watched. "
|
||||
<< "[default = pm]\n";
|
||||
std::cout << " a - watch all components\n";
|
||||
std::cout << " p - watch PCIe\n";
|
||||
std::cout << " m - watch Memory\n";
|
||||
//std::cout << " i - watch infoROM\n";
|
||||
//std::cout << " t - watch power and thermal\n";
|
||||
std::cout << " x - watch XGMI\n";
|
||||
std::cout << " -c --check Check to see if any errors or warnings have "
|
||||
<< "occurred in the currently monitored watches.\n";
|
||||
}
|
||||
|
||||
void RdciHealthSubSystem::get_watches() const {
|
||||
rdc_status_t result;
|
||||
unsigned int components = 0;
|
||||
std::string on = "On";
|
||||
std::string off = "Off";
|
||||
|
||||
result = rdc_health_get(rdc_handle_, group_id_, &components);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::string error_msg = rdc_status_string(result);
|
||||
if (result == RDC_ST_NOT_FOUND) {
|
||||
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
|
||||
}
|
||||
throw RdcException(result, error_msg.c_str());
|
||||
}
|
||||
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"heading\" : \"Health monitor systems status\", ";
|
||||
std::cout << "\"body\" : [";
|
||||
std::cout << "{\"Component\" : \"PCIe\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "\"},";
|
||||
std::cout << "{\"Component\" : \"XGMI\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "\"},";
|
||||
std::cout << "{\"Component\" : \"Memory\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "\"},";
|
||||
/*std::cout << "{\"Component\" : \"InfoROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "\"},";
|
||||
std::cout << "{\"Component\" : \"Thermal\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "\"},";
|
||||
std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";*/
|
||||
std::cout << "]";
|
||||
} else {
|
||||
std::cout << "Health monitor systems status:" << std::endl;
|
||||
std::cout << "+--------------------+" //"-" width :20
|
||||
<< "---------------------------------------------------+\n"; //-" width :51
|
||||
std::cout << "|" << std::setw(20) << std::left << " PCIe" << "| "
|
||||
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "|\n";
|
||||
std::cout << "|" << std::setw(20) << std::left << " XGMI" << "| "
|
||||
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
|
||||
std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
|
||||
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
|
||||
/*std::cout << "|" << std::setw(20) << std::left << " InfoROM" << "| "
|
||||
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "|\n";
|
||||
std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| "
|
||||
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n";
|
||||
std::cout << "|" << std::setw(20) << std::left << " Power" << "| "
|
||||
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";*/
|
||||
std::cout << "+--------------------+" //"-" width :20
|
||||
<< "---------------------------------------------------+\n"; //-" width :51
|
||||
}
|
||||
}
|
||||
|
||||
void RdciHealthSubSystem::set_watches() const {
|
||||
rdc_status_t result;
|
||||
|
||||
result = rdc_health_set(rdc_handle_, group_id_, components_);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::string error_msg = rdc_status_string(result);
|
||||
if (result == RDC_ST_NOT_FOUND) {
|
||||
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
|
||||
}
|
||||
throw RdcException(result, error_msg.c_str());
|
||||
}
|
||||
|
||||
std::cout << "Group " << group_id_ << " health monitor systems set successfully." << std::endl;
|
||||
}
|
||||
|
||||
// Returns the display text for a health result: "Pass", "Warning" or "Fail",
// and "Unknown" for any other value.
std::string RdciHealthSubSystem::health_string(rdc_health_result_t health) const {
  if (health == RDC_HEALTH_RESULT_PASS) {
    return "Pass";
  }
  if (health == RDC_HEALTH_RESULT_WARN) {
    return "Warning";
  }
  if (health == RDC_HEALTH_RESULT_FAIL) {
    return "Fail";
  }
  return "Unknown";
}
|
||||
|
||||
// Returns the label printed in front of a component's health text, or
// "Unknown" for a value that is not a known component. Every known label ends
// with ": " because handle_one_gpu() concatenates it directly with the health
// text.
std::string RdciHealthSubSystem::component_string(rdc_health_system_t component) const {
  switch (component) {
    case RDC_HEALTH_WATCH_PCIE:
      return "PCIe system: ";

    case RDC_HEALTH_WATCH_XGMI:
      return "XGMI system: ";

    case RDC_HEALTH_WATCH_MEM:
      return "Memory system: ";

    case RDC_HEALTH_WATCH_INFOROM:
      return "Inforom system: ";

    case RDC_HEALTH_WATCH_THERMAL:
      // The trailing space was missing here, gluing the label to the health.
      return "Thermal system: ";

    case RDC_HEALTH_WATCH_POWER:
      return "Power system: ";

    default:
      return "Unknown";
  }
}
|
||||
|
||||
// Prints a message into the right-hand column of the report table, wrapping
// at whitespace so each row holds fewer than `width` characters of words
// before padding. A single word longer than the column is printed on its own
// row and overflows it. The left-hand column is left blank on every row.
void RdciHealthSubSystem::output_errstr(const std::string& input) const {
  const unsigned int width = 60;

  // Emits one table row with `text` padded to the column width.
  const auto emit_row = [width](const std::string& text) {
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(width) << std::left << text << "|\n";
  };

  std::string word, line_str;
  std::istringstream iss(input);

  while (iss >> word) {
    // Flush the pending row when the next word would not fit. An empty
    // pending row is never flushed, so an over-long first word no longer
    // produces a blank table row.
    if (!line_str.empty() && line_str.size() + word.size() >= width) {
      emit_row(line_str);
      line_str.clear();
    }

    if (!line_str.empty()) {
      line_str += " ";
    }
    line_str += word;
  }

  if (!line_str.empty()) {
    emit_row(line_str);
  }
}
|
||||
|
||||
// Walks the incidents of `response` from `start_index` while they belong to
// the same GPU and the same component.
//   component_health - raised to the highest health value seen in the run.
//   err_str          - receives one " - <message>" entry per incident.
// Returns the number of incidents consumed; the walk stops at the first
// incident with a different GPU index or component.
unsigned int RdciHealthSubSystem::handle_one_component(rdc_health_response_t& response,
                                                       unsigned int start_index,
                                                       uint32_t gpu_index,
                                                       rdc_health_system_t component,
                                                       rdc_health_result_t& component_health,
                                                       std::vector<std::string>& err_str) const {
  unsigned int consumed = 0;

  for (unsigned int pos = start_index; pos < response.incidents_count; ++pos) {
    const rdc_health_incidents_t& entry = response.incidents[pos];

    const bool same_run = (entry.gpu_index == gpu_index) && (entry.component == component);
    if (!same_run) {
      break;
    }

    // Keep the worst health reported for this component.
    if (component_health < entry.health) {
      component_health = entry.health;
    }

    std::string bullet = " - ";
    bullet += entry.error.msg;
    err_str.push_back(bullet);

    ++consumed;
  }

  return consumed;
}
|
||||
|
||||
// Prints the health report of one GPU.
//
// Consumes the consecutive incidents of `response`, starting at
// `start_index`, whose GPU index equals `gpu_index`, groups them by component
// and prints the GPU health followed by the health and messages of each
// component, as a JSON object or as table rows. Returns the number of
// incidents consumed.
unsigned int RdciHealthSubSystem::handle_one_gpu(rdc_health_response_t& response,
                                                 unsigned int start_index,
                                                 uint32_t gpu_index) const {
  // Aggregated view of one component: worst health plus all its messages.
  struct component_detail_t {
    rdc_health_result_t component_health = RDC_HEALTH_RESULT_PASS;
    std::vector<std::string> err_str;
  };
  std::map<rdc_health_system_t, component_detail_t> component_detail_map;

  unsigned int count = 0;
  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;

  for (unsigned int i = start_index; i < response.incidents_count;) {
    const rdc_health_incidents_t& incident = response.incidents[i];

    // Stop at the first incident that belongs to another GPU.
    if (incident.gpu_index != gpu_index) break;

    // Aggregate straight into the map entry. If the same component shows up
    // again later for this GPU, its incidents are merged into the existing
    // entry instead of being dropped.
    component_detail_t& detail = component_detail_map[incident.component];
    const unsigned int comp_count =
        handle_one_component(response, i, gpu_index, incident.component,
                             detail.component_health, detail.err_str);

    // The GPU health is the worst health of any of its incidents, not just of
    // the first incident of each component run.
    if (detail.component_health > gpu_health) gpu_health = detail.component_health;

    // comp_count is at least 1: the incident at index i always matches itself.
    i += comp_count;
    count += comp_count;
  }

  const std::string gpu_health_str = health_string(gpu_health);

  if (is_json_output()) {
    std::cout << "{\"Index\" : \"" << std::to_string(gpu_index) << "\", ";
    std::cout << "\"Health\" : \"" << gpu_health_str << "\", ";
    std::cout << "\"Error\" : [";

    // NOTE(review): messages are written verbatim; a message containing a
    // double quote or backslash would break the JSON - confirm whether
    // escaping is needed.
    std::size_t printed_components = 0;
    for (const auto& entry : component_detail_map) {
      std::cout << "{\"Component\" : \"" << component_string(entry.first) << "\", ";
      std::cout << "\"Health\" : \"" << health_string(entry.second.component_health) << "\", ";

      std::cout << "\"Message\" : [";
      std::size_t printed_messages = 0;
      for (const auto& message : entry.second.err_str) {
        std::cout << "\"" << message << "\"";
        if (++printed_messages < entry.second.err_str.size()) std::cout << ", ";
      }
      std::cout << "]}";  // end Message

      if (++printed_components < component_detail_map.size()) std::cout << ", ";
    }
    std::cout << "]}";  // end Error
  } else {
    std::cout << "|" << std::setw(20) << std::left << " GPU ID: " + std::to_string(gpu_index)
              << "| " << std::setw(60) << std::left << gpu_health_str << "|\n";
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(60) << " " << "|\n";

    for (const auto& entry : component_detail_map) {
      const std::string component_line =
          component_string(entry.first) + health_string(entry.second.component_health);
      std::cout << "|" << std::setw(20) << " " << "| "
                << std::setw(60) << std::left << component_line << "|\n";

      for (const auto& message : entry.second.err_str) output_errstr(message);

      // Blank row between components.
      std::cout << "|" << std::setw(20) << " " << "| "
                << std::setw(60) << " " << "|\n";
    }
    std::cout << "+--------------------+-"                                             // width 20
              << "------------------------------------------------------------+\n";  // width 60
  }

  return count;
}
|
||||
|
||||
void RdciHealthSubSystem::health_check() const {
|
||||
unsigned int components = 0;
|
||||
rdc_status_t result;
|
||||
rdc_health_response_t response;
|
||||
|
||||
result = rdc_health_get(rdc_handle_, group_id_, &components);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::string error_msg = rdc_status_string(result);
|
||||
if (result == RDC_ST_NOT_FOUND) {
|
||||
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
|
||||
}
|
||||
throw RdcException(result, error_msg.c_str());
|
||||
}
|
||||
|
||||
if (0 == components) {
|
||||
std::string error_msg = "Health watches not enable, please enable watches first.";
|
||||
throw RdcException(RDC_ST_UNKNOWN_ERROR, error_msg.c_str());
|
||||
}
|
||||
|
||||
result = rdc_health_check(rdc_handle_, group_id_, &response);
|
||||
if (result != RDC_ST_OK) {
|
||||
throw RdcException(result, rdc_status_string(result));
|
||||
}
|
||||
|
||||
//output headline
|
||||
std::string overall_str = health_string(response.overall_health);
|
||||
if (is_json_output()) {
|
||||
std::cout << "\"heading\" : \"Health monitor report\", ";
|
||||
std::cout << "\"body\" : ";
|
||||
std::cout << "{\"Group\" : \"" << std::to_string(group_id_) << "\", ";
|
||||
std::cout << "\"Overall Health\" : \"" << overall_str << "\", ";
|
||||
std::cout << "\"GPU\" : [";
|
||||
} else {
|
||||
std::cout << "Health monitor report:" << std::endl;
|
||||
std::cout << "+--------------------+-" //"-" width :20
|
||||
<< "------------------------------------------------------------+\n"; //-" width :60
|
||||
std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id_) << "| "
|
||||
<< std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
|
||||
std::cout << "+====================+=" //"=" width :20
|
||||
<< "============================================================+\n"; //"=" width :60
|
||||
}
|
||||
|
||||
//output health of per GPU
|
||||
unsigned int index = 0;
|
||||
while (index < response.incidents_count) {
|
||||
uint32_t gpu_index = response.incidents[index].gpu_index;
|
||||
|
||||
unsigned int count = handle_one_gpu(response, index, gpu_index);
|
||||
index += count;
|
||||
if (is_json_output() && (index < response.incidents_count))
|
||||
std::cout << ",";
|
||||
}
|
||||
|
||||
if (is_json_output())
|
||||
std::cout << "]}"; //end Group
|
||||
}
|
||||
|
||||
void RdciHealthSubSystem::health_clear() const {
|
||||
rdc_status_t result;
|
||||
|
||||
result = rdc_health_clear(rdc_handle_, group_id_);
|
||||
if (result != RDC_ST_OK) {
|
||||
std::string error_msg = rdc_status_string(result);
|
||||
if (result == RDC_ST_NOT_FOUND) {
|
||||
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
|
||||
}
|
||||
throw RdcException(result, error_msg.c_str());
|
||||
}
|
||||
|
||||
std::cout << "Clear Group " << group_id_ << " all health monitor systems." << std::endl;
|
||||
}
|
||||
|
||||
void RdciHealthSubSystem::process() {
|
||||
switch (health_ops_) {
|
||||
case HEALTH_HELP:
|
||||
case HEALTH_UNKNOWN:
|
||||
show_help();
|
||||
break;
|
||||
|
||||
case HEALTH_FETCH:
|
||||
get_watches();
|
||||
break;
|
||||
|
||||
case HEALTH_SET:
|
||||
set_watches();
|
||||
break;
|
||||
|
||||
case HEALTH_CHECK:
|
||||
health_check();
|
||||
break;
|
||||
|
||||
case HEALTH_CLEAR:
|
||||
health_clear();
|
||||
break;
|
||||
|
||||
default:
|
||||
show_help();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -31,6 +31,7 @@ THE SOFTWARE.
|
||||
#include "RdciGroupSubSystem.h"
|
||||
#include "RdciStatsSubSystem.h"
|
||||
#include "RdciPolicySubSystem.h"
|
||||
#include "RdciHealthSubSystem.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcException.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
@@ -50,7 +51,7 @@ int main(int argc, char** argv) {
|
||||
const std::string usage_help =
|
||||
"Usage:\trdci <subsystem>|<options>\n"
|
||||
"subsystem: \n"
|
||||
" discovery, dmon, group, fieldgroup, stats, diag, policy\n"
|
||||
" discovery, dmon, group, fieldgroup, stats, diag, policy, health\n"
|
||||
"options: \n"
|
||||
" -v(--version) : Print client version information only\n";
|
||||
|
||||
@@ -81,6 +82,8 @@ int main(int argc, char** argv) {
|
||||
subsystem.reset(new amd::rdc::RdciGroupSubSystem());
|
||||
} else if (subsystem_name == "fieldgroup") {
|
||||
subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem());
|
||||
} else if (subsystem_name == "health") {
|
||||
subsystem.reset(new amd::rdc::RdciHealthSubSystem());
|
||||
} else if (subsystem_name == "stats") {
|
||||
subsystem.reset(new amd::rdc::RdciStatsSubSystem());
|
||||
} else if (subsystem_name == "policy") {
|
||||
|
||||
@@ -153,6 +153,22 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
|
||||
const ::rdc::UnRegisterPolicyRequest* request,
|
||||
::rdc::UnRegisterPolicyResponse* reply) override;
|
||||
|
||||
::grpc::Status SetHealth(::grpc::ServerContext* context,
|
||||
const ::rdc::SetHealthRequest* request,
|
||||
::rdc::SetHealthResponse* reply) override;
|
||||
|
||||
::grpc::Status GetHealth(::grpc::ServerContext* context,
|
||||
const ::rdc::GetHealthRequest* request,
|
||||
::rdc::GetHealthResponse* reply) override;
|
||||
|
||||
::grpc::Status CheckHealth(::grpc::ServerContext* context,
|
||||
const ::rdc::CheckHealthRequest* request,
|
||||
::rdc::CheckHealthResponse* reply) override;
|
||||
|
||||
::grpc::Status ClearHealth(::grpc::ServerContext* context,
|
||||
const ::rdc::ClearHealthRequest* request,
|
||||
::rdc::ClearHealthResponse* reply) override;
|
||||
|
||||
private:
|
||||
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
|
||||
rdc_handle_t rdc_handle_;
|
||||
|
||||
@@ -947,5 +947,93 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
|
||||
return ::grpc::Status::OK;
|
||||
}
|
||||
|
||||
// gRPC handler for SetHealth: calls rdc_health_set() with the group id and
// components from the request and stores the resulting status in the reply.
// Returns an INTERNAL gRPC error when request or reply is null; otherwise the
// gRPC status is OK and the outcome travels in the reply's status field.
::grpc::Status RdcAPIServiceImpl::SetHealth(::grpc::ServerContext* context,
                                            const ::rdc::SetHealthRequest* request,
                                            ::rdc::SetHealthResponse* reply) {
  (void)(context);
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const rdc_status_t outcome =
      rdc_health_set(rdc_handle_, request->group_id(), request->components());
  reply->set_status(outcome);

  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler for GetHealth: calls rdc_health_get() for the requested group
// and stores the status in the reply; the components field is filled only
// when the call succeeded. Returns an INTERNAL gRPC error when request or
// reply is null; otherwise the gRPC status is OK.
::grpc::Status RdcAPIServiceImpl::GetHealth(::grpc::ServerContext* context,
                                            const ::rdc::GetHealthRequest* request,
                                            ::rdc::GetHealthResponse* reply) {
  (void)(context);
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  unsigned int watch_mask = 0;
  const rdc_status_t outcome = rdc_health_get(rdc_handle_, request->group_id(), &watch_mask);
  reply->set_status(outcome);

  if (outcome == RDC_ST_OK) {
    reply->set_components(watch_mask);
  }

  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler for CheckHealth: calls rdc_health_check() for the requested
// group, stores the status in the reply and, on success, copies the overall
// health, the incident count and every incident into the reply's response
// message. Returns an INTERNAL gRPC error when request or reply is null;
// otherwise the gRPC status is OK.
::grpc::Status RdcAPIServiceImpl::CheckHealth(::grpc::ServerContext* context,
                                              const ::rdc::CheckHealthRequest* request,
                                              ::rdc::CheckHealthResponse* reply) {
  (void)(context);
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  rdc_health_response_t local_result;
  const rdc_status_t outcome = rdc_health_check(rdc_handle_, request->group_id(), &local_result);
  reply->set_status(outcome);
  if (outcome != RDC_ST_OK) {
    // The failure is reported through the reply's status field.
    return ::grpc::Status::OK;
  }

  ::rdc::HealthResponse* wire_result = reply->mutable_response();
  wire_result->set_overall_health(local_result.overall_health);
  wire_result->set_incidents_count(local_result.incidents_count);

  for (uint32_t idx = 0; idx < local_result.incidents_count; ++idx) {
    const rdc_health_incidents_t& src = local_result.incidents[idx];
    ::rdc::HealthIncidents* dst = wire_result->add_incidents();

    dst->set_gpu_index(src.gpu_index);
    dst->set_component(src.component);
    dst->set_health(src.health);

    // Error code and message of the incident.
    auto wire_error = dst->mutable_error();
    wire_error->set_code(src.error.code);
    wire_error->set_msg(src.error.msg);
  }

  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler for ClearHealth: calls rdc_health_clear() for the requested
// group and stores the resulting status in the reply. Returns an INTERNAL
// gRPC error when request or reply is null; otherwise the gRPC status is OK.
::grpc::Status RdcAPIServiceImpl::ClearHealth(::grpc::ServerContext* context,
                                              const ::rdc::ClearHealthRequest* request,
                                              ::rdc::ClearHealthResponse* reply) {
  (void)(context);
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const rdc_status_t outcome = rdc_health_clear(rdc_handle_, request->group_id());
  reply->set_status(outcome);

  return ::grpc::Status::OK;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
Ссылка в новой задаче
Block a user