Add the RdcSmiHealth module, which will call rocm_smi_lib.
It will support the following health checks:
 - XGMI error detected
 - PCIE replay count detected
 - Memory check
 - InfoROM check
 - Power/Thermal check
The gRPC client-side and server-side health functions are added.
The health module is added to the rdci.

At present, XGMI/PCIE and a part of Memory have been implemented.
Others will be added as soon as possible.

Change-Id: I1bd99290bdc7dea733f21a41a8c4bcefb2138112


[ROCm/rdc commit: 853d3b0cc5]
Этот коммит содержится в:
limeng12
2024-10-23 16:42:24 +08:00
коммит произвёл Meng, Li (Jassmine)
родитель 39758d913c
Коммит 71e2727a8f
26 изменённых файлов: 2260 добавлений и 3 удалений
+9
Просмотреть файл
@@ -161,3 +161,12 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp",
FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", "RING_HANG", false)
// RDC health related fields.
// Every RDC_HEALTH_* member of rdc_field_t needs one descriptor row here;
// RDC_HEALTH_RETIRED_PAGE_NUM was declared in the enum but had no row.
FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true)
FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT, "Total PCIE replay count", "PCIE_REPLAY_COUNT", true)
FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_NUM, "Retired page number", "RETIRED_PAGE_NUM", true)
FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM, "Pending page number", "PENDING_PAGE_NUM", true)
FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", false)
FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit", "UNCORRECTABLE_PAGE_LIMIT", false)
FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", false)
// NOTE(review): rdc.h documents this field in microseconds while the text
// below says "(ms)" - confirm the unit before changing either side.
FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", false)
+6
Просмотреть файл
@@ -120,6 +120,12 @@ set(POLICY_EXAMPLE_EXE "policy")
add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap)
# Health example: builds the "health" executable from health_example.cc and
# links it against the same libraries as the policy example above.
set(HEALTH_EXAMPLE_SRC_LIST "health_example.cc")
cmake_print_variables(HEALTH_EXAMPLE_SRC_LIST)
set(HEALTH_EXAMPLE_EXE "health")
add_executable(${HEALTH_EXAMPLE_EXE} "${HEALTH_EXAMPLE_SRC_LIST}")
target_link_libraries(${HEALTH_EXAMPLE_EXE} pthread dl rdc_bootstrap)
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Finished Cmake Example ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
+359
Просмотреть файл
@@ -0,0 +1,359 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <unistd.h>

#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

#include "rdc/rdc.h"
// Query which health watches are enabled for a group and print them as a
// small table (PCIe / XGMI / Memory rows, each "On" or "Off").
//
// @param rdc_handle  RDC handle forwarded to rdc_health_get().
// @param group_id    GPU group whose watch settings are queried.
// @return The status returned by rdc_health_get(). The table is printed only
//         when that status is RDC_ST_OK.
rdc_status_t get_watches(rdc_handle_t rdc_handle, rdc_gpu_group_t group_id) {
  unsigned int components;
  const rdc_status_t status = rdc_health_get(rdc_handle, group_id, &components);
  if (status != RDC_ST_OK) {
    return status;
  }

  // Horizontal rule: a 20-dash label cell followed by a 51-dash value cell.
  const char* const border =
      "+--------------------+"
      "---------------------------------------------------+\n";

  // Prints one "| label | On/Off |" row for the watch bit `mask`.
  const auto print_row = [components](const char* label, unsigned int mask) {
    const char* state = (components & mask) ? "On" : "Off";
    std::cout << "|" << std::setw(20) << std::left << label << "| "
              << std::setw(50) << std::left << state << "|\n";
  };

  std::cout << "Health monitor systems status:" << std::endl;
  std::cout << border;
  print_row(" PCIe", RDC_HEALTH_WATCH_PCIE);
  print_row(" XGMI", RDC_HEALTH_WATCH_XGMI);
  print_row(" Memory", RDC_HEALTH_WATCH_MEM);
  std::cout << border;
  return status;
}
// Convert a health result into the label shown in the report.
//
// @param health  Result value to describe.
// @return "Pass", "Warning" or "Fail" for the three known results, and
//         "Unknown" for any other value.
std::string health_string(rdc_health_result_t health) {
  if (health == RDC_HEALTH_RESULT_PASS) {
    return "Pass";
  }
  if (health == RDC_HEALTH_RESULT_WARN) {
    return "Warning";
  }
  if (health == RDC_HEALTH_RESULT_FAIL) {
    return "Fail";
  }
  return "Unknown";
}
// Convert a health component into the "<name> system: " prefix that is
// printed directly in front of the health label in the report.
//
// Every label ends with ": " so that concatenating the health label yields
// e.g. "Thermal system: Pass". The thermal and fallback labels previously
// lacked that separator and produced "Thermal system:Pass" / "UnknownPass".
//
// @param component  Component to describe.
// @return The label prefix; "Unknown system: " for unrecognised values.
std::string component_string(rdc_health_system_t component) {
  switch (component) {
    case RDC_HEALTH_WATCH_PCIE:
      return "PCIe system: ";
    case RDC_HEALTH_WATCH_XGMI:
      return "XGMI system: ";
    case RDC_HEALTH_WATCH_MEM:
      return "Memory system: ";
    case RDC_HEALTH_WATCH_INFOROM:
      return "Inforom system: ";
    case RDC_HEALTH_WATCH_THERMAL:
      return "Thermal system: ";
    case RDC_HEALTH_WATCH_POWER:
      return "Power system: ";
    default:
      return "Unknown system: ";
  }
}
// Print an error message inside the report table, word-wrapped to the
// 60-column text cell. Each output row is
//   "|" + 20 blanks + "| " + text padded to 60 columns + "|\n".
//
// Words are split on whitespace and re-joined with single spaces. A row is
// flushed when appending the next word would reach the cell width. A single
// word longer than the cell is printed on its own row without being split,
// so it overflows the cell. An empty or all-blank input prints nothing.
//
// The pending line is flushed only when it holds text, so an over-long first
// word no longer produces a blank row ahead of it.
//
// @param input  Message to print.
void output_errstr(const std::string& input) {
  static constexpr std::string::size_type kTextWidth = 60;

  const auto emit_row = [](const std::string& text) {
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(static_cast<int>(kTextWidth)) << std::left << text << "|\n";
  };

  std::istringstream words(input);
  std::string word;
  std::string line;
  while (words >> word) {
    if (line.empty()) {
      // First word of a row always goes in, whatever its length.
      line = word;
    } else if (line.size() + word.size() >= kTextWidth) {
      emit_row(line);
      line = word;
    } else {
      line += ' ';
      line += word;
    }
  }
  if (!line.empty()) {
    emit_row(line);
  }
}
// Collect the run of consecutive incidents, starting at start_index, that
// belong to the given GPU and component.
//
// @param response          Health response whose incidents are scanned.
// @param start_index       Index of the first incident to look at.
// @param gpu_index         GPU the run must belong to.
// @param component         Component the run must belong to.
// @param component_health  In/out: raised to the highest health value seen.
// @param err_str           Out: one " - <message>" string appended per incident.
// @return Number of incidents consumed; scanning stops at the first incident
//         with a different GPU or component.
unsigned int handle_one_component(rdc_health_response_t &response,
                                  unsigned int start_index,
                                  uint32_t gpu_index,
                                  rdc_health_system_t component,
                                  rdc_health_result_t &component_health,
                                  std::vector<std::string> &err_str) {
  unsigned int consumed = 0;
  unsigned int idx = start_index;
  while (idx < response.incidents_count) {
    const rdc_health_incidents_t &current = response.incidents[idx];
    const bool same_run =
        (current.gpu_index == gpu_index) && (current.component == component);
    if (!same_run) {
      break;
    }
    // Keep the highest health value of the run.
    if (current.health > component_health) {
      component_health = current.health;
    }
    err_str.push_back(std::string(" - ") + current.error.msg);
    ++consumed;
    ++idx;
  }
  return consumed;
}
// Print the report section for one GPU: its overall health, then each
// affected component with its health label and wrapped error messages.
//
// Incidents are consumed from start_index while they belong to gpu_index.
// Differences from a naive first-incident scan:
//  - GPU health is the maximum over every incident of the GPU, not only the
//    first incident of each component run.
//  - Runs of the same component that are not adjacent are merged into one
//    entry instead of the later run being dropped.
//
// @param response     Health response whose incidents are scanned.
// @param start_index  Index of the first incident belonging to this GPU.
// @param gpu_index    GPU to report on.
// @return Number of incidents consumed for this GPU.
unsigned int handle_one_gpu(rdc_health_response_t &response,
                            unsigned int start_index,
                            uint32_t gpu_index) {
  struct ComponentDetail {
    rdc_health_result_t component_health = RDC_HEALTH_RESULT_PASS;
    std::vector<std::string> err_str;
  };
  std::map<rdc_health_system_t, ComponentDetail> component_detail_map;
  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;
  unsigned int count = 0;

  unsigned int i = start_index;
  while (i < response.incidents_count && response.incidents[i].gpu_index == gpu_index) {
    const rdc_health_system_t component = response.incidents[i].component;
    // operator[] creates the entry on first sight and reuses it afterwards,
    // so a later run of the same component accumulates into the same entry.
    ComponentDetail &detail = component_detail_map[component];
    const unsigned int comp_count = handle_one_component(
        response, i, gpu_index, component, detail.component_health, detail.err_str);
    if (comp_count == 0) {
      break;  // defensive: never loop without making progress
    }
    if (detail.component_health > gpu_health) {
      gpu_health = detail.component_health;
    }
    i += comp_count;
    count += comp_count;
  }

  // GPU headline row followed by an empty spacer row.
  std::cout << "|" << std::setw(20) << std::left << " GPU ID: " + std::to_string(gpu_index) << "| "
            << std::setw(60) << std::left << health_string(gpu_health) << "|\n";
  std::cout << "|" << std::setw(20) << " " << "| "
            << std::setw(60) << " " << "|\n";

  for (const auto &entry : component_detail_map) {
    const std::string component_str = component_string(entry.first);
    const std::string health_str = health_string(entry.second.component_health);
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(60) << std::left << component_str + health_str << "|\n";
    for (const std::string &msg : entry.second.err_str) {
      output_errstr(msg);
    }
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(60) << " " << "|\n";
  }
  std::cout << "+--------------------+-"  //"-" width :20
            << "------------------------------------------------------------+\n";  //-" width :60
  return count;
}
int main(int, char**) {
rdc_status_t result;
rdc_handle_t rdc_handle;
char hostIpAddress[] = {"127.0.0.1:50051"};
char group_name[] = {"healthgroup1"};
std::cout << "Start rdci in Standalone mode\n";
// Init the rdc
result = rdc_init(0);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
}
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
if (result != RDC_ST_OK) {
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
// Now we can use the same API for standalone
// (1) create group and add GPUs
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
std::cout << "Created the GPU group " << group_id << std::endl;
result = rdc_group_gpu_add(rdc_handle, group_id, 0); // Add GPU 0
if (result != RDC_ST_OK) {
std::cout << "Error adding group. Return: " << rdc_status_string(result)
<< std::endl;
goto destroygroup;
}
rdc_device_attributes_t attribute;
result = rdc_device_get_attributes(rdc_handle, 0, &attribute);
if (result != RDC_ST_OK) {
std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
goto destroygroup;
}
std::cout << "Add GPU 0: " << attribute.device_name << " to group "
<< group_id << std::endl;
// (2) get heath current watches before setting
result = get_watches(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
<< std::endl;
goto destroygroup;
}
// (3) set health watches.
unsigned int components;
components = RDC_HEALTH_WATCH_PCIE | RDC_HEALTH_WATCH_XGMI | RDC_HEALTH_WATCH_MEM;
result = rdc_health_set(rdc_handle, group_id, components);
if (result != RDC_ST_OK) {
std::cout << "Error setting health watches. Return: " << rdc_status_string(result)
<< std::endl;
goto destroygroup;
}
std::cout << "Set health watches to all." << std::endl;
// (4) get heath current watches after setting
result = get_watches(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error getting health watches. Return: " << rdc_status_string(result)
<< std::endl;
goto destroygroup;
}
std::cout << "Start to health monitor group:" << group_id
<< std::endl;
std::cout << "Sleep a few seconds before retreive the data ...\n";
// For standalone mode, the daemon will update and cache the samples
// take samples, standalone mode, do nothing
usleep(5000000); // sleep 5 seconds before fetch the stats
// (5) Get the health stats
rdc_health_response_t response;
result = rdc_health_check(rdc_handle, group_id, &response);
if (result != RDC_ST_OK) {
std::cout << "Error health check. Return: " << rdc_status_string(result)
<< std::endl;
goto destroygroup;
} else {
//output headline
std::string overall_str = health_string(response.overall_health);
std::cout << "Health monitor report:" << std::endl;
std::cout << "+--------------------+-" //"-" width :20
<< "------------------------------------------------------------+\n"; //-" width :60
std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id) << "| "
<< std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
std::cout << "+====================+=" //"=" width :20
<< "============================================================+\n"; //"=" width :60
//output health of per GPU
unsigned int index = 0;
while (index < response.incidents_count) {
uint32_t gpu_index = response.incidents[index].gpu_index;
unsigned int count = handle_one_gpu(response, index, gpu_index);
index += count;
}
}
// (6) Clear the health
result = rdc_health_clear(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error clear health. Return: " << rdc_status_string(result)
<< std::endl;
goto destroygroup;
}
std::cout << "Clear Group " << group_id << " all health monitor systems." << std::endl;
destroygroup:
// Delete the GPU group
result = rdc_group_gpu_destroy(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error delete GPU group. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Deleted the GPU group " << group_id << std::endl;
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
rdc_disconnect(rdc_handle);
rdc_shutdown();
return result;
}
+153
Просмотреть файл
@@ -335,6 +335,18 @@ typedef enum {
RDC_EVNT_NOTIF_RING_HANG, //!< GPU ring hang just occurred
RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG,
/**
* @brief RDC health related fields
*/
RDC_HEALTH_XGMI_ERROR = 3000, //!< XGMI one or more errors detected
RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count
RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number
RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number
RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,//!< The threshold of uncorrectable page
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
} rdc_field_t;
// even and odd numbers are used for correctable and uncorrectable errors
@@ -589,6 +601,81 @@ typedef struct {
rdc_policy_action_t action; //!< Action to take
} rdc_policy_t;
/**
* @brief Health subsystems that can be watched.
*
* Each enumerator is a distinct bit, so several of them can be OR-ed together
* into the `components` mask used by rdc_health_set() and rdc_health_get().
*/
typedef enum {
RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches
RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches
RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches
RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches
RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches
RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches
} rdc_health_system_t;
/**
* @brief Result of a health test.
*
* The enumerators take the values 0, 1 and 2 in declaration order, so
* PASS < WARN < FAIL when compared numerically.
*/
typedef enum {
RDC_HEALTH_RESULT_PASS, //!< The health test passed
RDC_HEALTH_RESULT_WARN, //!< The health test has warnings
RDC_HEALTH_RESULT_FAIL //!< The health test failed
} rdc_health_result_t;
/**
* @brief The maximum length of a health message; this is the size of the
* rdc_health_detail_t::msg buffer.
*/
#define MAX_HEALTH_MSG_LENGTH 4096
/**
* @brief The maximum recommended number of PCIe replays per minute (8).
*/
#define PCIE_MAX_REPLAYS_PERMIN 8
/**
* @brief The error codes stored in rdc_health_incidents_t.error.code.
*
* The values are fixed explicitly (1000..1009).
*/
typedef enum {
RDC_FR_PCI_REPLAY_RATE = 1000,
RDC_FR_ECC_UNCORRECTABLE_DETECTED = 1001,
RDC_FR_PENDING_PAGE_RETIREMENTS = 1002,
RDC_FR_RETIRED_PAGES_LIMIT = 1003,
RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT = 1004,
RDC_FR_CLOCKS_THROTTLE_THERMAL = 1005,
RDC_FR_CLOCKS_THROTTLE_POWER = 1006,
RDC_FR_XGMI_SINGLE_ERROR = 1007,
RDC_FR_XGMI_MULTIPLE_ERROR = 1008,
RDC_FR_CORRUPT_INFOROM = 1009
} rdc_health_error_code_t;
/**
* @brief Details of one health error: a message plus an error code.
*/
typedef struct {
char msg[MAX_HEALTH_MSG_LENGTH]; //!< The test result details (text message)
uint32_t code; //!< The low level error code, see rdc_health_error_code_t
} rdc_health_detail_t;
/**
* @brief Details of a single health incident.
*/
typedef struct {
uint32_t gpu_index; //!< Which GPU in this group has the issue
rdc_health_system_t component; //!< Which component has the issue
rdc_health_result_t health; //!< Health diagnosis of this incident
rdc_health_detail_t error; //!< The details of the error; error.code is an rdc_health_error_code_t
} rdc_health_incidents_t;
/**
* @brief Capacity of the rdc_health_response_t::incidents array.
*/
#define HEALTH_MAX_ERROR_ITEMS 64
/**
* @brief The health response filled in by rdc_health_check().
*/
typedef struct {
rdc_health_result_t overall_health; //!< The overall health of this entire host
unsigned int incidents_count; //!< The number of health incidents reported in this struct
rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]; //!< Report of the errors detected
} rdc_health_response_t;
/**
* @brief Initialize ROCm RDC.
*
@@ -1274,6 +1361,72 @@ rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t grou
*/
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
/**
* @brief Enable the health check for a group.
*
* @details For each group, only one setting can be active. To clear the
* setting for a group, pass components == 0x0.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] components Bit mask of rdc_health_system_t values that should be
* enabled for health check, for example
* RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
unsigned int components);
/**
* @brief Get the health check settings of a group.
*
* @details Reports which components currently have the health check enabled
* for the group.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[out] components Bit mask of rdc_health_system_t values that are
* enabled for health check, for example
* RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER.
* If it is 0x0, the health check has not been set for the group yet.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
unsigned int* components);
/**
* @brief Check the health watch results of a group.
*
* @details On return, response->incidents_count tells how many incidents were
* reported. For each incident, check the component and the error message.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[inout] response The detailed results of the health check.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_health_response_t* response);
/**
* @brief Clear the health watch of a group.
*
* @details Clears the health watch setting of the given group.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
#ifdef __cplusplus
}
#endif // __cplusplus
+15
Просмотреть файл
@@ -59,6 +59,21 @@ class RdcCacheManager {
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
// Health cache API. Samples are kept per GPU group and keyed by
// (gpu_index, field id).

// Store a health sample for (gpu_index, value.field_id) in the cache of
// group_id.
virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
uint32_t gpu_index,
const rdc_field_value& value) = 0;
// Look up cached samples of field_id for gpu_index in group_id:
// start_value receives the first sample with timestamp >= start_timestamp,
// end_value the last sample with timestamp <= end_timestamp. Either output
// pointer may be null, but not both.
virtual rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field_id,
uint64_t start_timestamp,
uint64_t end_timestamp,
rdc_field_value* start_value,
rdc_field_value* end_value) = 0;
// Drop every cached health sample of group_id.
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
// Update the health samples of group_id with a new value.
// NOTE(review): the full contract is not visible here - confirm against
// the implementation.
virtual rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
uint32_t gpu_index,
const rdc_field_value& value) = 0;
virtual ~RdcCacheManager() {}
};
+6
Просмотреть файл
@@ -106,6 +106,12 @@ class RdcHandler {
virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;
// Health API - handler-side counterparts of the C functions
// rdc_health_set/get/check/clear declared in rdc.h (without the handle).
// Set the components mask (rdc_health_system_t bits) watched for a group.
virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) = 0;
// Read back the components mask of a group.
virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) = 0;
// Fill *response with the health results of a group.
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) = 0;
// Clear the health watch of a group.
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
virtual ~RdcHandler() {}
};
+8
Просмотреть файл
@@ -50,6 +50,14 @@ class RdcWatchTable {
virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) = 0;
// Health watch API; the signatures match the health methods of RdcHandler.
// Set the components mask (rdc_health_system_t bits) watched for a group.
virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
unsigned int components) = 0;
// Read back the components mask of a group.
virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id,
unsigned int* components) = 0;
// Fill *response with the health results of a group.
virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id,
rdc_health_response_t *response) = 0;
// Clear the health watch of a group.
virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0;
virtual ~RdcWatchTable() {}
};
+21
Просмотреть файл
@@ -32,6 +32,8 @@ THE SOFTWARE.
#include "rdc_lib/RdcCacheManager.h"
#include "rdc_lib/rdc_common.h"
// NOTE(review): presumably the maximum number of health samples retained per
// field in the health cache; its use is not visible in this header - confirm.
#define HEALTH_MAX_KEEP_SAMPLES 300
namespace amd {
namespace rdc {
@@ -81,6 +83,9 @@ struct RdcJobStatsCacheEntry {
// <job_id, job_stats>
typedef std::map<std::string, RdcJobStatsCacheEntry> RdcJobStatsCache;
// <group_id, health_samples>: the health samples cached for each GPU group.
typedef std::map<rdc_gpu_group_t, RdcCacheSamples> RdcHealthStatsCache;
class RdcCacheManagerImpl : public RdcCacheManager {
public:
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
@@ -105,6 +110,21 @@ class RdcCacheManagerImpl : public RdcCacheManager {
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
// Health cache API (overrides of RdcCacheManager).
// Append a health sample for (gpu_index, value.field_id) to group_id's cache.
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id,
uint32_t gpu_index,
const rdc_field_value& value) override;
// Fetch the first cached sample at/after start_timestamp and the last one
// at/before end_timestamp for the given group, GPU and field.
rdc_status_t rdc_health_get_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field_id,
uint64_t start_timestamp,
uint64_t end_timestamp,
rdc_field_value* start_value,
rdc_field_value* end_value) override;
// Erase all cached health samples of group_id.
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
// Update the health samples of group_id with a new value; returns
// RDC_ST_NOT_FOUND when the group has no health cache.
rdc_status_t rdc_update_health_stats(rdc_gpu_group_t group_id,
uint32_t gpu_index,
const rdc_field_value& value) override;
private:
void set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu,
rdc_stats_summary_t& summary, // NOLINT
@@ -113,6 +133,7 @@ class RdcCacheManagerImpl : public RdcCacheManager {
uint32_t num_gpus); // NOLINT
RdcCacheSamples cache_samples_;
RdcJobStatsCache cache_jobs_;
RdcHealthStatsCache cache_health_;
std::mutex cache_mutex_;
};
+6
Просмотреть файл
@@ -108,6 +108,12 @@ class RdcEmbeddedHandler final : public RdcHandler {
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
// Health API (overrides of the RdcHandler health methods).
// Set the components mask (rdc_health_system_t bits) watched for a group.
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
// Read back the components mask of a group.
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
// Fill *response with the health results of a group.
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
// Clear the health watch of a group.
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
~RdcEmbeddedHandler() final;
+6
Просмотреть файл
@@ -105,6 +105,12 @@ class RdcStandaloneHandler : public RdcHandler {
rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override;
// Health API (overrides of the RdcHandler health methods).
// Set the components mask (rdc_health_system_t bits) watched for a group.
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
// Read back the components mask of a group.
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
// Fill *response with the health results of a group.
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
// Clear the health watch of a group.
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
const char* client_cert, const char* client_key);
+42 -1
Просмотреть файл
@@ -55,6 +55,12 @@ struct JobWatchTableEntry {
std::vector<RdcFieldKey> fields; //< store fields for faster query
};
// One entry of the health watch table: the health settings stored for a group.
struct HealthWatchTableEntry {
// presumably the rdc_health_system_t bit mask passed to rdc_health_set - confirm
unsigned int components;
// presumably the field group created by create_health_field_group - confirm
rdc_field_grp_t field_group_id;
std::vector<RdcFieldKey> fields; //< store fields for faster query
};
class RdcWatchTableImpl : public RdcWatchTable {
public:
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64],
@@ -74,6 +80,11 @@ class RdcWatchTableImpl : public RdcWatchTable {
//!< is reached, which will be handled in the clean_up() function.
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
// Health watch API (overrides of the RdcWatchTable health methods).
// Set the components mask (rdc_health_system_t bits) watched for a group.
rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override;
// Read back the components mask of a group.
rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override;
// Fill *response with the health results of a group.
rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override;
// Clear the health watch of a group.
rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override;
//!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will
//!< call this function periodically. Instead of providing other APIs to
//!< cleanup the cache, this function will update and cleanup the cache.
@@ -85,7 +96,8 @@ class RdcWatchTableImpl : public RdcWatchTable {
rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override;
RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr,
const RdcModuleMgrPtr& module_mgr, const RdcNotificationPtr& notif);
const RdcMetricFetcherPtr& metric_fetcher, const RdcModuleMgrPtr& module_mgr,
const RdcNotificationPtr& notif);
private:
//!< Helper function to Update the fields_in_table when unwatch tables
@@ -104,13 +116,39 @@ class RdcWatchTableImpl : public RdcWatchTable {
bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
std::string& job_id) const; // NOLINT
// Health counterpart of is_job_watch_field.
// NOTE(review): presumably returns true when (gpu_index, field_id) is watched
// by a health group and reports that group through group_id - confirm
// against the implementation.
bool is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id,
rdc_gpu_group_t& group_id) const;
rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t* events, uint32_t num_events);
//!< The function will be pass as the callback for bulk fetch
static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
void* user_data);
// Private helpers of the health watch. Their implementations are not visible
// in this header, so the notes below are limited to what the signatures show.

// Create the field group for the given components mask and return its id
// through field_group_id.
rdc_status_t create_health_field_group(unsigned int components,
rdc_field_grp_t* field_group_id);
// Add one incident (GPU, component, health, error code and message) to the
// response.
// Return value: whether the health incidents are full.
bool add_health_incident(uint32_t gpu_index,
rdc_health_system_t component,
rdc_health_result_t health,
uint32_t err_code,
std::string err_msg,
rdc_health_incidents_t* incident,
rdc_health_response_t* response);
// Fetch a start value and an end value of `field` for a GPU of the group.
// NOTE(review): the time window used is not visible here - confirm.
rdc_status_t get_start_end_values(rdc_gpu_group_t group_id,
uint32_t gpu_index,
rdc_field_t field,
rdc_field_value *start_value,
rdc_field_value *end_value);
// Per-component health checks for one GPU of a group; each one receives the
// response that incidents are reported into.
rdc_status_t pcie_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
rdc_status_t xgmi_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
rdc_status_t memory_check(rdc_gpu_group_t group_id,
uint32_t gpu_index, rdc_health_response_t* response);
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcMetricFetcherPtr metric_fetcher_;
RdcModuleMgrPtr rdc_module_mgr_;
RdcNotificationPtr notifications_;
@@ -126,6 +164,9 @@ class RdcWatchTableImpl : public RdcWatchTable {
//!< Those settings will only be updated when watching or unwatching.
std::map<RdcFieldKey, FieldSettings> fields_to_watch_;
//!< The health watch table to store the health settings.
std::map<uint32_t, HealthWatchTableEntry> health_watch_table_;
//!< The last clean up time
std::atomic<uint64_t> last_cleanup_time_;
std::mutex watch_mutex_;
+64
Просмотреть файл
@@ -190,6 +190,18 @@ service RdcAPI {
// rdc_policy_condition_t condition);
rpc UnRegisterPolicy(UnRegisterPolicyRequest) returns (UnRegisterPolicyResponse) {}
// Health API
// rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components);
rpc SetHealth(SetHealthRequest) returns (SetHealthResponse) {}
// rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components);
rpc GetHealth(GetHealthRequest) returns (GetHealthResponse) {}
// rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response);
rpc CheckHealth(CheckHealthRequest) returns (CheckHealthResponse) {}
// rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id);
rpc ClearHealth(ClearHealthRequest) returns (ClearHealthResponse) {}
}
message Empty {
@@ -620,3 +632,55 @@ message UnRegisterPolicyRequest {
message UnRegisterPolicyResponse {
uint32 status = 1;
}
// Request of SetHealth: the arguments of rdc_health_set().
message SetHealthRequest {
uint32 group_id = 1;
uint32 components = 2;
}
// Response of SetHealth.
message SetHealthResponse {
// presumably the rdc_status_t of the call - confirm
uint32 status = 1;
}
// Request of GetHealth: the group whose health settings are queried.
message GetHealthRequest {
uint32 group_id = 1;
}
// Response of GetHealth: the components output of rdc_health_get().
message GetHealthResponse {
// presumably the rdc_status_t of the call - confirm
uint32 status = 1;
uint32 components = 2;
}
// Request of CheckHealth: the group to check.
message CheckHealthRequest {
uint32 group_id = 1;
}
// Same fields as rdc_health_detail_t in rdc.h: error message and error code.
message HealthDetail {
string msg = 1;
uint32 code = 2;
}
// Same fields as rdc_health_incidents_t in rdc.h; the component and health
// enums are carried as uint32.
message HealthIncidents {
uint32 gpu_index = 1;
uint32 component = 2;
uint32 health = 3;
HealthDetail error = 4;
}
// Same fields as rdc_health_response_t in rdc.h.
message HealthResponse {
uint32 overall_health = 1;
uint32 incidents_count = 2;
repeated HealthIncidents incidents = 3;
}
// Response of CheckHealth: call status plus the health results.
message CheckHealthResponse {
// presumably the rdc_status_t of the call - confirm
uint32 status = 1;
HealthResponse response = 2;
}
// Request of ClearHealth: the group whose health watch is cleared.
message ClearHealthRequest {
uint32 group_id = 1;
}
// Response of ClearHealth.
message ClearHealthResponse {
// presumably the rdc_status_t of the call - confirm
uint32 status = 1;
}
+8
Просмотреть файл
@@ -163,6 +163,14 @@ class rdc_field_t(c_int):
RDC_EVNT_NOTIF_PRE_RESET = 2002
RDC_EVNT_NOTIF_POST_RESET = 2003
RDC_EVNT_NOTIF_RING_HANG = 2004
# RDC health related fields; values follow the RDC_HEALTH_* members of
# rdc_field_t in rdc.h, which start at 3000.
RDC_HEALTH_XGMI_ERROR = 3000
RDC_HEALTH_PCIE_REPLAY_COUNT = 3001
RDC_HEALTH_RETIRED_PAGE_NUM = 3002
RDC_HEALTH_PENDING_PAGE_NUM = 3003
RDC_HEALTH_RETIRED_PAGE_LIMIT = 3004
RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT = 3005
RDC_HEALTH_POWER_THROTTLE_TIME = 3006
RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007
rdc_handle_t = c_void_p
rdc_gpu_group_t = c_uint32
+39
Просмотреть файл
@@ -401,6 +401,45 @@ rdc_field_t get_field_id_from_name(const char* name) {
return RDC_FI_INVALID;
}
// C API entry point: forward rdc_health_set to the RdcHandler behind the
// opaque handle. Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            unsigned int components) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_health_set(group_id, components);
}
// C API entry point: forward rdc_health_get to the RdcHandler behind the
// opaque handle. Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                            unsigned int* components) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_health_get(group_id, components);
}
// C API entry point: forward rdc_health_check to the RdcHandler behind the
// opaque handle. Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                              rdc_health_response_t *response) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_health_check(group_id, response);
}
// C API entry point: forward rdc_health_clear to the RdcHandler behind the
// opaque handle. Returns RDC_ST_INVALID_HANDLER when the handle is null.
rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) {
  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  if (handler == nullptr) {
    return RDC_ST_INVALID_HANDLER;
  }
  return handler->rdc_health_clear(group_id);
}
char* strncpy_with_null(char* dest, const char* src, size_t n) {
if (n == 0) {
return dest;
+137
Просмотреть файл
@@ -449,5 +449,142 @@ rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(const char job_id[64],
return RDC_ST_OK;
}
// Append a health sample to the cache of a group.
//
// The sample is stored under (gpu_index, value.field_id). The group entry and
// the per-field sample list are created on first use.
//
// @param group_id   Group whose health cache receives the sample.
// @param gpu_index  GPU the sample belongs to.
// @param value      Source of the sample's timestamp, value and type.
// @return Always RDC_ST_OK.
rdc_status_t RdcCacheManagerImpl::rdc_health_set(rdc_gpu_group_t group_id,
                                                 uint32_t gpu_index,
                                                 const rdc_field_value& value) {
  std::lock_guard<std::mutex> lock(cache_mutex_);

  const RdcFieldKey key{gpu_index, value.field_id};

  // Set initial values
  RdcCacheEntry sample;
  sample.last_time = value.ts;
  sample.value = value.value;
  sample.type = value.type;

  // operator[] inserts an empty group map / sample list when the key is
  // missing, which covers the "new group" and "new field" cases.
  cache_health_[group_id][key].push_back(sample);
  return RDC_ST_OK;
}
// Look up cached health samples of one field for one GPU of a group.
//
// start_value receives the first cached sample whose timestamp is
// >= start_timestamp; end_value receives the last cached sample whose
// timestamp is <= end_timestamp. Either output may be null to skip it, but
// not both. When the start lookup fails, the end lookup is not attempted.
//
// The cached samples are read in place through a reference, so the sample
// vector is not copied while the cache mutex is held.
//
// @return RDC_ST_BAD_PARAMETER when both outputs are null,
//         RDC_ST_NOT_FOUND when the group, the field, or a matching sample
//         is missing, RDC_ST_OK otherwise.
rdc_status_t RdcCacheManagerImpl::rdc_health_get_values(rdc_gpu_group_t group_id,
                                                        uint32_t gpu_index,
                                                        rdc_field_t field_id,
                                                        uint64_t start_timestamp,
                                                        uint64_t end_timestamp,
                                                        rdc_field_value* start_value,
                                                        rdc_field_value* end_value) {
  if (!start_value && !end_value) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(cache_mutex_);
  auto health_ite = cache_health_.find(group_id);
  if (health_ite == cache_health_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  RdcFieldKey field{gpu_index, field_id};
  auto samples_ite = health_ite->second.find(field);
  if (samples_ite == health_ite->second.end() || samples_ite->second.empty()) {
    return RDC_ST_NOT_FOUND;
  }
  const auto& cache_values = samples_ite->second;

  // Copy one cached entry into a caller-provided rdc_field_value.
  const auto copy_entry = [field_id](const RdcCacheEntry& entry, rdc_field_value* out) {
    out->field_id = field_id;
    out->ts = entry.last_time;
    out->type = entry.type;
    if (entry.type == STRING) {
      strncpy_with_null(out->value.str, entry.value.str, RDC_MAX_STR_LENGTH);
    } else {
      out->value.l_int = entry.value.l_int;
    }
  };

  rdc_status_t result = RDC_ST_OK;
  if (start_value != nullptr) {
    // get start value: first sample at or after start_timestamp
    result = RDC_ST_NOT_FOUND;
    for (auto entry = cache_values.begin(); entry != cache_values.end(); ++entry) {
      if (entry->last_time >= start_timestamp) {
        copy_entry(*entry, start_value);
        result = RDC_ST_OK;
        break;
      }
    }
  }

  if ((RDC_ST_OK == result) && (end_value != nullptr)) {
    // get end value: last sample at or before end_timestamp
    result = RDC_ST_NOT_FOUND;
    for (auto entry = cache_values.rbegin(); entry != cache_values.rend(); ++entry) {
      if (entry->last_time <= end_timestamp) {
        copy_entry(*entry, end_value);
        result = RDC_ST_OK;
        break;
      }
    }
  }
  return result;
}
// Removes every cached health sample of group_id (no-op when the group has no
// cached data). Runs under cache_mutex_ and always returns RDC_ST_OK.
rdc_status_t RdcCacheManagerImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
  const std::lock_guard<std::mutex> lock(cache_mutex_);
  const auto group_pos = cache_health_.find(group_id);
  if (group_pos != cache_health_.end()) {
    cache_health_.erase(group_pos);
  }
  return RDC_ST_OK;
}
// Appends a new sample for (gpu_index, value.field_id) to an existing health
// cache slot of group_id, discarding the oldest samples so that the slot never
// holds more than HEALTH_MAX_KEEP_SAMPLES entries after the append.
//
// Returns RDC_ST_NOT_FOUND when the group or the field slot does not exist
// (this function never creates them), RDC_ST_OK otherwise.
rdc_status_t RdcCacheManagerImpl::rdc_update_health_stats(rdc_gpu_group_t group_id,
                                                          uint32_t gpu_index,
                                                          const rdc_field_value& value) {
  std::lock_guard<std::mutex> guard(cache_mutex_);

  auto health_ite = cache_health_.find(group_id);
  if (health_ite == cache_health_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  const RdcFieldKey field{gpu_index, value.field_id};
  auto samples_ite = health_ite->second.find(field);
  if (samples_ite == health_ite->second.end()) {
    return RDC_ST_NOT_FOUND;
  }

  auto& cache_values = samples_ite->second;

  // Make room for the new sample. The arithmetic stays in size_t and is only
  // done when there is something to drop, so it cannot wrap around or be
  // narrowed into a signed int.
  const size_t max_keep = static_cast<size_t>(HEALTH_MAX_KEEP_SAMPLES);
  if (cache_values.size() >= max_keep) {
    size_t drop = cache_values.size() - max_keep + 1;
    if (drop > cache_values.size()) {
      drop = cache_values.size();  // only reachable when max_keep is 0
    }
    cache_values.erase(cache_values.begin(), cache_values.begin() + drop);
  }

  RdcCacheEntry entry;
  entry.last_time = value.ts;
  entry.value = value.value;
  entry.type = value.type;
  cache_values.push_back(entry);

  return RDC_ST_OK;
}
} // namespace rdc
} // namespace amd
+34 -1
Просмотреть файл
@@ -79,7 +79,7 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
metric_fetcher_(new RdcMetricFetcherImpl()),
rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
rdc_notif_(new RdcNotificationImpl()),
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)),
watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_, rdc_module_mgr_, rdc_notif_)),
metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)),
policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)) {
if (mode == RDC_OPERATION_MODE_AUTO) {
@@ -460,5 +460,38 @@ rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id)
return policy_->rdc_policy_unregister(group_id);
}
// Health API
// Enables health watches for group_id. `components` is a bit mask of the
// health components to watch and must not be 0 (RDC_ST_BAD_PARAMETER).
// The request is otherwise delegated to the watch table.
rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id,
                                                unsigned int components) {
  const bool nothing_selected = (components == 0);
  if (nothing_selected) {
    return RDC_ST_BAD_PARAMETER;
  }
  return watch_table_->rdc_health_set(group_id, components);
}
// Retrieves the component mask currently watched for group_id into
// *components. Returns RDC_ST_BAD_PARAMETER for a null output pointer;
// otherwise delegates to the watch table.
rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id,
                                                unsigned int* components) {
  if (!components) {
    return RDC_ST_BAD_PARAMETER;
  }
  return watch_table_->rdc_health_get(group_id, components);
}
// Runs the health check of group_id and stores the outcome in *response.
// Returns RDC_ST_BAD_PARAMETER for a null response pointer; otherwise
// delegates to the watch table.
rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id,
                                                  rdc_health_response_t* response) {
  if (!response) {
    return RDC_ST_BAD_PARAMETER;
  }
  return watch_table_->rdc_health_check(group_id, response);
}
// Removes the health watches of group_id by delegating to the watch table and
// returning its status unchanged.
rdc_status_t RdcEmbeddedHandler::rdc_health_clear(rdc_gpu_group_t group_id) {
  const rdc_status_t status = watch_table_->rdc_health_clear(group_id);
  return status;
}
} // namespace rdc
} // namespace amd
+60
Просмотреть файл
@@ -804,6 +804,66 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
read_gpu_metrics_uint64_t();
break;
case RDC_HEALTH_XGMI_ERROR: {
amdsmi_xgmi_status_t status;
ret = amdsmi_gpu_xgmi_error_status(processor_handle, &status);
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(status);
}
break;
}
case RDC_HEALTH_PCIE_REPLAY_COUNT: {
amdsmi_pcie_info_t pcie_info;
ret = amdsmi_get_pcie_info(processor_handle, &pcie_info);
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(pcie_info.pcie_metric.pcie_replay_count);
}
break;
}
case RDC_HEALTH_RETIRED_PAGE_NUM:
case RDC_HEALTH_PENDING_PAGE_NUM: {
uint32_t num_pages = 0;
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, nullptr);
if (AMDSMI_STATUS_SUCCESS == ret) {
if (RDC_HEALTH_RETIRED_PAGE_NUM == field_id) {
value->status = Smi2RdcError(ret);
value->type = INTEGER;
value->value.l_int = static_cast<int64_t>(num_pages);
break;
}
if ((0 < num_pages) &&
(RDC_HEALTH_PENDING_PAGE_NUM == field_id)) {
std::vector<amdsmi_retired_page_record_t> bad_page_info(num_pages);
ret = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages,
bad_page_info.data());
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (AMDSMI_STATUS_SUCCESS == ret) {
uint64_t pending_page_num = 0;
for (uint32_t i=0; i < num_pages; i++) {
if (AMDSMI_MEM_PAGE_STATUS_PENDING == bad_page_info[i].status)
pending_page_num++;
}
value->value.l_int = static_cast<int64_t>(pending_page_num);
}
}
} else
value->status = Smi2RdcError(ret);
break;
}
case RDC_HEALTH_RETIRED_PAGE_LIMIT:
case RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT:
case RDC_HEALTH_POWER_THROTTLE_TIME: //gpu_metrics 1.6
case RDC_HEALTH_THERMAL_THROTTLE_TIME: //gpu_metrics 1.6
default:
break;
}
+3
Просмотреть файл
@@ -180,6 +180,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
RDC_EVNT_XGMI_1_THRPUT, RDC_EVNT_XGMI_2_THRPUT, RDC_EVNT_XGMI_3_THRPUT,
RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID,
RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY,
RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM,
RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,
RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
};
std::copy(fields.begin(), fields.end(), field_ids);
*field_count = fields.size();
+459
Просмотреть файл
@@ -41,10 +41,12 @@ namespace rdc {
// Constructs the watch table from its collaborators.
//
// Each argument is stored in the member of the matching name:
// group settings, cache manager, metric fetcher, module manager and
// notification object. last_cleanup_time_ starts at 0.
RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
const RdcCacheManagerPtr& cache_mgr,
const RdcMetricFetcherPtr& metric_fetcher,
const RdcModuleMgrPtr& module_mgr,
const RdcNotificationPtr& notif)
: group_settings_(group_settings),
cache_mgr_(cache_mgr),
metric_fetcher_(metric_fetcher),
rdc_module_mgr_(module_mgr),
notifications_(notif),
last_cleanup_time_(0) {}
@@ -373,6 +375,423 @@ rdc_status_t RdcWatchTableImpl::rdc_field_unwatch(rdc_gpu_group_t group_id,
return update_field_in_table_when_unwatch(ite->first);
}
// Translates a health component bit mask into the list of fields that must be
// watched and creates a field group named "health-field-group" for them.
//
// components      bit mask of RDC_HEALTH_WATCH_* values.
// field_group_id  receives the id of the created field group.
//
// Returns RDC_ST_BAD_PARAMETER when the mask selects no field at all (this
// includes a mask that only contains RDC_HEALTH_WATCH_INFOROM, which currently
// contributes no field); otherwise the status of rdc_group_field_create().
rdc_status_t RdcWatchTableImpl::create_health_field_group(unsigned int components,
                                                          rdc_field_grp_t* field_group_id) {
  const auto selected = [components](unsigned int mask) { return (components & mask) != 0; };

  // Collect the field ids, keeping the order PCIe, XGMI, memory, thermal, power.
  std::vector<rdc_field_t> watched_fields;
  if (selected(RDC_HEALTH_WATCH_PCIE)) {
    watched_fields.push_back(RDC_HEALTH_PCIE_REPLAY_COUNT);
  }
  if (selected(RDC_HEALTH_WATCH_XGMI)) {
    watched_fields.push_back(RDC_HEALTH_XGMI_ERROR);
  }
  if (selected(RDC_HEALTH_WATCH_MEM)) {
    watched_fields.push_back(RDC_FI_ECC_UNCORRECT_TOTAL);
    watched_fields.push_back(RDC_HEALTH_RETIRED_PAGE_NUM);
    watched_fields.push_back(RDC_HEALTH_PENDING_PAGE_NUM);
    watched_fields.push_back(RDC_HEALTH_RETIRED_PAGE_LIMIT);
    watched_fields.push_back(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT);
  }
  // RDC_HEALTH_WATCH_INFOROM: no field is associated with it yet.
  if (selected(RDC_HEALTH_WATCH_THERMAL)) {
    watched_fields.push_back(RDC_HEALTH_THERMAL_THROTTLE_TIME);
  }
  if (selected(RDC_HEALTH_WATCH_POWER)) {
    watched_fields.push_back(RDC_HEALTH_POWER_THROTTLE_TIME);
  }

  if (watched_fields.empty()) {
    RDC_LOG(RDC_ERROR, "Fail to health set. The components must contain at least one watch.");
    return RDC_ST_BAD_PARAMETER;
  }

  const std::string group_name("health-field-group");
  return group_settings_->rdc_group_field_create(watched_fields.size(), watched_fields.data(),
                                                 group_name.c_str(), field_group_id);
}
// Starts health monitoring of `components` (RDC_HEALTH_WATCH_* bit mask) for
// the GPU group group_id.
//
// Steps: drop any previous health watch of the group, create a field group for
// the requested components, register it in the health watch table, seed the
// health cache with one freshly fetched value per field, then watch the fields
// with a 1 second update interval.
//
// Returns the failing status of the field-group creation or field lookup, or
// otherwise the status of rdc_field_watch(). A failure while seeding the cache
// is logged and stops the seeding, but does not change the return value.
rdc_status_t RdcWatchTableImpl::rdc_health_set(rdc_gpu_group_t group_id,
                                               unsigned int components) {
  // Remove the old health watch of the same group. The status is intentionally
  // ignored: rdc_health_clear() reports RDC_ST_NOT_FOUND when there was none.
  rdc_health_clear(group_id);

  // Create a field group based on the components.
  rdc_field_grp_t field_group_id;
  rdc_status_t result = create_health_field_group(components, &field_group_id);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Resolve the (gpu, field) keys of the group.
  std::vector<RdcFieldKey> fields_in_watch;
  result = get_fields_from_group(group_id, field_group_id, fields_in_watch);
  if (result != RDC_ST_OK) {
    // Nothing references the freshly created field group yet; destroy it so a
    // failed call does not leave it behind.
    group_settings_->rdc_group_field_destroy(field_group_id);
    return result;
  }

  // Register in the health watch table.
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    HealthWatchTableEntry hentry{components, field_group_id, fields_in_watch};
    health_watch_table_.insert({group_id, hentry});
  }

  // Seed the cache with the initial value of every field.
  for (const auto& key : fields_in_watch) {
    rdc_field_value value{};
    result = metric_fetcher_->fetch_smi_field(key.first, key.second, &value);
    if (result == RDC_ST_OK) {
      result = cache_mgr_->rdc_health_set(group_id, key.first, value);
    }
    if (result != RDC_ST_OK) {
      RDC_LOG(RDC_ERROR, "Fail to set initial health value of gpu: "
                             << key.first << " field: " << key.second
                             << ". Return: " << result);
      break;
    }
  }

  // Start to watch the fields and update them once per second.
  return rdc_field_watch(group_id, field_group_id, 1000000, 0, 0);
}
// Reports which health components are watched for group_id.
//
// *components receives the stored bit mask, or 0 when the group has no entry
// in the health watch table. Returns RDC_ST_BAD_PARAMETER for a null output
// pointer and RDC_ST_OK otherwise.
rdc_status_t RdcWatchTableImpl::rdc_health_get(rdc_gpu_group_t group_id,
                                               unsigned int* components) {
  if (components == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const std::lock_guard<std::mutex> lock(watch_mutex_);
  const auto entry = health_watch_table_.find(group_id);
  const bool is_watched = (entry != health_watch_table_.end());
  *components = is_watched ? entry->second.components : 0;
  return RDC_ST_OK;
}
// Fills *incident with the given details and accounts for it in *response.
//
// The response's overall_health is raised to the incident's health when the
// latter is greater, and incidents_count is incremented. Neither pointer is
// checked for null.
//
// Returns true when, after the increment, incidents_count has reached
// HEALTH_MAX_ERROR_ITEMS (no further incident can be recorded), false
// otherwise.
bool RdcWatchTableImpl::add_health_incident(uint32_t gpu_index,
                                            rdc_health_system_t component,
                                            rdc_health_result_t health,
                                            uint32_t err_code,
                                            std::string err_msg,
                                            rdc_health_incidents_t* incident,
                                            rdc_health_response_t* response) {
  incident->gpu_index = gpu_index;
  incident->component = component;
  incident->health = health;
  incident->error.code = err_code;
  strncpy_with_null(incident->error.msg, err_msg.c_str(), MAX_HEALTH_MSG_LENGTH);

  if (response->overall_health < incident->health) {
    response->overall_health = incident->health;
  }

  ++response->incidents_count;
  if (response->incidents_count < HEALTH_MAX_ERROR_ITEMS) {
    return false;
  }

  RDC_LOG(RDC_INFO, "Health incidents are full!");
  return true;
}
// Obtains the two values a health check compares for one field of one GPU:
//
//  - *start_value: the oldest cached health sample taken within the last
//    60 seconds (the cache is queried with "now - 60s" expressed in ms).
//  - *end_value: the current value, fetched directly through the metric
//    fetcher.
//
// Returns RDC_ST_BAD_PARAMETER when either output pointer is null, otherwise
// the first failing status of the cache lookup / fetch, or RDC_ST_OK.
rdc_status_t RdcWatchTableImpl::get_start_end_values(rdc_gpu_group_t group_id,
                                                     uint32_t gpu_index,
                                                     rdc_field_t field,
                                                     rdc_field_value* start_value,
                                                     rdc_field_value* end_value) {
  if (start_value == nullptr || end_value == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // History window: the last minute, as a millisecond timestamp.
  const uint64_t window_start = static_cast<uint64_t>(time(nullptr) - 60) * 1000;

  // Only the start value is requested from the cache (end pointer is null).
  rdc_status_t status = cache_mgr_->rdc_health_get_values(group_id, gpu_index, field,
                                                          window_start, 0,
                                                          start_value, nullptr);
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " history data. Return: " << status);
    return status;
  }

  // The end value is the live reading.
  status = metric_fetcher_->fetch_smi_field(gpu_index, field, end_value);
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Error get gpu: " << gpu_index << " field: " << field << " current data. Return: " << status);
  }
  return status;
}
// PCIe health check for one GPU: compares the PCIe replay counter between the
// start of the history window and now, and records a WARN incident when the
// increase exceeds PCIE_MAX_REPLAYS_PERMIN.
//
// Returns the error of get_start_end_values() when the values cannot be
// obtained, RDC_ST_MAX_LIMIT when the recorded incident filled the response,
// and RDC_ST_OK otherwise.
rdc_status_t RdcWatchTableImpl::pcie_check(rdc_gpu_group_t group_id,
                                           uint32_t gpu_index,
                                           rdc_health_response_t* response) {
  // Get the field start/end values.
  rdc_field_value start = {}, end = {};
  rdc_status_t result = get_start_end_values(group_id,
                                             gpu_index,
                                             RDC_HEALTH_PCIE_REPLAY_COUNT,
                                             &start,
                                             &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Only an increase counts. If the counter went backwards (for example after
  // it was reset) the difference is treated as 0 instead of being allowed to
  // wrap around to a huge unsigned number and raise a bogus incident.
  const uint64_t pcie_replay_count =
      (end.value.l_int > start.value.l_int)
          ? static_cast<uint64_t>(end.value.l_int - start.value.l_int)
          : 0;

  if (pcie_replay_count > PCIE_MAX_REPLAYS_PERMIN) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    std::string err_msg = "Detected ";
    err_msg += std::to_string(pcie_replay_count);
    err_msg += " PCIe replays per minute exceeding the max limit ";
    err_msg += std::to_string(PCIE_MAX_REPLAYS_PERMIN);
    err_msg += ".";

    // Add the incident; true means the response cannot take any more.
    if (add_health_incident(gpu_index,
                            RDC_HEALTH_WATCH_PCIE,
                            RDC_HEALTH_RESULT_WARN,
                            RDC_FR_PCI_REPLAY_RATE,
                            err_msg,
                            incident,
                            response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  return RDC_ST_OK;
}
// XGMI health check for one GPU: reads the current XGMI error status and
// records a FAIL incident when it is anything other than
// AMDSMI_XGMI_STATUS_NO_ERRORS.
//
// Only the current (end) value is evaluated; the start value is fetched as
// part of get_start_end_values() but not used here.
//
// Returns the error of get_start_end_values() when the values cannot be
// obtained, RDC_ST_MAX_LIMIT when the recorded incident filled the response,
// and RDC_ST_OK otherwise.
rdc_status_t RdcWatchTableImpl::xgmi_check(rdc_gpu_group_t group_id,
                                           uint32_t gpu_index,
                                           rdc_health_response_t* response) {
  // Get the field start/end values.
  rdc_field_value start = {}, end = {};
  rdc_status_t result = get_start_end_values(group_id,
                                             gpu_index,
                                             RDC_HEALTH_XGMI_ERROR,
                                             &start,
                                             &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  const amdsmi_xgmi_status_t status = static_cast<amdsmi_xgmi_status_t>(end.value.l_int);
  if (AMDSMI_XGMI_STATUS_NO_ERRORS != status) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    // "Detected " already ends with a space, so the appended fragment must not
    // start with one (the message used to contain a double space).
    uint32_t err_code = RDC_FR_XGMI_MULTIPLE_ERROR;
    std::string err_msg = "Detected ";
    if (AMDSMI_XGMI_STATUS_ERROR == status) {
      err_msg += "a single XGMI error";
      err_code = RDC_FR_XGMI_SINGLE_ERROR;
    } else {
      err_msg += "multiple XGMI errors";
    }
    err_msg += ".";

    // Add the incident; true means the response cannot take any more.
    if (add_health_incident(gpu_index,
                            RDC_HEALTH_WATCH_XGMI,
                            RDC_HEALTH_RESULT_FAIL,
                            err_code,
                            err_msg,
                            incident,
                            response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  return RDC_ST_OK;
}
// Memory health check for one GPU. Two comparisons are made between the start
// of the history window and now:
//
//  1. RDC_FI_ECC_UNCORRECT_TOTAL: any increase records a FAIL incident.
//  2. RDC_HEALTH_PENDING_PAGE_NUM: any increase records a WARN incident.
//
// Returns the error of get_start_end_values() when a pair of values cannot be
// obtained, RDC_ST_MAX_LIMIT as soon as a recorded incident filled the
// response, and RDC_ST_OK otherwise.
rdc_status_t RdcWatchTableImpl::memory_check(rdc_gpu_group_t group_id,
                                             uint32_t gpu_index,
                                             rdc_health_response_t* response) {
  // Increase from `from` to `to`. A decrease yields 0 rather than wrapping
  // around to a huge unsigned value, which would be reported as an incident.
  const auto increase = [](const rdc_field_value& from, const rdc_field_value& to) -> uint64_t {
    return (to.value.l_int > from.value.l_int)
               ? static_cast<uint64_t>(to.value.l_int - from.value.l_int)
               : 0;
  };

  // Get the field start/end values.
  rdc_field_value start = {}, end = {};
  rdc_status_t result = get_start_end_values(group_id,
                                             gpu_index,
                                             RDC_FI_ECC_UNCORRECT_TOTAL,
                                             &start,
                                             &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  const uint64_t ecc_uncorrectable_count = increase(start, end);
  if (ecc_uncorrectable_count > 0) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    std::string err_msg = "Detected ";
    err_msg += std::to_string(ecc_uncorrectable_count);
    err_msg += " uncorrectable ECC error(s) in the last minute.";

    // Add the incident; true means the response cannot take any more.
    if (add_health_incident(gpu_index,
                            RDC_HEALTH_WATCH_MEM,
                            RDC_HEALTH_RESULT_FAIL,
                            RDC_FR_ECC_UNCORRECTABLE_DETECTED,
                            err_msg,
                            incident,
                            response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  result = get_start_end_values(group_id,
                                gpu_index,
                                RDC_HEALTH_PENDING_PAGE_NUM,
                                &start,
                                &end);
  if (result != RDC_ST_OK) {
    return result;
  }

  const uint64_t num_pages = increase(start, end);
  if (num_pages > 0) {
    rdc_health_incidents_t* incident = &response->incidents[response->incidents_count];

    std::string err_msg = "Detected ";
    err_msg += std::to_string(num_pages);
    err_msg += " pending retired page(s).";

    // Add the incident; true means the response cannot take any more.
    if (add_health_incident(gpu_index,
                            RDC_HEALTH_WATCH_MEM,
                            RDC_HEALTH_RESULT_WARN,
                            RDC_FR_PENDING_PAGE_RETIREMENTS,
                            err_msg,
                            incident,
                            response)) {
      return RDC_ST_MAX_LIMIT;
    }
  }

  // TODO: RDC_FR_RETIRED_PAGES_LIMIT
  // TODO: RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT
  return RDC_ST_OK;
}
// Runs every enabled health check for each GPU of group_id and collects the
// incidents in *response.
//
// Flow:
//  1. Snapshot the group's watched components and field keys under
//     watch_mutex_ (RDC_ST_NOT_FOUND when the group has no health watch).
//  2. Query the group's GPU list.
//  3. Refresh the health cache with a current reading of every watched field;
//     the refresh stops at the first failure and that failure is not reported.
//  4. Reset *response to PASS / zero incidents, then per GPU run the PCIe,
//     XGMI and memory checks in that order.
//
// Returns RDC_ST_BAD_PARAMETER for a null response, RDC_ST_MAX_LIMIT as soon
// as a check reports that the response is full, RDC_ST_NOT_SUPPORTED when the
// InfoROM, thermal or power component is enabled (those checks are not
// implemented; this is returned after the first GPU's other checks ran), and
// RDC_ST_OK otherwise. Any other status from an individual check is ignored.
rdc_status_t RdcWatchTableImpl::rdc_health_check(rdc_gpu_group_t group_id,
                                                 rdc_health_response_t* response) {
  if (response == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  unsigned int watched = 0;
  std::vector<RdcFieldKey> watched_fields;
  {
    // Copy what is needed so the lock is not held during the checks.
    std::lock_guard<std::mutex> lock(watch_mutex_);
    const auto entry = health_watch_table_.find(group_id);
    if (entry == health_watch_table_.end()) {
      return RDC_ST_NOT_FOUND;
    }
    watched = entry->second.components;
    watched_fields = entry->second.fields;
  }

  rdc_group_info_t group_info;
  const rdc_status_t info_status = group_settings_->rdc_group_gpu_get_info(group_id, &group_info);
  if (info_status != RDC_ST_OK) {
    return info_status;
  }

  // Push a current sample of every watched field into the cache.
  for (const auto& key : watched_fields) {
    rdc_field_value current;
    if (metric_fetcher_->fetch_smi_field(key.first, key.second, &current) != RDC_ST_OK) {
      break;
    }
    if (cache_mgr_->rdc_update_health_stats(group_id, key.first, current) != RDC_ST_OK) {
      break;
    }
  }

  // Start from a clean response.
  response->overall_health = RDC_HEALTH_RESULT_PASS;
  response->incidents_count = 0;

  for (uint32_t idx = 0; idx < group_info.count; ++idx) {
    // PCIe
    if ((watched & RDC_HEALTH_WATCH_PCIE) &&
        pcie_check(group_id, group_info.entity_ids[idx], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // XGMI
    if ((watched & RDC_HEALTH_WATCH_XGMI) &&
        xgmi_check(group_id, group_info.entity_ids[idx], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // Memory
    if ((watched & RDC_HEALTH_WATCH_MEM) &&
        memory_check(group_id, group_info.entity_ids[idx], response) == RDC_ST_MAX_LIMIT) {
      return RDC_ST_MAX_LIMIT;
    }

    // InfoROM, thermal and power checks are still to do.
    if (watched &
        (RDC_HEALTH_WATCH_INFOROM | RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER)) {
      return RDC_ST_NOT_SUPPORTED;
    }
  }

  return RDC_ST_OK;
}
// Stops health monitoring of group_id and releases everything that
// rdc_health_set() created for it: the field watch, the field group, the
// health watch table entry and the cached health samples.
//
// Returns RDC_ST_NOT_FOUND when the group has no health watch, the error of
// rdc_field_unwatch() when unwatching fails (nothing else is released in that
// case), and otherwise the status of the cache manager's rdc_health_clear().
rdc_status_t RdcWatchTableImpl::rdc_health_clear(rdc_gpu_group_t group_id) {
  rdc_field_grp_t field_group_id;
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    auto health = health_watch_table_.find(group_id);
    if (health == health_watch_table_.end()) {
      return RDC_ST_NOT_FOUND;
    }
    field_group_id = health->second.field_group_id;
  }

  // First unwatch the old fields.
  const rdc_status_t result = rdc_field_unwatch(group_id, field_group_id);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Destroy the old field group (best effort, its status is not reported).
  group_settings_->rdc_group_field_destroy(field_group_id);

  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    health_watch_table_.erase(group_id);
  }

  // Report the outcome of clearing the cache instead of discarding it.
  return cache_mgr_->rdc_health_clear(group_id);
}
bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
std::string& job_id) const {
RdcFieldKey key{gpu_index, field_id};
@@ -388,6 +807,21 @@ bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field
return false;
}
bool RdcWatchTableImpl::is_health_watch_field(uint32_t gpu_index, rdc_field_t field_id,
rdc_gpu_group_t& group_id) const {
RdcFieldKey key{gpu_index, field_id};
for (auto ite = health_watch_table_.begin(); ite != health_watch_table_.end(); ite++) {
auto& fields = ite->second.fields;
if (std::find(fields.begin(), fields.end(), key) != fields.end()) {
group_id = ite->first;
return true;
}
}
return false;
}
rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
void* user_data) {
if (values == nullptr || user_data == nullptr) {
@@ -421,6 +855,12 @@ rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uin
if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) {
watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, job_id, values[i].field_value);
}
// Update the health stats cache
rdc_gpu_group_t group_id;
if (watchTable->is_health_watch_field(gpu_index, field_id, group_id)) {
watchTable->cache_mgr_->rdc_update_health_stats(group_id, gpu_index, values[i].field_value);
}
}
return RDC_ST_OK;
}
@@ -492,6 +932,12 @@ rdc_status_t RdcWatchTableImpl::rdc_notif_update_cache(rdc_evnt_notification_t*
if (is_job_watch_field(gpu_index, field_id, job_id)) {
cache_mgr_->rdc_update_job_stats(gpu_index, job_id, events[i].field);
}
// Update the health stats cache
rdc_gpu_group_t group_id;
if (is_health_watch_field(gpu_index, field_id, group_id)) {
cache_mgr_->rdc_update_health_stats(group_id, gpu_index, events[i].field);
}
}
return RDC_ST_OK;
}
@@ -549,6 +995,7 @@ void RdcWatchTableImpl::debug_status() {
RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size()
<< " watch_table_:" << watch_table_.size()
<< " job_watch_table_:" << job_watch_table_.size()
<< " health_watch_table_:" << health_watch_table_.size()
<< " cache stats:" << cache_mgr_->get_cache_stats());
if (watch_table_.size() > 0) {
@@ -575,6 +1022,18 @@ void RdcWatchTableImpl::debug_status() {
jite->first << ": " << jite->second.group_id << " fields : " << strstream.str());
}
if (health_watch_table_.size() > 0) {
RDC_LOG(RDC_DEBUG, "health watch table details: ");
}
for (auto hite = health_watch_table_.begin(); hite != health_watch_table_.end(); hite++) {
std::stringstream strstream;
for (const auto& p : hite->second.fields) {
strstream << "<" << p.first << "," << p.second << "> ";
}
RDC_LOG(RDC_DEBUG,
"group id : " << hite->first << " components : " << hite->second.components << " fields : " << strstream.str());
}
if (fields_to_watch_.size() > 0) {
RDC_LOG(RDC_DEBUG, "fields to watch details:");
}
+82
Просмотреть файл
@@ -870,5 +870,87 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_unregister(rdc_gpu_group_t group_i
return error_handle(status, reply.status());
}
// Health RdcAPI
// Sends a SetHealth RPC carrying group_id and the component bit mask to the
// server, and returns the combined transport/server status computed by
// error_handle().
rdc_status_t RdcStandaloneHandler::rdc_health_set(rdc_gpu_group_t group_id,
                                                  unsigned int components) {
  ::rdc::SetHealthRequest request;
  ::rdc::SetHealthResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);
  request.set_components(components);

  ::grpc::Status rpc_status = stub_->SetHealth(&context, request, &reply);
  return error_handle(rpc_status, reply.status());
}
// Sends a GetHealth RPC for group_id and, on success, stores the component
// bit mask from the reply in *components.
//
// Returns RDC_ST_BAD_PARAMETER for a null output pointer; otherwise the
// combined transport/server status computed by error_handle(). *components is
// only written when that status is RDC_ST_OK.
rdc_status_t RdcStandaloneHandler::rdc_health_get(rdc_gpu_group_t group_id,
                                                  unsigned int* components) {
  if (components == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::GetHealthRequest request;
  ::rdc::GetHealthResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);
  ::grpc::Status rpc_status = stub_->GetHealth(&context, request, &reply);

  const rdc_status_t outcome = error_handle(rpc_status, reply.status());
  if (outcome == RDC_ST_OK) {
    *components = reply.components();
  }
  return outcome;
}
// Sends a CheckHealth RPC for group_id and converts the reply into *response.
//
// Returns RDC_ST_BAD_PARAMETER for a null response pointer, the combined
// transport/server status from error_handle() when the call failed (in which
// case *response is not modified), and RDC_ST_OK otherwise.
//
// The reply comes from the network, so the number of incidents copied is
// limited to the capacity of response->incidents; surplus incidents in the
// reply are dropped instead of being written past the end of the array.
rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id,
                                                    rdc_health_response_t* response) {
  if (!response) {
    return RDC_ST_BAD_PARAMETER;
  }

  ::rdc::CheckHealthRequest request;
  ::rdc::CheckHealthResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);
  ::grpc::Status status = stub_->CheckHealth(&context, request, &reply);
  rdc_status_t err_status = error_handle(status, reply.status());
  if (err_status != RDC_ST_OK) return err_status;

  // Bind by reference: copying the whole protobuf message is unnecessary.
  const auto& res = reply.response();
  response->overall_health = static_cast<rdc_health_result_t>(res.overall_health());

  // Assumes response->incidents is a fixed-size array member, so sizeof gives
  // its element capacity - TODO confirm against the struct definition.
  const int capacity =
      static_cast<int>(sizeof(response->incidents) / sizeof(response->incidents[0]));
  const int to_copy = (res.incidents_size() < capacity) ? res.incidents_size() : capacity;

  for (int i = 0; i < to_copy; i++) {
    const ::rdc::HealthIncidents& result = res.incidents(i);
    rdc_health_incidents_t& to_result = response->incidents[i];
    to_result.gpu_index = result.gpu_index();
    to_result.component = static_cast<rdc_health_system_t>(result.component());
    to_result.health = static_cast<rdc_health_result_t>(result.health());
    // set error
    to_result.error.code = result.error().code();
    strncpy_with_null(to_result.error.msg, result.error().msg().c_str(), MAX_HEALTH_MSG_LENGTH);
  }

  // Never report more incidents than were actually copied above.
  const uint32_t reported = res.incidents_count();
  const uint32_t copied = static_cast<uint32_t>(to_copy);
  response->incidents_count = (reported < copied) ? reported : copied;

  return RDC_ST_OK;
}
// Sends a ClearHealth RPC for group_id and returns the combined
// transport/server status computed by error_handle().
rdc_status_t RdcStandaloneHandler::rdc_health_clear(rdc_gpu_group_t group_id) {
  ::rdc::ClearHealthRequest request;
  ::rdc::ClearHealthResponse reply;
  ::grpc::ClientContext context;

  request.set_group_id(group_id);
  ::grpc::Status rpc_status = stub_->ClearHealth(&context, request, &reply);

  // RDC_ST_OK and every error status are passed through unchanged.
  return error_handle(rpc_status, reply.status());
}
} // namespace rdc
} // namespace amd
+1
Просмотреть файл
@@ -68,6 +68,7 @@ set(RDCI_SRC_LIST
"${SRC_DIR}/RdciGroupSubSystem.cc"
"${SRC_DIR}/RdciStatsSubSystem.cc"
"${SRC_DIR}/RdciPolicySubSystem.cc"
"${SRC_DIR}/RdciHealthSubSystem.cc"
"${SRC_DIR}/RdciSubSystem.cc"
"${SRC_DIR}/rdci.cc")
message("RDCI_SRC_LIST=${RDCI_SRC_LIST}")
+77
Просмотреть файл
@@ -0,0 +1,77 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
#define RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
#include <signal.h>
#include <string>
#include "RdciSubSystem.h"
namespace amd {
namespace rdc {
// rdci "health" sub-command: parses its command line and manages the health
// watches of a GPU group.
//
// NOTE(review): std::vector is used in handle_one_component() below, but the
// includes visible above this class are <signal.h>, <string> and
// "RdciSubSystem.h" only - <vector> should be included explicitly unless
// RdciSubSystem.h is guaranteed to provide it.
class RdciHealthSubSystem : public RdciSubSystem {
public:
RdciHealthSubSystem();
~RdciHealthSubSystem();
// Parses the sub-command's options into health_ops_, group_id_ and components_.
void parse_cmd_opts(int argc, char** argv) override;
// Entry point inherited from RdciSubSystem; presumably dispatches on
// health_ops_ - implementation not shown here.
void process() override;
private:
// Prints the usage text of the sub-command (nothing in JSON output mode).
void show_help() const;
// Queries the watched components of group_id_ and prints them.
void get_watches() const;
// The next three presumably implement the set / check / clear operations -
// implementations not shown here.
void set_watches() const;
void health_check() const;
void health_clear() const;
// Presumably convert a health result / component to display text - TODO confirm.
std::string health_string(rdc_health_result_t health) const;
std::string component_string(rdc_health_system_t component) const;
void output_errstr(const std::string& input) const;
// Helpers that walk the incidents of a response starting at start_index;
// the meaning of the returned unsigned int is not visible here - TODO confirm.
unsigned int handle_one_component(rdc_health_response_t &response,
unsigned int start_index,
uint32_t gpu_index,
rdc_health_system_t component,
rdc_health_result_t &component_health,
std::vector<std::string> &err_str) const;
unsigned int handle_one_gpu(rdc_health_response_t &response,
unsigned int start_index,
uint32_t gpu_index) const;
// Operation selected on the command line.
enum OPERATIONS {
HEALTH_UNKNOWN = 0,
HEALTH_HELP,
HEALTH_FETCH,
HEALTH_SET,
HEALTH_CHECK,
HEALTH_CLEAR,
} health_ops_;
// GPU group given with -g/--group.
rdc_gpu_group_t group_id_;
// Bit mask of RDC_HEALTH_WATCH_* components given with -s/--set.
unsigned int components_;
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCIHEALTHSUBSYSTEM_H_
+557
Просмотреть файл
@@ -0,0 +1,557 @@
/*
Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "RdciHealthSubSystem.h"
#include <getopt.h>
#include <signal.h>
#include <unistd.h>
#include <ctime>
#include <iomanip>
#include <limits>
#include "common/rdc_utils.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Starts with no operation selected, group 0 and an empty component mask.
// Without these initialisers the members would be indeterminate whenever the
// command line does not set them (for example when no operation option is
// given), and reading them would be undefined behaviour.
RdciHealthSubSystem::RdciHealthSubSystem()
    : health_ops_(HEALTH_UNKNOWN), group_id_(0), components_(0) {}
// Nothing to release explicitly; the compiler-generated destructor suffices.
RdciHealthSubSystem::~RdciHealthSubSystem() = default;
void RdciHealthSubSystem::parse_cmd_opts(int argc, char** argv) {
const int HOST_OPTIONS = 1000;
const int JSON_OPTIONS = 1001;
const int CLEAR_OPTIONS = 1002;
const struct option long_options[] = {{"host", required_argument, nullptr, HOST_OPTIONS},
{"unauth", optional_argument, nullptr, 'u'},
{"help", optional_argument, nullptr, 'h'},
{"json", optional_argument, nullptr, JSON_OPTIONS},
{"clear", optional_argument, nullptr, CLEAR_OPTIONS},
{"group", required_argument, nullptr, 'g'},
{"fetch", optional_argument, nullptr, 'f'},
{"set", required_argument, nullptr, 's'},
{"check", optional_argument, nullptr, 'c'},
{nullptr, 0, nullptr, 0}};
bool group_id_set = false;
int option_index = 0, opt = 0;
std::string flags;
unsigned int components = 0;
while ((opt = getopt_long(argc, argv, "uhg:fs:c", long_options, &option_index)) != -1) {
switch (opt) {
case HOST_OPTIONS:
ip_port_ = optarg;
break;
case JSON_OPTIONS:
set_json_output(true);
break;
case CLEAR_OPTIONS:
health_ops_ = HEALTH_CLEAR;
break;
case 'u':
use_auth_ = false;
break;
case 'h':
health_ops_ = HEALTH_HELP;
return;
case 'g':
if (!IsNumber(optarg)) {
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER, "The group id needs to be a number");
}
group_id_ = std::stoi(optarg);
group_id_set = true;
break;
case 'f':
health_ops_ = HEALTH_FETCH;
break;
case 's':
health_ops_ = HEALTH_SET;
flags = optarg;
for (unsigned int i = 0; i < flags.length(); i++) {
switch (flags.at(i)) {
case 'a':
components |= RDC_HEALTH_WATCH_PCIE;
components |= RDC_HEALTH_WATCH_XGMI;
components |= RDC_HEALTH_WATCH_MEM;
//To do:
//components |= RDC_HEALTH_WATCH_INFOROM;
//components |= RDC_HEALTH_WATCH_THERMAL;
//components |= RDC_HEALTH_WATCH_POWER;
break;
case 'p':
components |= RDC_HEALTH_WATCH_PCIE;
break;
case 'm':
components |= RDC_HEALTH_WATCH_MEM;
break;
case 'i':
//To do:
//components |= RDC_HEALTH_WATCH_INFOROM;
throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
break;
case 't':
//To do:
//components |= RDC_HEALTH_WATCH_THERMAL;
//components |= RDC_HEALTH_WATCH_POWER;
throw RdcException(RDC_ST_NOT_SUPPORTED, "Not supported");
break;
case 'x':
components |= RDC_HEALTH_WATCH_XGMI;
break;
default:
throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid flags");
break;
}
}
if (0 == components) {
throw RdcException(RDC_ST_BAD_PARAMETER, "No flags");
} else
components_ = components;
break;
case 'c':
health_ops_ = HEALTH_CHECK;
break;
default:
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options");
}
}
if (!group_id_set) {
show_help();
throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the GPU group id");
}
}
// Prints the usage text of "rdci health" to standard output.
// Prints nothing when JSON output is enabled.
void RdciHealthSubSystem::show_help() const {
  if (is_json_output()) return;

  std::cout << " health -- Used to manage the health watches of a group. \n"
            << " The health of the GPUs in a group can then be monitored"
            << " during a process.\n\n";
  std::cout << "Usage\n";
  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -s <flags>\n";
  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -c\n";
  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> -f\n";
  // Fixed typo in the usage line: "prot" -> "port".
  std::cout << " rdci health [--host <IP/FQDN>:port] [-u] [-j] -g <groupId> --clear\n";
  std::cout << "\nFlags:\n";
  show_common_usage();
  std::cout << " --json Output using json.\n";
  std::cout << " --clear Disable all watches being monitored.\n";
  std::cout << " -g --group groupId The GPU group to query "
            << "on the specified host.\n";
  std::cout << " -f --fetch Fetch the current watch status.\n";
  std::cout << " -s --set flags The list of components can be watched. "
            << "[default = pm]\n";
  std::cout << " a - watch all components\n";
  std::cout << " p - watch PCIe\n";
  std::cout << " m - watch Memory\n";
  // Not supported yet, so deliberately not advertised:
  //std::cout << " i - watch infoROM\n";
  //std::cout << " t - watch power and thermal\n";
  std::cout << " x - watch XGMI\n";
  std::cout << " -c --check Check to see if any errors or warnings have "
            << "occurred in the currently monitored watches.\n";
}
void RdciHealthSubSystem::get_watches() const {
rdc_status_t result;
unsigned int components = 0;
std::string on = "On";
std::string off = "Off";
result = rdc_health_get(rdc_handle_, group_id_, &components);
if (result != RDC_ST_OK) {
std::string error_msg = rdc_status_string(result);
if (result == RDC_ST_NOT_FOUND) {
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
}
throw RdcException(result, error_msg.c_str());
}
if (is_json_output()) {
std::cout << "\"heading\" : \"Health monitor systems status\", ";
std::cout << "\"body\" : [";
std::cout << "{\"Component\" : \"PCIe\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"XGMI\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Memory\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "\"},";
/*std::cout << "{\"Component\" : \"InfoROM\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Thermal\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "\"},";
std::cout << "{\"Component\" : \"Power\", \"Status\" : \"" << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "\"}";*/
std::cout << "]";
} else {
std::cout << "Health monitor systems status:" << std::endl;
std::cout << "+--------------------+" //"-" width :20
<< "---------------------------------------------------+\n"; //-" width :51
std::cout << "|" << std::setw(20) << std::left << " PCIe" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_PCIE) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " XGMI" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_XGMI) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Memory" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_MEM) ? on : off).c_str() << "|\n";
/*std::cout << "|" << std::setw(20) << std::left << " InfoROM" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_INFOROM) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Thermal" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_THERMAL) ? on : off).c_str() << "|\n";
std::cout << "|" << std::setw(20) << std::left << " Power" << "| "
<< std::setw(50) << std::left << ((components & RDC_HEALTH_WATCH_POWER) ? on : off).c_str() << "|\n";*/
std::cout << "+--------------------+" //"-" width :20
<< "---------------------------------------------------+\n"; //-" width :51
}
}
void RdciHealthSubSystem::set_watches() const {
rdc_status_t result;
result = rdc_health_set(rdc_handle_, group_id_, components_);
if (result != RDC_ST_OK) {
std::string error_msg = rdc_status_string(result);
if (result == RDC_ST_NOT_FOUND) {
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
}
throw RdcException(result, error_msg.c_str());
}
std::cout << "Group " << group_id_ << " health monitor systems set successfully." << std::endl;
}
// Maps a health result code to the label used in the report:
// "Pass", "Warning" or "Fail"; any other value yields "Unknown".
std::string RdciHealthSubSystem::health_string(rdc_health_result_t health) const {
  if (health == RDC_HEALTH_RESULT_PASS) {
    return "Pass";
  }
  if (health == RDC_HEALTH_RESULT_WARN) {
    return "Warning";
  }
  if (health == RDC_HEALTH_RESULT_FAIL) {
    return "Fail";
  }
  return "Unknown";
}
// Maps a health component to the label prefix used in the report.
// Each known label ends with ": " because handle_one_gpu appends the health
// text directly after it (component_str + health_str). Values that are not a
// known component yield "Unknown".
std::string RdciHealthSubSystem::component_string(rdc_health_system_t component) const {
  switch (component) {
    case RDC_HEALTH_WATCH_PCIE:
      return "PCIe system: ";
    case RDC_HEALTH_WATCH_XGMI:
      return "XGMI system: ";
    case RDC_HEALTH_WATCH_MEM:
      return "Memory system: ";
    case RDC_HEALTH_WATCH_INFOROM:
      return "Inforom system: ";
    case RDC_HEALTH_WATCH_THERMAL:
      // Fixed: the trailing space was missing, which rendered as
      // "Thermal system:Pass" unlike every other component.
      return "Thermal system: ";
    case RDC_HEALTH_WATCH_POWER:
      return "Power system: ";
    default:
      return "Unknown";
  }
}
// Prints `input` inside the right-hand column of the health report table,
// word-wrapping it so that each row holds fewer than 60 characters where
// possible. Words are split on whitespace; runs of whitespace collapse to a
// single space. A single word longer than the column is printed on its own
// row without being truncated. Nothing is printed for an empty or
// whitespace-only input.
void RdciHealthSubSystem::output_errstr(const std::string& input) const {
  static constexpr int kWidth = 60;  // width of the message column

  // Writes one table row: an empty 20-character left cell plus the text.
  auto emit_row = [](const std::string& text) {
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(kWidth) << std::left << text << "|\n";
  };

  std::string word, line_str;
  std::istringstream iss(input);
  while (iss >> word) {
    if (line_str.empty()) {
      // Fixed: start the row with the word even when the word alone is wider
      // than the column. The previous code flushed the still-empty row first,
      // which printed a blank table row.
      line_str = word;
    } else if (line_str.size() + word.size() >= static_cast<std::size_t>(kWidth)) {
      // The word (plus its separating space) no longer fits: flush and wrap.
      emit_row(line_str);
      line_str = word;
    } else {
      line_str += " ";
      line_str += word;
    }
  }  // end while
  if (!line_str.empty()) emit_row(line_str);
}
// Consumes the run of consecutive incidents, beginning at start_index, that
// belong to the given gpu_index and component.
//
// For every incident in the run:
//  - component_health is raised to the incident's health when that is greater;
//  - the incident's error message, prefixed with " - ", is appended to err_str.
//
// Returns the number of incidents in the run (0 when start_index is past the
// end or the first incident does not match).
unsigned int RdciHealthSubSystem::handle_one_component(rdc_health_response_t &response,
                                                       unsigned int start_index,
                                                       uint32_t gpu_index,
                                                       rdc_health_system_t component,
                                                       rdc_health_result_t &component_health,
                                                       std::vector<std::string> &err_str) const {
  unsigned int pos = start_index;
  while (pos < response.incidents_count) {
    const rdc_health_incidents_t &entry = response.incidents[pos];

    // The run ends at the first incident of another GPU or another component.
    const bool same_gpu = (entry.gpu_index == gpu_index);
    const bool same_component = (entry.component == component);
    if (!(same_gpu && same_component)) {
      break;
    }

    // Track the most severe health seen for this component.
    if (entry.health > component_health) {
      component_health = entry.health;
    }

    std::string bullet = " - ";
    bullet += entry.error.msg;
    err_str.push_back(bullet);
    ++pos;
  }
  return pos - start_index;
}
// Consumes the run of consecutive incidents, beginning at start_index, that
// belong to gpu_index, groups them by component and prints the health report
// for that GPU (a JSON object in JSON mode, otherwise table rows).
//
// The GPU health is the most severe health among all of its incidents.
// Returns the number of incidents consumed, so the caller can continue with
// the next GPU at start_index + <return value>.
unsigned int RdciHealthSubSystem::handle_one_gpu(rdc_health_response_t &response,
                                                 unsigned int start_index,
                                                 uint32_t gpu_index) const {
  // Aggregated health and collected messages of one component.
  struct ComponentDetail {
    rdc_health_result_t component_health;
    std::vector<std::string> err_str;
  };
  std::map<rdc_health_system_t, ComponentDetail> component_detail_map;

  rdc_health_result_t gpu_health = RDC_HEALTH_RESULT_PASS;
  unsigned int count = 0;

  unsigned int i = start_index;
  while (i < response.incidents_count) {
    const rdc_health_incidents_t &incident = response.incidents[i];
    // same GPU Index
    if (incident.gpu_index != gpu_index) break;

    // handle the run of incidents that share this component
    ComponentDetail detail;
    detail.component_health = RDC_HEALTH_RESULT_PASS;
    const unsigned int comp_count =
        handle_one_component(response, i, gpu_index, incident.component,
                             detail.component_health, detail.err_str);
    // The incident at i always matches itself, so this is not expected to
    // trigger; it guards against looping forever if that ever changes.
    if (comp_count == 0) break;

    // Fixed: derive the GPU health from the aggregated component health.
    // Previously only the first incident of each run was inspected, so a more
    // severe incident later in the same run did not raise the GPU health.
    if (detail.component_health > gpu_health) gpu_health = detail.component_health;

    // Add to the component detail map. If the component already has an entry
    // (its incidents were not contiguous), merge instead of dropping the data.
    auto inserted = component_detail_map.insert({incident.component, detail});
    if (!inserted.second) {
      ComponentDetail &existing = inserted.first->second;
      if (detail.component_health > existing.component_health)
        existing.component_health = detail.component_health;
      existing.err_str.insert(existing.err_str.end(), detail.err_str.begin(),
                              detail.err_str.end());
    }

    i += comp_count;
    count += comp_count;
  }

  // output gpu_index health result
  const std::string gpu_health_str = health_string(gpu_health);
  if (is_json_output()) {
    std::cout << "{\"Index\" : \"" << std::to_string(gpu_index) << "\", ";
    std::cout << "\"Health\" : \"" << gpu_health_str << "\", ";
    std::cout << "\"Error\" : [";
    std::size_t comp_printed = 0;
    for (const auto &entry : component_detail_map) {
      const ComponentDetail &detail = entry.second;
      std::cout << "{\"Component\" : \"" << component_string(entry.first) << "\", ";
      std::cout << "\"Health\" : \"" << health_string(detail.component_health) << "\", ";
      std::cout << "\"Message\" : [";
      std::size_t msg_printed = 0;
      for (const std::string &msg : detail.err_str) {
        std::cout << "\"" << msg << "\"";
        ++msg_printed;
        if (msg_printed < detail.err_str.size()) std::cout << ", ";
      }
      std::cout << "]}";  // end Message
      ++comp_printed;
      if (comp_printed < component_detail_map.size()) {
        std::cout << ", ";
      }
    }
    std::cout << "]}";  // end Error
  } else {
    // std::left is set explicitly so the first cell does not depend on the
    // alignment flag left on std::cout by earlier output.
    std::cout << "|" << std::setw(20) << std::left << " GPU ID: " + std::to_string(gpu_index)
              << "| " << std::setw(60) << std::left << gpu_health_str << "|\n";
    std::cout << "|" << std::setw(20) << " " << "| "
              << std::setw(60) << " " << "|\n";
    for (const auto &entry : component_detail_map) {
      const ComponentDetail &detail = entry.second;
      std::cout << "|" << std::setw(20) << " " << "| "
                << std::setw(60) << std::left
                << component_string(entry.first) + health_string(detail.component_health)
                << "|\n";
      for (const std::string &msg : detail.err_str) output_errstr(msg);
      std::cout << "|" << std::setw(20) << " " << "| "
                << std::setw(60) << " " << "|\n";
    }
    std::cout << "+--------------------+-"  //"-" width :20
              << "------------------------------------------------------------+\n";  //-" width :60
  }
  return count;
}
void RdciHealthSubSystem::health_check() const {
unsigned int components = 0;
rdc_status_t result;
rdc_health_response_t response;
result = rdc_health_get(rdc_handle_, group_id_, &components);
if (result != RDC_ST_OK) {
std::string error_msg = rdc_status_string(result);
if (result == RDC_ST_NOT_FOUND) {
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
}
throw RdcException(result, error_msg.c_str());
}
if (0 == components) {
std::string error_msg = "Health watches not enable, please enable watches first.";
throw RdcException(RDC_ST_UNKNOWN_ERROR, error_msg.c_str());
}
result = rdc_health_check(rdc_handle_, group_id_, &response);
if (result != RDC_ST_OK) {
throw RdcException(result, rdc_status_string(result));
}
//output headline
std::string overall_str = health_string(response.overall_health);
if (is_json_output()) {
std::cout << "\"heading\" : \"Health monitor report\", ";
std::cout << "\"body\" : ";
std::cout << "{\"Group\" : \"" << std::to_string(group_id_) << "\", ";
std::cout << "\"Overall Health\" : \"" << overall_str << "\", ";
std::cout << "\"GPU\" : [";
} else {
std::cout << "Health monitor report:" << std::endl;
std::cout << "+--------------------+-" //"-" width :20
<< "------------------------------------------------------------+\n"; //-" width :60
std::cout << "|" << std::setw(20) << std::left << " Group " + std::to_string(group_id_) << "| "
<< std::setw(60) << std::left << "Overall Health: " + overall_str << "|\n";
std::cout << "+====================+=" //"=" width :20
<< "============================================================+\n"; //"=" width :60
}
//output health of per GPU
unsigned int index = 0;
while (index < response.incidents_count) {
uint32_t gpu_index = response.incidents[index].gpu_index;
unsigned int count = handle_one_gpu(response, index, gpu_index);
index += count;
if (is_json_output() && (index < response.incidents_count))
std::cout << ",";
}
if (is_json_output())
std::cout << "]}"; //end Group
}
void RdciHealthSubSystem::health_clear() const {
rdc_status_t result;
result = rdc_health_clear(rdc_handle_, group_id_);
if (result != RDC_ST_OK) {
std::string error_msg = rdc_status_string(result);
if (result == RDC_ST_NOT_FOUND) {
error_msg = "Cannot find the group " + std::to_string(group_id_) + " or the field.";
}
throw RdcException(result, error_msg.c_str());
}
std::cout << "Clear Group " << group_id_ << " all health monitor systems." << std::endl;
}
// Executes the operation selected in health_ops_.
// Fetch, set, check and clear dispatch to their handlers; the help and
// unknown operations, as well as any other value, print the usage text.
void RdciHealthSubSystem::process() {
  switch (health_ops_) {
    case HEALTH_FETCH:
      get_watches();
      return;
    case HEALTH_SET:
      set_watches();
      return;
    case HEALTH_CHECK:
      health_check();
      return;
    case HEALTH_CLEAR:
      health_clear();
      return;
    case HEALTH_HELP:
    case HEALTH_UNKNOWN:
    default:
      break;
  }
  // Everything that is not an actionable operation falls back to the help.
  show_help();
}
} // namespace rdc
} // namespace amd
+4 -1
Просмотреть файл
@@ -31,6 +31,7 @@ THE SOFTWARE.
#include "RdciGroupSubSystem.h"
#include "RdciStatsSubSystem.h"
#include "RdciPolicySubSystem.h"
#include "RdciHealthSubSystem.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/rdc_common.h"
@@ -50,7 +51,7 @@ int main(int argc, char** argv) {
const std::string usage_help =
"Usage:\trdci <subsystem>|<options>\n"
"subsystem: \n"
" discovery, dmon, group, fieldgroup, stats, diag, policy\n"
" discovery, dmon, group, fieldgroup, stats, diag, policy, health\n"
"options: \n"
" -v(--version) : Print client version information only\n";
@@ -81,6 +82,8 @@ int main(int argc, char** argv) {
subsystem.reset(new amd::rdc::RdciGroupSubSystem());
} else if (subsystem_name == "fieldgroup") {
subsystem.reset(new amd::rdc::RdciFieldGroupSubSystem());
} else if (subsystem_name == "health") {
subsystem.reset(new amd::rdc::RdciHealthSubSystem());
} else if (subsystem_name == "stats") {
subsystem.reset(new amd::rdc::RdciStatsSubSystem());
} else if (subsystem_name == "policy") {
+16
Просмотреть файл
@@ -153,6 +153,22 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service {
const ::rdc::UnRegisterPolicyRequest* request,
::rdc::UnRegisterPolicyResponse* reply) override;
::grpc::Status SetHealth(::grpc::ServerContext* context,
const ::rdc::SetHealthRequest* request,
::rdc::SetHealthResponse* reply) override;
::grpc::Status GetHealth(::grpc::ServerContext* context,
const ::rdc::GetHealthRequest* request,
::rdc::GetHealthResponse* reply) override;
::grpc::Status CheckHealth(::grpc::ServerContext* context,
const ::rdc::CheckHealthRequest* request,
::rdc::CheckHealthResponse* reply) override;
::grpc::Status ClearHealth(::grpc::ServerContext* context,
const ::rdc::ClearHealthRequest* request,
::rdc::ClearHealthResponse* reply) override;
private:
bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target);
rdc_handle_t rdc_handle_;
+88
Просмотреть файл
@@ -947,5 +947,93 @@ int RdcAPIServiceImpl::PolicyCallback(rdc_policy_callback_response_t* userData)
return ::grpc::Status::OK;
}
// gRPC handler: forwards the request's group id and component mask to
// rdc_health_set and stores the resulting RDC status in the reply.
// Returns an INTERNAL gRPC status when request or reply is null; otherwise the
// gRPC status is OK and the outcome is carried by the reply's status field.
::grpc::Status RdcAPIServiceImpl::SetHealth(::grpc::ServerContext* context,
                                            const ::rdc::SetHealthRequest* request,
                                            ::rdc::SetHealthResponse* reply) {
  static_cast<void>(context);  // unused
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const rdc_status_t status =
      rdc_health_set(rdc_handle_, request->group_id(), request->components());
  reply->set_status(status);
  return ::grpc::Status::OK;
}
// gRPC handler: asks rdc_health_get for the component mask of the requested
// group. The RDC status is always stored in the reply; the component mask is
// stored only when the status is RDC_ST_OK.
// Returns an INTERNAL gRPC status when request or reply is null.
::grpc::Status RdcAPIServiceImpl::GetHealth(::grpc::ServerContext* context,
                                            const ::rdc::GetHealthRequest* request,
                                            ::rdc::GetHealthResponse* reply) {
  static_cast<void>(context);  // unused
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  unsigned int mask;
  const rdc_status_t status = rdc_health_get(rdc_handle_, request->group_id(), &mask);
  reply->set_status(status);
  if (status == RDC_ST_OK) {
    reply->set_components(mask);
  }
  return ::grpc::Status::OK;
}
// gRPC handler: runs rdc_health_check for the requested group and copies the
// result into the reply.
// The RDC status is always stored in the reply. When it is RDC_ST_OK, the
// overall health, the incident count and every incident (gpu index,
// component, health, error code and error message) are copied as well.
// Returns an INTERNAL gRPC status when request or reply is null.
::grpc::Status RdcAPIServiceImpl::CheckHealth(::grpc::ServerContext* context,
                                              const ::rdc::CheckHealthRequest* request,
                                              ::rdc::CheckHealthResponse* reply) {
  static_cast<void>(context);  // unused
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  rdc_health_response_t health;
  const rdc_status_t status = rdc_health_check(rdc_handle_, request->group_id(), &health);
  reply->set_status(status);
  if (status != RDC_ST_OK) {
    return ::grpc::Status::OK;
  }

  ::rdc::HealthResponse* out = reply->mutable_response();
  out->set_overall_health(health.overall_health);
  out->set_incidents_count(health.incidents_count);

  for (uint32_t idx = 0; idx < health.incidents_count; ++idx) {
    const rdc_health_incidents_t& src = health.incidents[idx];
    ::rdc::HealthIncidents* dst = out->add_incidents();
    dst->set_gpu_index(src.gpu_index);
    dst->set_component(src.component);
    dst->set_health(src.health);

    // nested error message
    auto dst_error = dst->mutable_error();
    dst_error->set_code(src.error.code);
    dst_error->set_msg(src.error.msg);
  }
  return ::grpc::Status::OK;
}
// gRPC handler: forwards the request's group id to rdc_health_clear and
// stores the resulting RDC status in the reply.
// Returns an INTERNAL gRPC status when request or reply is null; otherwise the
// gRPC status is OK and the outcome is carried by the reply's status field.
::grpc::Status RdcAPIServiceImpl::ClearHealth(::grpc::ServerContext* context,
                                              const ::rdc::ClearHealthRequest* request,
                                              ::rdc::ClearHealthResponse* reply) {
  static_cast<void>(context);  // unused
  if (request == nullptr || reply == nullptr) {
    return ::grpc::Status(::grpc::StatusCode::INTERNAL, "Empty contents");
  }

  const rdc_status_t status = rdc_health_clear(rdc_handle_, request->group_id());
  reply->set_status(status);
  return ::grpc::Status::OK;
}
} // namespace rdc
} // namespace amd