LINT: Add cpplint, clang-format and pre-commit support

Change-Id: I3cbb787ef27d90486b212dfb1a8c77c460acc2ac
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: 434e40305d]
此提交包含在:
Galantsev, Dmitrii
2023-12-04 15:24:34 -06:00
父節點 61cf14d7cc
當前提交 ea624cbb7c
共有 137 個檔案被更改,包括 9122 行新增10469 行删除
+9
查看文件
@@ -0,0 +1,9 @@
---
Language: Cpp
BasedOnStyle: Google
ColumnLimit: 100
# Force pointers to bind to the type for C++ (e.g. `int* p`).
# Google style derives pointer alignment from each file by default, so turn that off explicitly.
DerivePointerAlignment: false
PointerAlignment: Left
+1 -1
查看文件
@@ -13,7 +13,7 @@ end_of_line = lf
[*.{c,cc,cpp,h,hh,hpp}]
charset = utf-8
indent_style = space
indent_size = 4
indent_size = 2
[*.py]
indent_style = space
+4
查看文件
@@ -17,3 +17,7 @@ docs/_doxygen/
# VisualStudioCode
.vscode/
# do NOT ignore these files
!.clang-format
!.editorconfig
+30
查看文件
@@ -0,0 +1,30 @@
# - How to use:
# python3 -m pip install pre-commit
# pre-commit install --install-hooks
# Upon a new commit - the hooks should automagically run
#
# - How to skip:
# git commit --no-verify
# or
# SKIP=clang-format-docker git commit
# SKIP=cpplint-docker git commit
fail_fast: false
repos:
# For portability I decided to use Docker containers
- repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint
rev: 0.0.3
hooks:
- id: clang-format-docker
- id: cpplint-docker
# Below is a local way of running formatters and linters
# NOTE: clang-tidy is not used in the above tests
# - repo: https://github.com/pocc/pre-commit-hooks
# rev: v1.3.5
# hooks:
# - id: clang-format
# args: [--no-diff, -i]
# - id: clang-tidy
# args: [-p=build, --quiet]
# - id: cpplint
# args: [--verbose=5]
+3
查看文件
@@ -0,0 +1,3 @@
set noparent
linelength=100
filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard
+21 -34
查看文件
@@ -28,6 +28,7 @@ THE SOFTWARE.
#include <memory>
#include <string>
#include "rocm_smi/rocm_smi.h"
/**
@@ -190,7 +191,6 @@ typedef enum {
RDC_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
} rdc_status_t;
/**
* @brief Handle to RDC server channel
*/
@@ -232,10 +232,8 @@ typedef uintptr_t rdc_channel_t;
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t
rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
grpc_connectivity_state *state);
rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
grpc_connectivity_state* state);
/**
* @brief Verify a channel's connection to the server
@@ -252,8 +250,7 @@ rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t
rdc_channel_connection_verify(rdc_channel_t channel);
rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel);
/** @} */ // end of RDCAdmin
@@ -267,7 +264,7 @@ rdc_channel_connection_verify(rdc_channel_t channel);
/**
* @brief Create a communications channel to an RDC server
*
* @details Given a pointer to an ::rdc_channel_t @p channel, a string
* @details Given a pointer to an ::rdc_channel_t @p channel, a string
* containing the ip address of the server @p ip, a string containing
* the port number on which the server is listening @p port and a bool
* indicating whether the channel should use a secure link @p secure,
@@ -290,9 +287,8 @@ rdc_channel_connection_verify(rdc_channel_t channel);
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t
rdc_channel_create(rdc_channel_t *channel, const char *ip, const char *port,
bool secure);
rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port,
bool secure);
/**
* @brief Destroy a communications channel to an RDC server
@@ -305,13 +301,12 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip, const char *port,
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
*
*/
rdc_status_t
rdc_channel_destroy(rdc_channel_t channel);
rdc_status_t rdc_channel_destroy(rdc_channel_t channel);
/** @} */ // end of InitShutAdmin
/*****************************************************************************/
/** @defgroup RSMIAccess Remote ROCm SMI Calls
/** @defgroup RSMIAccess Remote ROCm SMI Calls
* These function calls make ROCm SMI function calls on the remote server.
* Please refer to the
* [ROCm SMI documentation]
@@ -319,10 +314,10 @@ rdc_channel_destroy(rdc_channel_t channel);
* information about the calls. Here, we will document any additional aspects
* of the calls introduced by RDC that are not covered in the ROCm SMI
* documentation.
*
*
* All of the functions in this section attempt to make an RSMI call on the
* server machine, given an ::rdc_channel_t associated with the server, and
* all the arguments that are required to make the RSMI call.
* all the arguments that are required to make the RSMI call.
* @{
*/
@@ -330,12 +325,10 @@ rdc_channel_destroy(rdc_channel_t channel);
* @brief Remote call to rsmi_num_monitor_devices()
*
*/
rdc_status_t
rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu);
rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu);
/** @} */ // end of RSMIAccess
/** @defgroup PhysQuer Physical State Queries
* These functions provide information about the physical characteristics of
* the device.
@@ -345,34 +338,29 @@ rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu);
* @brief Remote call to rsmi_dev_temp_metric_get()
*
*/
rdc_status_t
rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_type, rsmi_temperature_metric_t metric,
int64_t *temperature);
rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type,
rsmi_temperature_metric_t metric, int64_t* temperature);
/**
* @brief Remote call to rsmi_dev_fan_rpms_get()
*
*/
rdc_status_t
rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_ind, int64_t *rpms);
rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* rpms);
/**
* @brief Remote call to rsmi_dev_fan_speed_get()
*
*/
rdc_status_t
rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_ind, int64_t *speed);
rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* speed);
/**
* @brief Remote call to rsmi_dev_fan_speed_max_get()
*
*/
rdc_status_t
rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_ind, uint64_t *max_speed);
rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
uint64_t* max_speed);
/** @} */ // end of PhysQuer
/**
@@ -389,7 +377,6 @@ rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind,
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call
*
*/
rdc_status_t
rdc_status_string(rdc_status_t status, const char **status_string);
rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string);
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
+8 -10
查看文件
@@ -26,8 +26,8 @@ THE SOFTWARE.
#include <grpcpp/grpcpp.h>
#include <string>
#include <memory>
#include <string>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client.h"
@@ -37,8 +37,7 @@ namespace rdc {
class RDCChannel {
public:
explicit RDCChannel(std::string server_ip, std::string server_port,
bool secure_channel);
explicit RDCChannel(std::string server_ip, std::string server_port, bool secure_channel);
~RDCChannel();
rdc_status_t Initialize(void);
@@ -47,13 +46,12 @@ class RDCChannel {
// Don't have setter for server ip and ports; we don't want to change those
// after construction
std::string server_ip(void) const {return server_ip_;}
std::string server_port(void) const {return server_port_;}
bool secure_channel(void) const {return secure_channel_;}
std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const {return rsmi_stub_;}
std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const {
return rdc_admin_stub_;}
std::shared_ptr<grpc::Channel> const channel(void) {return channel_;}
std::string server_ip(void) const { return server_ip_; }
std::string server_port(void) const { return server_port_; }
bool secure_channel(void) const { return secure_channel_; }
std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const { return rsmi_stub_; }
std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const { return rdc_admin_stub_; }
std::shared_ptr<grpc::Channel> const channel(void) { return channel_; }
private:
std::string server_ip_;
+1
查看文件
@@ -22,6 +22,7 @@ THE SOFTWARE.
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {
+2 -3
查看文件
@@ -34,8 +34,8 @@ namespace rdc {
/// @brief Exception type which carries an error code to return to the user.
class rdc_exception : public std::exception {
public:
rdc_exception(rdc_status_t error, const std::string description) :
err_(error), desc_(description) {}
rdc_exception(rdc_status_t error, const std::string description)
: err_(error), desc_(description) {}
rdc_status_t error_code() const noexcept { return err_; }
const char* what() const noexcept override { return desc_.c_str(); }
@@ -48,4 +48,3 @@ class rdc_exception : public std::exception {
} // namespace amd
#endif // CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_
可執行檔 → 一般檔案
+190 -211
查看文件
@@ -20,30 +20,31 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <grpcpp/grpcpp.h>
#include "rdc/rdc_client.h"
#include <grpcpp/grpcpp.h>
#include <time.h>
#include <unistd.h>
#include <iostream>
#include "rdc/rdc_client_main.h"
#include "rdc/rdc_client.h"
#include "rdc/rdc_client_utils.h"
#include "common/rdc_utils.h"
#include "rdc/rdc_exception.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client_main.h"
#include "rdc/rdc_client_utils.h"
#include "rdc/rdc_exception.h"
#include "rocm_smi/rocm_smi.h"
#define CHK_PTR_ARG(PTR) \
if ((PTR) == nullptr) { \
#define CHK_PTR_ARG(PTR) \
if ((PTR) == nullptr) { \
return RDC_RSMI_STATUS_INVALID_ARGS; \
}
#define UINTPTR_TO_RDC_CHAN(UPTR) \
amd::rdc::RDCChannel *ch = reinterpret_cast<amd::rdc::RDCChannel *>(UPTR); \
if (ch == nullptr) { \
return RDC_STATUS_GRPC_INVALID_ARG; \
} \
#define UINTPTR_TO_RDC_CHAN(UPTR) \
amd::rdc::RDCChannel* ch = reinterpret_cast<amd::rdc::RDCChannel*>(UPTR); \
if (ch == nullptr) { \
return RDC_STATUS_GRPC_INVALID_ARG; \
}
static rdc_status_t handleException() {
try {
@@ -71,13 +72,15 @@ static rdc_status_t handleException() {
}
#define TRY try {
#define CATCH } catch (...) {return handleException();}
#define CATCH \
} \
catch (...) { \
return handleException(); \
}
rdc_status_t
rdc_channel_create(rdc_channel_t *channel, const char *ip,
const char *port, bool secure) {
TRY
std::string server_str;
rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port,
bool secure) {
TRY std::string server_str;
std::string port_str;
if (channel == nullptr) {
@@ -95,8 +98,7 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip,
port_str = std::to_string(RDC_DEFAULT_SERVER_PORT);
}
amd::rdc::RDCChannel *ch =
new amd::rdc::RDCChannel(server_str, port_str, secure);
amd::rdc::RDCChannel* ch = new amd::rdc::RDCChannel(server_str, port_str, secure);
if (ch == nullptr) {
return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED;
@@ -115,32 +117,26 @@ rdc_channel_create(rdc_channel_t *channel, const char *ip,
CATCH
}
rdc_status_t
rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
grpc_connectivity_state *state) {
TRY
CHK_PTR_ARG(state)
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
grpc_connectivity_state* state) {
TRY CHK_PTR_ARG(state) UINTPTR_TO_RDC_CHAN(channel)
*state = ch->channel()->GetState(try_to_connect);
* state = ch->channel()->GetState(try_to_connect);
return RDC_STATUS_SUCCESS;
CATCH
}
rdc_status_t
rdc_channel_connection_verify(rdc_channel_t channel) {
TRY
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel) {
TRY UINTPTR_TO_RDC_CHAN(channel)
::rdc::VerifyConnectionResponse resp;
::rdc::VerifyConnectionResponse resp;
::rdc::VerifyConnectionRequest req;
::grpc::ClientContext context;
unsigned int seed = time(NULL);
req.set_magic_num(static_cast<uint64_t>(rand_r(&seed)));
::grpc::Status status =
ch->rdc_admin_stub()->VerifyConnection(&context, req, &resp);
::grpc::Status status = ch->rdc_admin_stub()->VerifyConnection(&context, req, &resp);
if (!status.ok()) {
return amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -155,29 +151,23 @@ rdc_channel_connection_verify(rdc_channel_t channel) {
CATCH
}
rdc_status_t
rdc_channel_destroy(rdc_channel_t channel) {
TRY
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_channel_destroy(rdc_channel_t channel) {
TRY UINTPTR_TO_RDC_CHAN(channel)
delete ch;
delete ch;
return RDC_STATUS_SUCCESS;
CATCH
}
rdc_status_t
rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu) {
TRY
CHK_PTR_ARG(num_gpu)
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu) {
TRY CHK_PTR_ARG(num_gpu) UINTPTR_TO_RDC_CHAN(channel)
::rdc::GetNumDevicesResponse resp;
::rdc::GetNumDevicesResponse resp;
::rdc::GetNumDevicesRequest empty;
::grpc::ClientContext context;
::grpc::Status status =
ch->rsmi_stub()->GetNumDevices(&context, empty, &resp);
::grpc::Status status = ch->rsmi_stub()->GetNumDevices(&context, empty, &resp);
if (!status.ok()) {
return amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -191,21 +181,16 @@ rdc_num_gpus_get(rdc_channel_t channel, uint64_t *num_gpu) {
// rsmi and rdc currently happen to have a 1-to-1 mapping, but
// have this function in case that changes
static ::rdc::GetTemperatureRequest_TemperatureMetric
rsmi_temp2rdc_temp(rsmi_temperature_metric_t rsmi_temp) {
return
static_cast<::rdc::GetTemperatureRequest_TemperatureMetric>(rsmi_temp);
static ::rdc::GetTemperatureRequest_TemperatureMetric rsmi_temp2rdc_temp(
rsmi_temperature_metric_t rsmi_temp) {
return static_cast<::rdc::GetTemperatureRequest_TemperatureMetric>(rsmi_temp);
}
rdc_status_t
rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_type, rsmi_temperature_metric_t metric,
int64_t *temperature) {
TRY
CHK_PTR_ARG(temperature)
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type,
rsmi_temperature_metric_t metric, int64_t* temperature) {
TRY CHK_PTR_ARG(temperature) UINTPTR_TO_RDC_CHAN(channel)
::rdc::GetTemperatureResponse resp;
::rdc::GetTemperatureResponse resp;
::rdc::GetTemperatureRequest in_args;
::grpc::ClientContext context;
@@ -213,8 +198,7 @@ rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind,
in_args.set_dv_ind(dv_ind);
in_args.set_sensor_type(sensor_type);
::grpc::Status status =
ch->rsmi_stub()->GetTemperature(&context, in_args, &resp);
::grpc::Status status = ch->rsmi_stub()->GetTemperature(&context, in_args, &resp);
if (!status.ok()) {
return ::amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -226,22 +210,18 @@ rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind,
CATCH
}
rdc_status_t
rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_ind, int64_t *rpms) {
TRY
CHK_PTR_ARG(rpms)
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* rpms) {
TRY CHK_PTR_ARG(rpms) UINTPTR_TO_RDC_CHAN(channel)
::rdc::GetFanRpmsResponse resp;
::rdc::GetFanRpmsResponse resp;
::rdc::GetFanRpmsRequest in_args;
::grpc::ClientContext context;
in_args.set_dv_ind(dv_ind);
in_args.set_sensor_ind(sensor_ind);
::grpc::Status status =
ch->rsmi_stub()->GetFanRpms(&context, in_args, &resp);
::grpc::Status status = ch->rsmi_stub()->GetFanRpms(&context, in_args, &resp);
if (!status.ok()) {
return ::amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -253,22 +233,18 @@ rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind,
CATCH
}
rdc_status_t
rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_ind, int64_t *speed) {
TRY
CHK_PTR_ARG(speed)
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* speed) {
TRY CHK_PTR_ARG(speed) UINTPTR_TO_RDC_CHAN(channel)
::rdc::GetFanSpeedResponse resp;
::rdc::GetFanSpeedResponse resp;
::rdc::GetFanSpeedRequest in_args;
::grpc::ClientContext context;
in_args.set_dv_ind(dv_ind);
in_args.set_sensor_ind(sensor_ind);
::grpc::Status status =
ch->rsmi_stub()->GetFanSpeed(&context, in_args, &resp);
::grpc::Status status = ch->rsmi_stub()->GetFanSpeed(&context, in_args, &resp);
if (!status.ok()) {
return ::amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -280,22 +256,18 @@ rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind,
CATCH
}
rdc_status_t
rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind,
uint32_t sensor_ind, uint64_t *max_speed) {
TRY
CHK_PTR_ARG(max_speed)
UINTPTR_TO_RDC_CHAN(channel)
rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
uint64_t* max_speed) {
TRY CHK_PTR_ARG(max_speed) UINTPTR_TO_RDC_CHAN(channel)
::rdc::GetFanSpeedMaxResponse resp;
::rdc::GetFanSpeedMaxResponse resp;
::rdc::GetFanSpeedMaxRequest in_args;
::grpc::ClientContext context;
in_args.set_dv_ind(dv_ind);
in_args.set_sensor_ind(sensor_ind);
::grpc::Status status =
ch->rsmi_stub()->GetFanSpeedMax(&context, in_args, &resp);
::grpc::Status status = ch->rsmi_stub()->GetFanSpeedMax(&context, in_args, &resp);
if (!status.ok()) {
return ::amd::rdc::GrpcErrorToRdcError(status.error_code());
@@ -307,89 +279,97 @@ rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind,
CATCH
}
rdc_status_t
rdc_status_string(rdc_status_t status, const char **status_string) {
TRY
if (status_string == nullptr) {
return RDC_RSMI_STATUS_INVALID_ARGS;
}
rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string) {
TRY if (status_string == nullptr) { return RDC_RSMI_STATUS_INVALID_ARGS; }
const size_t status_u = static_cast<size_t>(status);
switch (status_u) {
case RDC_STATUS_SUCCESS:
*status_string = "RDC_STATUS_SUCCESS: The function has been executed"
" successfully.";
*status_string =
"RDC_STATUS_SUCCESS: The function has been executed"
" successfully.";
break;
case RDC_RSMI_STATUS_INVALID_ARGS:
*status_string =
"RDC_RSMI_STATUS_INVALID_ARGS: The provided arguments do not"
" meet the preconditions required for calling this function.";
" meet the preconditions required for calling this function.";
break;
case RDC_RSMI_STATUS_NOT_SUPPORTED:
*status_string = "RDC_RSMI_STATUS_NOT_SUPPORTED: This function is not"
" supported in the current environment.";
*status_string =
"RDC_RSMI_STATUS_NOT_SUPPORTED: This function is not"
" supported in the current environment.";
break;
case RDC_RSMI_STATUS_FILE_ERROR:
*status_string =
"RDC_RSMI_STATUS_FILE_ERROR: There was an error in finding or"
" opening a file or directory. The operation may not be supported by "
"this Linux kernel version.";
"RDC_RSMI_STATUS_FILE_ERROR: There was an error in finding or"
" opening a file or directory. The operation may not be supported by "
"this Linux kernel version.";
break;
case RDC_RSMI_STATUS_PERMISSION:
*status_string = "RDC_RSMI_STATUS_PERMISSION: The user ID of the calling"
" process does not have sufficient permission to execute a command."
" Often this is fixed by running as root (sudo).";
*status_string =
"RDC_RSMI_STATUS_PERMISSION: The user ID of the calling"
" process does not have sufficient permission to execute a command."
" Often this is fixed by running as root (sudo).";
break;
case RDC_RSMI_STATUS_OUT_OF_RESOURCES:
*status_string = "RDC_RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire "
*status_string =
"RDC_RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire "
"memory or other resource";
break;
case RDC_RSMI_STATUS_INTERNAL_EXCEPTION:
*status_string = "RDC_RSMI_STATUS_INTERNAL_EXCEPTION: An internal "
*status_string =
"RDC_RSMI_STATUS_INTERNAL_EXCEPTION: An internal "
"exception was caught";
break;
case RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS:
*status_string = "RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided "
*status_string =
"RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided "
"input is out of allowable or safe range";
break;
case RDC_RSMI_STATUS_INIT_ERROR:
*status_string = "RDC_RSMI_STATUS_INIT_ERROR: An error occurred during "
*status_string =
"RDC_RSMI_STATUS_INIT_ERROR: An error occurred during "
"initialization, during "
"monitor discovery or when when initializing internal data structures";
"monitor discovery or when when initializing internal data structures";
break;
case RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED:
*status_string = "RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: The called "
*status_string =
"RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: The called "
"function has not been implemented in this "
"system for this device type";
"system for this device type";
break;
case RDC_RSMI_STATUS_NOT_FOUND:
*status_string = "RDC_RSMI_STATUS_NOT_FOUND: An item required to "
*status_string =
"RDC_RSMI_STATUS_NOT_FOUND: An item required to "
"complete the call was not found";
break;
case RDC_RSMI_STATUS_INSUFFICIENT_SIZE:
*status_string = "RDC_RSMI_STATUS_INSUFFICIENT_SIZE: Not enough "
*status_string =
"RDC_RSMI_STATUS_INSUFFICIENT_SIZE: Not enough "
"resources were available to fully execute"
" the call";
" the call";
break;
case RDC_RSMI_STATUS_UNKNOWN_ERROR:
*status_string = "An unknown error prevented the call from completing"
" successfully";
*status_string =
"An unknown error prevented the call from completing"
" successfully";
break;
case RDC_RSMI_STATUS_INTERRUPT:
*status_string = "RDC_RSMI_STATUS_INTERRUPT An interrupt occurred while "
*status_string =
"RDC_RSMI_STATUS_INTERRUPT An interrupt occurred while "
"executing the function";
break;
@@ -401,31 +381,31 @@ rdc_status_string(rdc_status_t status, const char **status_string) {
case RDC_STATUS_GRPC_UNKNOWN:
*status_string =
"RDC_STATUS_GRPC_UNKNOWN Unknown error. An example of where this error"
" may be returned is if a"
"Status value received from another address space belongs to an error-"
"space that is not known in this address space. Also errors raised by "
"APIs that do not return enough error information may be converted to "
"this error.";
"RDC_STATUS_GRPC_UNKNOWN Unknown error. An example of where this error"
" may be returned is if a"
"Status value received from another address space belongs to an error-"
"space that is not known in this address space. Also errors raised by "
"APIs that do not return enough error information may be converted to "
"this error.";
break;
case RDC_STATUS_GRPC_INVALID_ARG:
*status_string =
"RDC_STATUS_GRPC_INVALID_ARG Client specified an invalid argument. "
"Note that this differs from"
"FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are "
"problematic regardless of the state of the system (e.g., a malformed "
"file name).";
"RDC_STATUS_GRPC_INVALID_ARG Client specified an invalid argument. "
"Note that this differs from"
"FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are "
"problematic regardless of the state of the system (e.g., a malformed "
"file name).";
break;
case RDC_STATUS_GRPC_DEADLINE_EXCEEDED:
*status_string =
"RDC_STATUS_GRPC_DEADLINE_EXCEEDED Deadline expired before operation "
"could complete. For operations that"
"change the state of the system, this error may be returned even if "
"the operation has completed successfully. For example, a successful "
"response from a server could have been delayed long enough for the "
"deadline to expire.";
"RDC_STATUS_GRPC_DEADLINE_EXCEEDED Deadline expired before operation "
"could complete. For operations that"
"change the state of the system, this error may be returned even if "
"the operation has completed successfully. For example, a successful "
"response from a server could have been delayed long enough for the "
"deadline to expire.";
break;
case RDC_STATUS_GRPC_NOT_FOUND:
@@ -436,130 +416,129 @@ rdc_status_string(rdc_status_t status, const char **status_string) {
case RDC_STATUS_GRPC_ALREADY_EXISTS:
*status_string =
"RDC_STATUS_GRPC_ALREADY_EXISTS Some entity that we attempted to create "
"(e.g., file or directory) already exists.";
"RDC_STATUS_GRPC_ALREADY_EXISTS Some entity that we "
"attempted to create "
"(e.g., file or directory) already exists.";
break;
case RDC_STATUS_GRPC_PERM_DENIED:
*status_string =
"RDC_STATUS_GRPC_PERM_DENIED The caller does not have permission to "
"execute the specified operation."
"PERMISSION_DENIED must not be used for rejections caused by "
"exhausting some resource (use RESOURCE_EXHAUSTED instead for those "
"errors). PERMISSION_DENIED must not be used if the caller can not "
" be identified (use UNAUTHENTICATED instead for those errors).";
"RDC_STATUS_GRPC_PERM_DENIED The caller does not have permission to "
"execute the specified operation."
"PERMISSION_DENIED must not be used for rejections caused by "
"exhausting some resource (use RESOURCE_EXHAUSTED instead for those "
"errors). PERMISSION_DENIED must not be used if the caller can not "
" be identified (use UNAUTHENTICATED instead for those errors).";
break;
case RDC_STATUS_GRPC_UNAUTHENTICATED:
*status_string =
"RDC_STATUS_GRPC_UNAUTHENTICATED The request does not have valid "
"authentication credentials for the operation.";
"RDC_STATUS_GRPC_UNAUTHENTICATED The request does not have valid "
"authentication credentials for the operation.";
break;
case RDC_STATUS_GRPC_RESOURCE_EXHAUSTED:
*status_string =
"RDC_STATUS_GRPC_RESOURCE_EXHAUSTED Some resource has been exhausted, "
"perhaps a per-user quota, or perhaps the "
"entire file system is out of space.";
"RDC_STATUS_GRPC_RESOURCE_EXHAUSTED Some resource has been exhausted, "
"perhaps a per-user quota, or perhaps the "
"entire file system is out of space.";
break;
case RDC_STATUS_GRPC_FAILED_PRECOND:
*status_string =
"RDC_STATUS_GRPC_FAILED_PRECOND Operation was rejected because the "
"system is not in a state required for "
"the operation's execution. For example, directory to be deleted may "
"be non-empty, an rmdir operation is applied to a non-directory, etc.\n"
"A litmus test that may help a service implementor in deciding "
"between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:\n"
" (a) Use UNAVAILABLE if the client can retry just the failing call.\n"
" (b) Use ABORTED if the client should retry at a higher-level "
" (e.g., restarting a read-modify-write sequence).\n"
" (c) Use FAILED_PRECONDITION if the client should not retry until"
" the system state has been explicitly fixed. E.g., if an \"rmdir\""
" fails because the directory is non-empty, FAILED_PRECONDITION"
" should be returned since the client should not retry unless"
" they have first fixed up the directory by deleting files from it.\n"
" (d) Use FAILED_PRECONDITION if the client performs conditional"
" REST Get/Update/Delete on a resource and the resource on the"
" server does not match the condition. E.g., conflicting"
" read-modify-write on the same resource.";
"RDC_STATUS_GRPC_FAILED_PRECOND Operation was rejected because the "
"system is not in a state required for "
"the operation's execution. For example, directory to be deleted may "
"be non-empty, an rmdir operation is applied to a non-directory, etc.\n"
"A litmus test that may help a service implementor in deciding "
"between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:\n"
" (a) Use UNAVAILABLE if the client can retry just the failing call.\n"
" (b) Use ABORTED if the client should retry at a higher-level "
" (e.g., restarting a read-modify-write sequence).\n"
" (c) Use FAILED_PRECONDITION if the client should not retry until"
" the system state has been explicitly fixed. E.g., if an \"rmdir\""
" fails because the directory is non-empty, FAILED_PRECONDITION"
" should be returned since the client should not retry unless"
" they have first fixed up the directory by deleting files from it.\n"
" (d) Use FAILED_PRECONDITION if the client performs conditional"
" REST Get/Update/Delete on a resource and the resource on the"
" server does not match the condition. E.g., conflicting"
" read-modify-write on the same resource.";
break;
case RDC_STATUS_GRPC_ABORTED:
*status_string =
"RDC_STATUS_GRPC_ABORTED The operation was aborted, "
"typically due to a concurrency issue like "
"sequencer check failures, transaction aborts, etc.\n"
"See litmus test above for deciding between "
"FAILED_PRECONDITION, ABORTED, "
"and UNAVAILABLE.";
"RDC_STATUS_GRPC_ABORTED The operation was aborted, "
"typically due to a concurrency issue like "
"sequencer check failures, transaction aborts, etc.\n"
"See litmus test above for deciding between "
"FAILED_PRECONDITION, ABORTED, "
"and UNAVAILABLE.";
break;
case RDC_STATUS_GRPC_OUT_OF_RANGE:
*status_string =
"RDC_STATUS_GRPC_OUT_OF_RANGE Operation was attempted "
"past the valid range. E.g., seeking or reading "
"past end of file.\n"
"Unlike INVALID_ARGUMENT, this error indicates a "
"problem that may be fixed "
"if the system state changes. For example, a 32-bit file system will "
"generate INVALID_ARGUMENT if asked to read "
"at an offset that is not in the "
"range [0,2^32-1], but it will generate "
"OUT_OF_RANGE if asked to read from "
"an offset past the current file size.\n"
"There is a fair bit of overlap between FAILED_PRECONDITION and "
"OUT_OF_RANGE. We recommend using OUT_OF_RANGE "
"(the more specific error) "
"when it applies so that callers who are "
"iterating through a space can "
"easily look for an OUT_OF_RANGE error to detect when they are done.";
"RDC_STATUS_GRPC_OUT_OF_RANGE Operation was attempted "
"past the valid range. E.g., seeking or reading "
"past end of file.\n"
"Unlike INVALID_ARGUMENT, this error indicates a "
"problem that may be fixed "
"if the system state changes. For example, a 32-bit file system will "
"generate INVALID_ARGUMENT if asked to read "
"at an offset that is not in the "
"range [0,2^32-1], but it will generate "
"OUT_OF_RANGE if asked to read from "
"an offset past the current file size.\n"
"There is a fair bit of overlap between FAILED_PRECONDITION and "
"OUT_OF_RANGE. We recommend using OUT_OF_RANGE "
"(the more specific error) "
"when it applies so that callers who are "
"iterating through a space can "
"easily look for an OUT_OF_RANGE error to detect when they are done.";
break;
case RDC_STATUS_GRPC_UNIMPLEMENTED:
*status_string =
"RDC_STATUS_GRPC_UNIMPLEMENTED Operation is not "
"implemented or not supported/enabled in this service.";
"RDC_STATUS_GRPC_UNIMPLEMENTED Operation is not "
"implemented or not supported/enabled in this service.";
break;
case RDC_STATUS_GRPC_INTERNAL:
*status_string =
"RDC_STATUS_GRPC_INTERNAL Internal errors. This means "
"some invariants expected by underlying System has "
"been broken. If you see one of these errors.";
"RDC_STATUS_GRPC_INTERNAL Internal errors. This means "
"some invariants expected by underlying System has "
"been broken. If you see one of these errors.";
break;
case RDC_STATUS_GRPC_UNAVAILABLE:
*status_string =
"RDC_STATUS_GRPC_UNAVAILABLE The service is currently unavailable. "
"This is a most likely a transient "
"condition and may be corrected by retrying with a backoff.\n"
"Warning: Although data MIGHT not have been transmitted when this "
"status occurs, there is NOT A GUARANTEE that the server has not seen "
"anything. So in general it is unsafe to retry on this status code "
"if the call is non-idempotent. "
"See litmus test above for deciding between "
"FAILED_PRECONDITION, ABORTED,"
"and UNAVAILABLE.";
"RDC_STATUS_GRPC_UNAVAILABLE The service is currently unavailable. "
"This is a most likely a transient "
"condition and may be corrected by retrying with a backoff.\n"
"Warning: Although data MIGHT not have been transmitted when this "
"status occurs, there is NOT A GUARANTEE that the server has not seen "
"anything. So in general it is unsafe to retry on this status code "
"if the call is non-idempotent. "
"See litmus test above for deciding between "
"FAILED_PRECONDITION, ABORTED,"
"and UNAVAILABLE.";
break;
case RDC_STATUS_GRPC_DATA_LOSS:
*status_string =
"RDC_STATUS_GRPC_DATA_LOSS Unrecoverable data loss or corruption.";
*status_string = "RDC_STATUS_GRPC_DATA_LOSS Unrecoverable data loss or corruption.";
break;
case RDC_STATUS_UNKNOWN_ERROR:
*status_string =
"RDC_STATUS_UNKNOWN_ERROR An unknown RDC error occurred.";
*status_string = "RDC_STATUS_UNKNOWN_ERROR An unknown RDC error occurred.";
break;
case RDC_STATUS_CLIENT_ERR_SSL:
*status_string =
"An error occurred when executing SSL authentication operations.";
*status_string = "An error occurred when executing SSL authentication operations.";
break;
default:
*status_string = "RDC_RSMI_STATUS_UNKNOWN_ERROR An "
*status_string =
"RDC_RSMI_STATUS_UNKNOWN_ERROR An "
"unknown error occurred";
return RDC_RSMI_STATUS_UNKNOWN_ERROR;
}
可執行檔 → 一般檔案
+18 -28
查看文件
@@ -21,46 +21,39 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc_client_main.h"
#include <assert.h>
#include <grpcpp/grpcpp.h>
#include <string>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client_main.h"
#include "rdc/rdc_client.h"
#include "common/rdc_utils.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {
// Default on-disk locations of the TLS material used by the RDC client.
// Each constant appears twice in this listing: first in its pre-format
// spelling, then in its clang-format spelling; the values are identical.
// The "pinned" set is only compiled when USE_PINNED_CERTS is defined.
// NOTE(review): presumably read by ConstructSSLOptsPin(); its body is not
// visible in this listing -- confirm.
#ifdef USE_PINNED_CERTS
// Pinned certificates
static const char *kDefaultRDCServerCertPinPath =
"/etc/rdc/server/rdc_server.crt";
static const char *kDefaultRDCClientKeyPinPath =
"/etc/rdc/client/private/rdc_client.key";
static const char *kDefaultRDCClientCertPinPath =
"/etc/rdc/client/rdc_client.crt";
static const char* kDefaultRDCServerCertPinPath = "/etc/rdc/server/rdc_server.crt";
static const char* kDefaultRDCClientKeyPinPath = "/etc/rdc/client/private/rdc_client.key";
static const char* kDefaultRDCClientCertPinPath = "/etc/rdc/client/rdc_client.crt";
#endif // USE_PINNED_CERTS
// PKI certificates
// Client private key, client certificate (PEM) and CA certificate (PEM).
// NOTE(review): presumably read by ConstructSSLOptsPKI(); its body is not
// visible in this listing -- confirm.
static const char * kDefaultRDCClientCertKeyPkiPath =
"/etc/rdc/client/private/rdc_client_cert.key";
static const char * kDefaultRDCClientCertPemPkiPath =
"/etc/rdc/client/certs/rdc_client_cert.pem";
static const char * kDefaultRDCClientCACertPemPkiPath =
"/etc/rdc/client/certs/rdc_cacert.pem";
static const char* kDefaultRDCClientCertKeyPkiPath = "/etc/rdc/client/private/rdc_client_cert.key";
static const char* kDefaultRDCClientCertPemPkiPath = "/etc/rdc/client/certs/rdc_client_cert.pem";
static const char* kDefaultRDCClientCACertPemPkiPath = "/etc/rdc/client/certs/rdc_cacert.pem";
// Constructor: records the server address, port and the secure-channel flag in
// the corresponding members. The body is empty, so no gRPC channel is created
// here; channel_ is assigned later in RDCChannel::Initialize().
// (Pre-format and clang-format spellings of the same definition follow.)
RDCChannel::RDCChannel(std::string server_ip, std::string server_port,
bool secure) : server_ip_(server_ip), server_port_(server_port),
secure_channel_(secure) {}
RDCChannel::RDCChannel(std::string server_ip, std::string server_port, bool secure)
: server_ip_(server_ip), server_port_(server_port), secure_channel_(secure) {}
// Destructor: performs no explicit work; members are released by their own
// destructors.
RDCChannel::~RDCChannel() {
}
RDCChannel::~RDCChannel() {}
#ifdef USE_PINNED_CERTS
static int ConstructSSLOptsPin(grpc::SslCredentialsOptions *ssl_opts) {
static int ConstructSSLOptsPin(grpc::SslCredentialsOptions* ssl_opts) {
assert(ssl_opts != nullptr);
if (ssl_opts == nullptr) {
return -EINVAL;
@@ -100,7 +93,7 @@ static int ConstructSSLOptsPin(grpc::SslCredentialsOptions *ssl_opts) {
}
#endif // USE_PINNED_CERTS
static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions *ssl_opts) {
static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions* ssl_opts) {
assert(ssl_opts != nullptr);
if (ssl_opts == nullptr) {
return -EINVAL;
@@ -139,8 +132,7 @@ static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions *ssl_opts) {
return 0;
}
rdc_status_t
RDCChannel::Initialize(void) {
rdc_status_t RDCChannel::Initialize(void) {
assert(!server_port_.empty());
assert(!server_ip_.empty());
@@ -157,16 +149,14 @@ RDCChannel::Initialize(void) {
ret = ConstructSSLOptsPKI(&ssl_opts);
#endif
if (ret) {
std::cerr << "Failed to process OpenSSL keys and certificates." <<
std::endl;
std::cerr << "Failed to process OpenSSL keys and certificates." << std::endl;
return RDC_STATUS_CLIENT_ERR_SSL;
}
channel_creds_ = grpc::SslCredentials(ssl_opts);
channel_ = grpc::CreateChannel(addr_str, channel_creds_);
} else {
channel_ = ::grpc::CreateChannel(addr_str,
grpc::InsecureChannelCredentials());
channel_ = ::grpc::CreateChannel(addr_str, grpc::InsecureChannelCredentials());
}
rsmi_stub_ = ::rdc::Rsmi::NewStub(channel_);
可執行檔 → 一般檔案
+4 -4
查看文件
@@ -20,17 +20,17 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc_client.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client_utils.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {
rdc_status_t GrpcErrorToRdcError(grpc::StatusCode grpc_err) {
uint32_t grpc_err_int = static_cast<uint32_t>(grpc_err);
uint32_t rdc_grpc_base_int =
static_cast<uint32_t>(RDC_STATUS_GRPC_ERR_FIRST);
uint32_t rdc_grpc_base_int = static_cast<uint32_t>(RDC_STATUS_GRPC_ERR_FIRST);
uint32_t rdc_err_int = grpc_err_int + rdc_grpc_base_int;
return static_cast<rdc_status_t>(rdc_err_int);
可執行檔 → 一般檔案
+11 -12
查看文件
@@ -20,16 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <sys/capability.h>
#include <errno.h>
#include <assert.h>
#include "common/rdc_capabilities.h"
#include <assert.h>
#include <errno.h>
#include <sys/capability.h>
namespace amd {
namespace rdc {
int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled) {
int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool* enabled) {
cap_t caps;
assert(enabled != nullptr);
@@ -41,7 +41,7 @@ int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled) {
// Get process's current capabilities
caps = cap_get_proc();
if (caps == nullptr) {
return errno;
return errno;
}
cap_flag_value_t val;
@@ -52,7 +52,7 @@ int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled) {
}
if (cap_free(caps) == -1) {
return errno;
return errno;
}
*enabled = (val == CAP_SET ? true : false);
@@ -68,16 +68,15 @@ int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable) {
// Get process's current capabilities
caps = cap_get_proc();
if (caps == nullptr) {
return errno;
return errno;
}
// the 1 in the call below is the size of the cap_list array
cap_list[0] = cap;
if (cap_set_flag(caps, cap_type, 1, cap_list, enable ? CAP_SET : CAP_CLEAR)
== -1) {
if (cap_set_flag(caps, cap_type, 1, cap_list, enable ? CAP_SET : CAP_CLEAR) == -1) {
int ret = errno;
cap_free(caps);
return ret;
return ret;
}
if (cap_set_proc(caps) == -1) {
@@ -87,7 +86,7 @@ int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable) {
}
if (cap_free(caps) == -1) {
return errno;
return errno;
}
return 0;
}
可執行檔 → 一般檔案
+12 -17
查看文件
@@ -28,29 +28,24 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
// Queries the calling process's capability set (via cap_get_proc) and stores in
// *enabled whether `cap` is CAP_SET for the flag `cap_type`. `enabled` must not
// be null (asserted in the implementation). Returns errno when a libcap call
// fails.
int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool *enabled);
int GetCapability(cap_value_t cap, cap_flag_t cap_type, bool* enabled);
// Sets (enable == true) or clears (enable == false) `cap` for the flag
// `cap_type` on the calling process using cap_set_flag + cap_set_proc.
// Returns 0 on success, otherwise the errno of the failing libcap call.
int ModifyCapability(cap_value_t cap, cap_flag_t cap_type, bool enable);
// Scope guard for a single process capability: the constructor enables the
// capability through ModifyCapability(..., true) and the destructor disables it
// through ModifyCapability(..., false).
//   Relinquish() - disables the capability early. The destructor still calls
//                  ModifyCapability(..., false) again afterwards; nothing
//                  records that the capability was already dropped.
//   error()      - returns the result of the most recent ModifyCapability call
//                  (0 on success, an errno value otherwise).
// The struct body is listed twice below: pre-format first, clang-format second.
struct ScopedCapability {
ScopedCapability(cap_value_t cp, cap_flag_t cpt) :
cap_(cp), cap_type_(cpt), error_(0) {
error_ = ModifyCapability(cap_, cap_type_, true);
}
~ScopedCapability() {
error_ = ModifyCapability(cap_, cap_type_, false);
}
void Relinquish(void) {
error_ = ModifyCapability(cap_, cap_type_, false);
}
int error(void) {return error_;}
private:
cap_value_t cap_;
cap_flag_t cap_type_;
int error_;
ScopedCapability(cap_value_t cp, cap_flag_t cpt) : cap_(cp), cap_type_(cpt), error_(0) {
error_ = ModifyCapability(cap_, cap_type_, true);
}
~ScopedCapability() { error_ = ModifyCapability(cap_, cap_type_, false); }
void Relinquish(void) { error_ = ModifyCapability(cap_, cap_type_, false); }
int error(void) { return error_; }
private:
cap_value_t cap_;    // capability being toggled
cap_flag_t cap_type_;  // capability flag set passed through to ModifyCapability
int error_;          // result of the last ModifyCapability call
};
} // namespace rdc
} // namespace amd
#endif // COMMON_RDC_CAPABILITIES_H_
+16 -22
查看文件
@@ -19,54 +19,48 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "common/rdc_fields_supported.h"
#include <assert.h>
#include <algorithm>
#include "common/rdc_fields_supported.h"
#include "rdc/rdc.h"
namespace amd {
namespace rdc {
// Builds two lookup tables from the single data file common/rdc_field.data by
// including it twice under different definitions of FLD_DESC_ENT.
// NOTE(review): this assumes rdc_field.data consists solely of
// FLD_DESC_ENT(...) invocations; the file is not visible here -- confirm.
//
// First expansion: numeric field id -> {enum name (stringified ID), DESC,
// LABEL, DISPLAY}.
#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) \
{static_cast<uint32_t>(ID), {#ID, (DESC), (LABEL), (DISPLAY)}},
#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) \
{static_cast<uint32_t>(ID), {#ID, (DESC), (LABEL), (DISPLAY)}},
static const fld_id2name_map_t field_id_to_descript = {
#include "common/rdc_field.data"
#include "common/rdc_field.data"
};
#undef FLD_DESC_ENT
// Second expansion: enum name (stringified ID) -> field id.
#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) {#ID, (ID)},
static fld_name2id_map_t field_name_to_id = {
#include "common/rdc_field.data" // NOLINT
#include "common/rdc_field.data" // NOLINT
};
#undef FLD_DESC_ENT
// Two accessors over the file-local field tables. The listing interleaves the
// clang-format spelling (first) with the pre-format spelling (second) of both
// definitions.
//
// get_field_id_description_from_id(): returns a reference to the id ->
// description table (field_id_to_descript).
//
// get_field_id_from_name(name, value): looks `name` up in field_name_to_id.
//   value - must not be null (asserted); receives the matching rdc_field_t.
//   Returns true and writes *value when the name is known; returns false and
//   leaves *value untouched otherwise.
amd::rdc::fld_id2name_map_t& get_field_id_description_from_id(void) { return field_id_to_descript; }
bool get_field_id_from_name(const std::string name, rdc_field_t* value) {
assert(value != nullptr);
auto id = field_name_to_id.find(name);
if (id == field_name_to_id.end()) {
return false;
}
amd::rdc::fld_id2name_map_t &
get_field_id_description_from_id(void) {
return field_id_to_descript;
}
bool get_field_id_from_name(const std::string name, rdc_field_t *value) {
assert(value != nullptr);
auto id = field_name_to_id.find(name);
if (id == field_name_to_id.end()) {
return false;
}
*value = static_cast<rdc_field_t>(id->second);
return true;
*value = static_cast<rdc_field_t>(id->second);
return true;
}
// Returns true when field_id is a known field: it must differ from
// RDC_FI_INVALID and be present as a key in field_id_to_descript.
// The return statement is listed twice (pre-format, then clang-format).
bool is_field_valid(rdc_field_t field_id) {
if (field_id == RDC_FI_INVALID) {
return false;
}
return field_id_to_descript.find(static_cast<uint32_t>(field_id)) !=
field_id_to_descript.end();
return field_id_to_descript.find(static_cast<uint32_t>(field_id)) != field_id_to_descript.end();
}
} // namespace rdc
} // namespace amd
+8 -9
查看文件
@@ -22,8 +22,8 @@ THE SOFTWARE.
#ifndef COMMON_RDC_FIELDS_SUPPORTED_H_
#define COMMON_RDC_FIELDS_SUPPORTED_H_
#include <string>
#include <map>
#include <string>
#include <unordered_map>
#include "rdc/rdc.h"
@@ -32,18 +32,17 @@ namespace amd {
namespace rdc {
// Descriptive record for one RDC field. The members are aggregate-initialised
// in this order from FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) as
// {#ID, DESC, LABEL, DISPLAY}:
//   enum_name   - the field's enumerator spelled as a string (#ID)
//   description - the DESC argument
//   label       - the LABEL argument
//   do_display  - the DISPLAY argument
// The member list appears twice (pre-format, then clang-format).
typedef struct {
std::string enum_name;
std::string description;
std::string label;
bool do_display;
std::string enum_name;
std::string description;
std::string label;
bool do_display;
} field_id_descript;
// Table type mapping a numeric field id to its descriptive record. Both the map
// and its mapped values are const.
typedef const std::map<uint32_t, const field_id_descript>
fld_id2name_map_t;
typedef const std::map<uint32_t, const field_id_descript> fld_id2name_map_t;
// Table type mapping a field's enumerator name to its numeric id.
typedef std::unordered_map<std::string, uint32_t> fld_name2id_map_t;
// get_field_id_from_name: writes the id for `name` to *value and returns true,
//   or returns false when the name is unknown.
// get_field_id_description_from_id: returns the id -> description table.
// is_field_valid: true when field_id is not RDC_FI_INVALID and is present in
//   the id -> description table.
bool get_field_id_from_name(const std::string name, rdc_field_t *value);
fld_id2name_map_t & get_field_id_description_from_id(void); // NOLINT
bool get_field_id_from_name(const std::string name, rdc_field_t* value);
fld_id2name_map_t& get_field_id_description_from_id(void); // NOLINT
bool is_field_valid(rdc_field_t field_id);
} // namespace rdc
可執行檔 → 一般檔案
+14 -15
查看文件
@@ -20,28 +20,28 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "common/rdc_utils.h"
#include <arpa/inet.h>
#include <assert.h>
#include <netinet/in.h>
#include <sys/stat.h>
#include <assert.h>
#include <arpa/inet.h>
#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <algorithm>
#include "common/rdc_utils.h"
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
namespace amd {
namespace rdc {
// Returns true when stat() succeeds on `filename`, false otherwise. Any stat()
// failure is reported as "does not exist"; errno is not inspected.
// (Signature listed in both its pre-format and clang-format spelling.)
bool FileExists(char const *filename) {
bool FileExists(char const* filename) {
struct stat buf;
return (stat(filename, &buf) == 0);
}
int ReadFile(std::string path, std::string *retStr, bool chop_newline) {
int ReadFile(std::string path, std::string* retStr, bool chop_newline) {
std::stringstream ss;
int ret = 0;
@@ -61,13 +61,12 @@ int ReadFile(std::string path, std::string *retStr, bool chop_newline) {
*retStr = ss.str();
if (chop_newline) {
retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'),
retStr->end());
retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'), retStr->end());
}
return ret;
}
int ReadFile(const char *path, std::string *retStr, bool chop_newline) {
int ReadFile(const char* path, std::string* retStr, bool chop_newline) {
assert(path != nullptr);
assert(retStr != nullptr);
@@ -76,11 +75,11 @@ int ReadFile(const char *path, std::string *retStr, bool chop_newline) {
return amd::rdc::ReadFile(file_path, retStr, chop_newline);
}
// Returns true when `s` is non-empty and every character satisfies ::isdigit.
// Signs, decimal points and whitespace therefore make the result false.
// NOTE(review): ::isdigit takes an int; the C standard leaves the behaviour
// undefined for negative values other than EOF, which plain `char` can produce
// for non-ASCII bytes -- consider casting to unsigned char.
// (Signature listed in both its pre-format and clang-format spelling.)
bool IsNumber(const std::string &s) {
bool IsNumber(const std::string& s) {
return !s.empty() && std::all_of(s.begin(), s.end(), ::isdigit);
}
bool IsIP(const std::string &s) {
bool IsIP(const std::string& s) {
struct sockaddr_in sa;
int result = inet_pton(AF_INET, s.c_str(), &sa);
// inet_pton returns 1 on success
可執行檔 → 一般檔案
+10 -14
查看文件
@@ -30,29 +30,25 @@ namespace amd {
namespace rdc {
// debug_print(fmt, ...): printf-style diagnostic macro.
//   NDEBUG defined     - expands to an empty do/while(false); the arguments are
//                        not evaluated.
//   NDEBUG not defined - forwards to fprintf(stderr, fmt, ...).
// `##__VA_ARGS__` lets the macro be called with a format string only.
// Each #define is listed twice (pre-format, then clang-format).
#ifdef NDEBUG
#define debug_print(fmt, ...) \
do { \
#define debug_print(fmt, ...) \
do { \
} while (false)
#else
#define debug_print(fmt, ...) \
do { \
fprintf(stderr, fmt, ##__VA_ARGS__); \
#define debug_print(fmt, ...) \
do { \
fprintf(stderr, fmt, ##__VA_ARGS__); \
} while (false)
#endif
// Utility prototypes (each listed in pre-format and clang-format spelling).
//   FileExists - true when stat() succeeds on `filename`.
//   ReadFile   - stores the file contents in *retStr; when chop_newline is true
//                every '\n' in the contents is removed, not only a trailing
//                one. The `const char*` overload asserts that both pointers are
//                non-null and forwards to the std::string overload.
//                NOTE(review): the return value looks like 0 on success and an
//                error code otherwise; the relevant lines are not visible
//                here -- confirm.
//   IsNumber   - true when `s` is non-empty and consists only of digits.
//   IsIP       - parses `s` with inet_pton(AF_INET, ...).
//                NOTE(review): presumably true for a valid dotted-quad IPv4
//                address; the return statement is not visible -- confirm.
bool
FileExists(char const *filename);
bool FileExists(char const* filename);
int
ReadFile(std::string path, std::string *retStr, bool chop_newline = false);
int
ReadFile(const char *path, std::string *retStr, bool chop_newline = false);
int ReadFile(std::string path, std::string* retStr, bool chop_newline = false);
int ReadFile(const char* path, std::string* retStr, bool chop_newline = false);
bool IsNumber(const std::string &s);
bool IsIP(const std::string &s);
bool IsNumber(const std::string& s);
bool IsIP(const std::string& s);
} // namespace rdc
} // namespace amd
#endif // COMMON_RDC_UTILS_H_
+108 -123
查看文件
@@ -20,154 +20,139 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <unistd.h>
#include <string.h>
#include <iostream>
#include <unistd.h>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
#include "rdc/rdc.h"
// Maps a diagnostic test-case enumerator to the human-readable label printed by
// this example. RDC_DIAG_TEST_LAST maps to "Unknown"; any enumerator missing
// from the table yields "Unknown Test".
// The table is a non-static local, so it is rebuilt on every call.
// The body is listed twice (pre-format, then clang-format).
static std::string get_test_name(rdc_diag_test_cases_t test_case) {
const std::map<rdc_diag_test_cases_t, std::string> test_desc = {
{RDC_DIAG_COMPUTE_PROCESS, "No compute process"},
{RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
{RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
{RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
{RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"},
{RDC_DIAG_TEST_LAST, "Unknown"}
};
const std::map<rdc_diag_test_cases_t, std::string> test_desc = {
{RDC_DIAG_COMPUTE_PROCESS, "No compute process"},
{RDC_DIAG_COMPUTE_QUEUE, "Compute Queue ready"},
{RDC_DIAG_SYS_MEM_CHECK, "System memory check"},
{RDC_DIAG_NODE_TOPOLOGY, "Node topology check"},
{RDC_DIAG_GPU_PARAMETERS, "GPU parameters check"},
{RDC_DIAG_TEST_LAST, "Unknown"}};
auto test_name = test_desc.find(test_case);
if (test_name == test_desc.end()) {
return "Unknown Test";
}
return test_name->second;
auto test_name = test_desc.find(test_case);
if (test_name == test_desc.end()) {
return "Unknown Test";
}
return test_name->second;
}
// Example driver for the RDC diagnostic API.
// Flow: prompt for embedded (0) or standalone (1) mode, call rdc_init(0),
// obtain a handle through rdc_connect() or rdc_start_embedded(), create a
// default GPU group, run the RDC_DIAG_LVL_SHORT diagnostic and print the
// per-test results and details, run the single RDC_DIAG_COMPUTE_PROCESS test
// case, then fall through into cleanup.
// Every failure path jumps to the `cleanup` label; the last rdc_status_t is
// returned as the process exit status.
// NOTE(review): rdc_handle is declared without an initializer, and a failing
// rdc_init() jumps to cleanup before it is assigned, so rdc_disconnect() /
// rdc_stop_embedded() then receive an indeterminate handle -- confirm and
// consider initializing it.
// This listing interleaves the pre-format and the clang-format spelling of the
// body.
int main(int, char **) {
rdc_status_t result;
rdc_handle_t rdc_handle;
bool standalone = false;
char hostIpAddress[] = {"127.0.0.1:50051"};
char group_name[] = {"diag_group"};
int main(int, char**) {
rdc_status_t result;
rdc_handle_t rdc_handle;
bool standalone = false;
char hostIpAddress[] = {"127.0.0.1:50051"};
char group_name[] = {"diag_group"};
// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone?
"Standalone mode selected.\n":"Embedded mode selected.\n");
// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n");
// Init the rdc
result = rdc_init(0);
// Init the rdc
result = rdc_init(0);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " <<
rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle,
nullptr, nullptr, nullptr);
if ( result != RDC_ST_OK ) {
std::cout << "Error connecting to remote rdcd. Return: "
<< rdc_status_string(result) << std::endl;
goto cleanup;
}
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error starting embedded RDC engine. Return: "
<< rdc_status_string(result) << std::endl;
goto cleanup;
}
}
// Now we can use the same API for both standalone and embedded
// (1) create group for all GPUs
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_DEFAULT,
group_name, &group_id);
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: "
<< rdc_status_string(result);
goto cleanup;
std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
}
// (2) start to run short diagnostic.
rdc_diag_response_t response;
result = rdc_diagnostic_run(rdc_handle, group_id,
RDC_DIAG_LVL_SHORT, &response);
// Now we can use the same API for both standalone and embedded
// (1) create group for all GPUs
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_DEFAULT, group_name, &group_id);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: " << rdc_status_string(result);
goto cleanup;
}
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. Return: "
<< rdc_status_string(result);
goto cleanup;
}
// (2) start to run short diagnostic.
rdc_diag_response_t response;
result = rdc_diagnostic_run(rdc_handle, group_id, RDC_DIAG_LVL_SHORT, &response);
// (3) Check diagnostic results
for (uint32_t i=0 ; i < response.results_count; i++) {
const rdc_diag_test_result_t& test_result =
response.diag_info[i];
std::cout << std::setw(22) << std::left
<< get_test_name(test_result.test_case) + ":"
<< rdc_diagnostic_result_string(test_result.status) << "\n";
}
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_LVL_SHORT diagnostic. Return: " << rdc_status_string(result);
goto cleanup;
}
// (4) diagnostic detail information
std::cout <<" =============== Diagnostic Details ==================\n";
for (uint32_t i=0 ; i < response.results_count; i++) {
const rdc_diag_test_result_t& test_result =
response.diag_info[i];
if (test_result.info[0] != '\0') {
std::cout << std::setw(22) << std::left
<< get_test_name(test_result.test_case) + ":"
// (3) Check diagnostic results
for (uint32_t i = 0; i < response.results_count; i++) {
const rdc_diag_test_result_t& test_result = response.diag_info[i];
std::cout << std::setw(22) << std::left << get_test_name(test_result.test_case) + ":"
<< rdc_diagnostic_result_string(test_result.status) << "\n";
}
// (4) diagnostic detail information
std::cout << " =============== Diagnostic Details ==================\n";
for (uint32_t i = 0; i < response.results_count; i++) {
const rdc_diag_test_result_t& test_result = response.diag_info[i];
if (test_result.info[0] != '\0') {
std::cout << std::setw(22) << std::left << get_test_name(test_result.test_case) + ":"
<< test_result.info << "\n";
}
for (uint32_t j=0; j < test_result.per_gpu_result_count; j++) {
const rdc_diag_per_gpu_result_t& gpu_result
= test_result.gpu_results[j];
if (strlen(gpu_result.gpu_result.msg) > 0) {
std::cout << " GPU " << gpu_result.gpu_index
<< " " << gpu_result.gpu_result.msg << "\n";
}
}
}
// (5) run one test case
std::cout <<" ============== Run individual diagnostic test ===========\n";
rdc_diag_test_result_t test_result;
result = rdc_test_case_run(rdc_handle, group_id,
RDC_DIAG_COMPUTE_PROCESS, &test_result);
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: "
<< rdc_status_string(result);
goto cleanup;
for (uint32_t j = 0; j < test_result.per_gpu_result_count; j++) {
const rdc_diag_per_gpu_result_t& gpu_result = test_result.gpu_results[j];
if (strlen(gpu_result.gpu_result.msg) > 0) {
std::cout << " GPU " << gpu_result.gpu_index << " " << gpu_result.gpu_result.msg << "\n";
}
}
}
std::cout << std::setw(22) << std::left
<< get_test_name(RDC_DIAG_COMPUTE_PROCESS) + ":"
<< test_result.info << "\n";
// (5) run one test case
std::cout << " ============== Run individual diagnostic test ===========\n";
rdc_diag_test_result_t test_result;
result = rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, &test_result);
if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: "
<< rdc_status_string(result);
goto cleanup;
}
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
std::cout << std::setw(22) << std::left << get_test_name(RDC_DIAG_COMPUTE_PROCESS) + ":"
<< test_result.info << "\n";
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
}
+215 -237
查看文件
@@ -21,257 +21,235 @@ THE SOFTWARE.
*/
#include <unistd.h>
#include <iostream>
#include <iomanip>
#include <iostream>
#include "rdc/rdc.h"
int main(int, char **) {
rdc_status_t result;
rdc_handle_t rdc_handle;
bool standalone = false;
char hostIpAddress[] = {"127.0.0.1:50051"};
char group_name[] = {"group1"};
char field_group_name[] = {"fieldgroup1"};
uint64_t since_timestamp = 0;
uint64_t next_timestamp = 0;
uint64_t start_timestamp = 0;
uint32_t count = 0;
int main(int, char**) {
rdc_status_t result;
rdc_handle_t rdc_handle;
bool standalone = false;
char hostIpAddress[] = {"127.0.0.1:50051"};
char group_name[] = {"group1"};
char field_group_name[] = {"fieldgroup1"};
uint64_t since_timestamp = 0;
uint64_t next_timestamp = 0;
uint64_t start_timestamp = 0;
uint32_t count = 0;
// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n");
// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone?
"Standalone mode selected.\n":"Embedded mode selected.\n");
// Init the rdc
result = rdc_init(0);
// Init the rdc
result = rdc_init(0);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " <<
rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle,
nullptr, nullptr, nullptr);
if ( result != RDC_ST_OK ) {
std::cout << "Error connecting to remote rdcd. Return: "
<< rdc_status_string(result) << std::endl;
goto cleanup;
// Now we can use the same API for both standalone and embedded
// Get the list of devices in the system
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
result = rdc_device_get_all(rdc_handle, gpu_index_list, &count);
if (result != RDC_ST_OK) {
std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result);
goto cleanup;
}
if (count == 0) {
std::cout << "No GPUs find on the sytem ";
goto cleanup;
} else {
std::cout << count << " GPUs found in the system.\n";
}
// Create the group
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Created the GPU group " << group_id << std::endl;
// Add all GPUs to the group
for (uint32_t i = 0; i < count; i++) {
result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); // Add GPU 0
if (result != RDC_ST_OK) {
std::cout << "Error adding group. Return: " << rdc_status_string(result);
goto cleanup;
}
rdc_device_attributes_t attribute;
result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute);
if (result != RDC_ST_OK) {
std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group "
<< group_id << std::endl;
}
// Create the field groups to monitor POWER and TEMP
rdc_field_grp_t field_group_id;
rdc_field_t field_ids[2];
field_ids[0] = RDC_FI_GPU_MEMORY_USAGE;
field_ids[1] = RDC_FI_POWER_USAGE;
result = rdc_group_field_create(rdc_handle, 2, &field_ids[0], field_group_name, &field_group_id);
if (result != RDC_ST_OK) {
std::cout << "Error create field group, Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Created the field group " << field_group_id << ": "
<< field_id_string(RDC_FI_GPU_MEMORY_USAGE) << ", "
<< field_id_string(RDC_FI_POWER_USAGE) << std::endl;
// Let the RDC to watch the fields and groups. The fields will be updated
// once per second, the max keep age is 1 minutes and only keep 10 samples.
result = rdc_field_watch(rdc_handle, group_id, field_group_id, 1000000, 60, 10);
if (result != RDC_ST_OK) {
std::cout << "Error watch group fields. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Start to watch group:" << group_id << ", field_group:" << field_group_id
<< std::endl;
std::cout << "Sleep a few seconds before retreive the data ...\n";
// Since we are running the RDC_OPERATION_MODE_AUTO mode, the rdc_update_
// all_fields() will be called periodically at background. If running as
// RDC_OPERATION_MODE_MANUAL mode, we must call rdc_field_update_all()
// periodically to take samples.
usleep(5000000); // sleep 5 seconds before fetch the stats
// Retreive the field and group information from RDC
rdc_group_info_t group_info;
rdc_field_group_info_t field_info;
result = rdc_group_gpu_get_info(rdc_handle, group_id, &group_info);
if (result != RDC_ST_OK) {
std::cout << "Error get gpu group info. Return: " << rdc_status_string(result);
goto cleanup;
}
result = rdc_group_field_get_info(rdc_handle, field_group_id, &field_info);
if (result != RDC_ST_OK) {
std::cout << "Error get field group info. Return: " << rdc_status_string(result);
goto cleanup;
}
// Get the latest metrics
std::cout << "Get the latest metrics for group:" << group_id << " field_group:" << field_group_id
<< std::endl;
std::cout << "time_stamp\t"
<< "GPU_index\t"
<< "field_name\t\t"
<< "field_value\n";
for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
for (uint32_t findex = 0; findex < field_info.count; findex++) {
rdc_field_value value;
result = rdc_field_get_latest_value(rdc_handle, group_info.entity_ids[gindex],
field_info.field_ids[findex], &value);
if (result == RDC_ST_NOT_FOUND) {
continue;
}
if (result != RDC_ST_OK) {
std::cout << "Error get least value. Return: " << rdc_status_string(result);
goto cleanup;
}
// We only support the integer metrics so far
std::cout << value.ts << "\t" << group_info.entity_ids[gindex] << "\t\t" << std::left
<< std::setw(16) << field_id_string(value.field_id) << "\t" << value.value.l_int
<< std::endl;
}
}
// Stop watching the field group
result = rdc_field_unwatch(rdc_handle, group_id, field_group_id);
if (result != RDC_ST_OK) {
std::cout << "Error stop watch fields. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Stop watch group:" << group_id << ", field_group:" << field_group_id << std::endl;
// Get the history data last 10 seconds
std::cout << "Get last 10 seconds metrics for group:" << group_id
<< " field_group:" << field_group_id << std::endl;
std::cout << "time_stamp\t"
<< "GPU_index\t"
<< "field_name\t\t"
<< "field_value\n";
start_timestamp = static_cast<uint64_t>(time(nullptr) - 10) * 1000;
for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
for (uint32_t findex = 0; findex < field_info.count; findex++) {
since_timestamp = start_timestamp;
while (true) {
rdc_field_value value;
result = rdc_field_get_value_since(rdc_handle, group_info.entity_ids[gindex],
field_info.field_ids[findex], since_timestamp,
&next_timestamp, &value);
if (result == RDC_ST_NOT_FOUND) {
break;
}
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error starting embedded RDC engine. Return: "
<< rdc_status_string(result) << std::endl;
goto cleanup;
std::cout << "Error get history data. Return: " << rdc_status_string(result);
goto cleanup;
}
}
std::cout << value.ts << "\t" << group_info.entity_ids[gindex] << "\t\t" << std::left
<< std::setw(16) << field_id_string(value.field_id) << "\t" << value.value.l_int
<< std::endl;
since_timestamp = next_timestamp;
} // while
} // for findex
} // for gindex
// Now we can use the same API for both standalone and embedded
// Get the list of devices in the system
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
result = rdc_device_get_all(rdc_handle, gpu_index_list, &count);
if (result != RDC_ST_OK) {
std::cout << "Error to find devices on the system. Return: "
<< rdc_status_string(result);
goto cleanup;
}
if (count == 0) {
std::cout << "No GPUs find on the sytem ";
goto cleanup;
} else {
std::cout << count << " GPUs found in the system.\n";
}
// Delete the field group and GPU group
result = rdc_group_field_destroy(rdc_handle, field_group_id);
if (result != RDC_ST_OK) {
std::cout << "Error delete field group. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Deleted the field group " << field_group_id << std::endl;
// Create the group
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY,
group_name, &group_id);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << "Created the GPU group " << group_id << std::endl;
result = rdc_group_gpu_destroy(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error delete GPU group. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Deleted the GPU group " << group_id << std::endl;
// Add all GPUs to the group
for (uint32_t i = 0; i < count; i++) {
result = rdc_group_gpu_add(rdc_handle,
group_id, gpu_index_list[i]); // Add GPU 0
if (result != RDC_ST_OK) {
std::cout << "Error adding group. Return: "
<< rdc_status_string(result);
goto cleanup;
}
rdc_device_attributes_t attribute;
result = rdc_device_get_attributes(rdc_handle,
gpu_index_list[i], &attribute);
if (result != RDC_ST_OK) {
std::cout << "Error get GPU attribute. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << "Add GPU " <<gpu_index_list[i] << ":"
<< attribute.device_name <<" to group "<< group_id << std::endl;
}
// Create the field group to monitor GPU memory usage and power usage
rdc_field_grp_t field_group_id;
rdc_field_t field_ids[2];
field_ids[0] = RDC_FI_GPU_MEMORY_USAGE;
field_ids[1] = RDC_FI_POWER_USAGE;
result = rdc_group_field_create(rdc_handle, 2,
&field_ids[0], field_group_name, &field_group_id);
if (result != RDC_ST_OK) {
std::cout << "Error create field group, Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout<< "Created the field group " << field_group_id
<< ": " << field_id_string(RDC_FI_GPU_MEMORY_USAGE) << ", "
<< field_id_string(RDC_FI_POWER_USAGE) <<std::endl;
// Let RDC watch the fields and groups. The fields will be updated once per
// second, the max keep age is 1 minute, and at most 10 samples are kept.
result = rdc_field_watch(rdc_handle, group_id,
field_group_id, 1000000, 60, 10);
if (result != RDC_ST_OK) {
std::cout << "Error watch group fields. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << "Start to watch group:"
<< group_id << ", field_group:" << field_group_id << std::endl;
std::cout << "Sleep a few seconds before retreive the data ...\n";
// Since we are running in RDC_OPERATION_MODE_AUTO mode,
// rdc_update_all_fields() will be called periodically in the background. If
// running in RDC_OPERATION_MODE_MANUAL mode, we must call
// rdc_field_update_all() periodically to take samples.
usleep(5000000); // sleep 5 seconds before fetch the stats
// Retrieve the field and group information from RDC
rdc_group_info_t group_info;
rdc_field_group_info_t field_info;
result = rdc_group_gpu_get_info(rdc_handle, group_id, &group_info);
if (result != RDC_ST_OK) {
std::cout << "Error get gpu group info. Return: "
<< rdc_status_string(result);
goto cleanup;
}
result = rdc_group_field_get_info(rdc_handle, field_group_id, &field_info);
if (result != RDC_ST_OK) {
std::cout << "Error get field group info. Return: "
<< rdc_status_string(result);
goto cleanup;
}
// Get the latest metrics
std::cout << "Get the latest metrics for group:" << group_id
<< " field_group:" << field_group_id << std::endl;
std::cout << "time_stamp\t" << "GPU_index\t"
<< "field_name\t\t" << "field_value\n";
for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
for (uint32_t findex = 0; findex < field_info.count; findex++) {
rdc_field_value value;
result = rdc_field_get_latest_value(rdc_handle,
group_info.entity_ids[gindex], field_info.field_ids[findex], &value);
if (result == RDC_ST_NOT_FOUND) {
continue;
}
if (result != RDC_ST_OK) {
std::cout << "Error get least value. Return: "
<< rdc_status_string(result);
goto cleanup;
}
// We only support the integer metrics so far
std::cout << value.ts <<"\t" << group_info.entity_ids[gindex]
<< "\t\t" << std::left << std::setw(16)
<< field_id_string(value.field_id) << "\t"
<< value.value.l_int << std::endl;
}
}
// Stop watching the field group
result = rdc_field_unwatch(rdc_handle, group_id, field_group_id);
if (result != RDC_ST_OK) {
std::cout << "Error stop watch fields. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << "Stop watch group:" << group_id << ", field_group:"
<< field_group_id << std::endl;
// Get the history data for the last 10 seconds
std::cout << "Get last 10 seconds metrics for group:" << group_id
<< " field_group:" << field_group_id << std::endl;
std::cout << "time_stamp\t" << "GPU_index\t"
<< "field_name\t\t" << "field_value\n";
start_timestamp = static_cast<uint64_t>(time(nullptr)-10)*1000;
for (uint32_t gindex = 0; gindex < group_info.count; gindex++) {
for (uint32_t findex = 0; findex < field_info.count; findex++) {
since_timestamp = start_timestamp;
while (true) {
rdc_field_value value;
result = rdc_field_get_value_since(rdc_handle,
group_info.entity_ids[gindex] , field_info.field_ids[findex],
since_timestamp, &next_timestamp, &value);
if (result == RDC_ST_NOT_FOUND) {
break;
}
if (result != RDC_ST_OK) {
std::cout << "Error get history data. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << value.ts <<"\t" << group_info.entity_ids[gindex]
<< "\t\t" << std::left << std::setw(16)
<< field_id_string(value.field_id) << "\t"
<< value.value.l_int << std::endl;
since_timestamp = next_timestamp;
} // while
} // for findex
} // for gindex
// Delete the field group and GPU group
result = rdc_group_field_destroy(rdc_handle, field_group_id);
if (result != RDC_ST_OK) {
std::cout << "Error delete field group. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << "Deleted the field group " << field_group_id << std::endl;
result = rdc_group_gpu_destroy(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error delete GPU group. Return: "
<< rdc_status_string(result);
goto cleanup;
}
std::cout << "Deleted the GPU group " << group_id << std::endl;
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
}
+130 -142
查看文件
@@ -21,162 +21,150 @@ THE SOFTWARE.
*/
#include <unistd.h>
#include <iostream>
#include "rdc/rdc.h"
// Example: record and print job statistics with RDC.
// Prompts on stdin for embedded (0) or standalone (1) mode, initializes RDC,
// creates an empty GPU group and adds GPU 0 to it, records stats for job_id
// for about 5 seconds, stops the job, prints the summary stats and shuts RDC
// down. Returns the last rdc_status_t stored in `result`.
// NOTE(review): this span shows the function twice (before and after
// reformatting) with the lines of the two versions interleaved.
// NOTE(review): rdc_handle is declared without an initializer. If rdc_init()
// fails, control jumps to `cleanup` before any handle has been assigned, and
// the cleanup code still passes rdc_handle to rdc_disconnect() or
// rdc_stop_embedded() - consider initializing it to nullptr.
int main(int, char **) {
rdc_status_t result;
rdc_handle_t rdc_handle;
bool standalone = false;
char hostIpAddress[] = {"127.0.0.1:50051"};
char group_name[] = {"group1"};
char job_id[] = {"123"};
int main(int, char**) {
rdc_status_t result;
rdc_handle_t rdc_handle;
bool standalone = false;
char hostIpAddress[] = {"127.0.0.1:50051"};
char group_name[] = {"group1"};
char job_id[] = {"123"};
// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n");
// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone?
"Standalone mode selected.\n":"Embedded mode selected.\n");
// Init the rdc
result = rdc_init(0);
// Init the rdc
result = rdc_init(0);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " <<
rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle,
nullptr, nullptr, nullptr);
if ( result != RDC_ST_OK ) {
std::cout << "Error connecting to remote rdcd. Return: "
<< rdc_status_string(result) << std::endl;
goto cleanup;
}
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_MANUAL, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error starting embedded RDC engine. Return: "
<< rdc_status_string(result) << std::endl;
goto cleanup;
}
}
// Now we can use the same API for both standalone and embedded
// (1) create group and add GPUs
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY,
group_name, &group_id);
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_MANUAL, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: "
<< rdc_status_string(result);
std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
}
// Now we can use the same API for both standalone and embedded
// (1) create group and add GPUs
rdc_gpu_group_t group_id;
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: " << rdc_status_string(result);
goto cleanup;
}
result = rdc_group_gpu_add(rdc_handle, group_id, 0); // Add GPU 0
if (result != RDC_ST_OK) {
std::cout << "Error adding group. Return: " << rdc_status_string(result);
goto cleanup;
}
// (2) start the recording. Set the sample frequency to once per second.
result = rdc_job_start_stats(rdc_handle, group_id, job_id, 1000000);
if (result != RDC_ST_OK) {
std::cout << "Error start job stats. Return: " << rdc_status_string(result);
goto cleanup;
}
// For standalone mode, the daemon will update and cache the samples
// In manual mode, we must call rdc_field_update_all periodically to
// take samples.
if (!standalone) { // embedded manual mode
for (int i = 5; i > 0; i--) { // As an example, we will take 5 samples
result = rdc_field_update_all(rdc_handle, 0);
if (result != RDC_ST_OK) {
std::cout << "Error update all fields. Return: " << rdc_status_string(result);
goto cleanup;
}
usleep(1000000);
}
} else { // standalone mode, do nothing
usleep(5000000); // sleep 5 seconds before fetch the stats
}
result = rdc_group_gpu_add(rdc_handle, group_id, 0); // Add GPU 0
if (result != RDC_ST_OK) {
std::cout << "Error adding group. Return: "
<< rdc_status_string(result);
goto cleanup;
}
// (3) stop the Slurm job, which will stop the watch
// We do not have to stop the job to get stats. The rdc_job_get_stats can be
// called at any time before stop
result = rdc_job_stop_stats(rdc_handle, job_id);
if (result != RDC_ST_OK) {
std::cout << "Error stop job stats. Return: " << rdc_status_string(result);
goto cleanup;
}
// (2) start the recording. Set the sample frequency to once per second.
result = rdc_job_start_stats(rdc_handle, group_id,
job_id, 1000000);
if (result != RDC_ST_OK) {
std::cout << "Error start job stats. Return: "
<< rdc_status_string(result);
goto cleanup;
}
// (4) Get the stats
rdc_job_info_t job_info;
result = rdc_job_get_stats(rdc_handle, job_id, &job_info);
// For standalone mode, the daemon will update and cache the samples
// In manual mode, we must call rdc_field_update_all periodically to
// take samples.
if (!standalone) { // embedded manual mode
for (int i=5; i > 0 ; i--) { // As an example, we will take 5 samples
result = rdc_field_update_all(rdc_handle, 0);
if (result != RDC_ST_OK) {
std::cout << "Error update all fields. Return: "
<< rdc_status_string(result);
goto cleanup;
}
usleep(1000000);
}
} else { // standalone mode, do nothing
usleep(5000000); // sleep 5 seconds before fetch the stats
}
if (result == RDC_ST_OK) {
std::cout << "|------- Execution Stats ----------+"
<< "------------------------------------\n";
std::cout << "| Start Time * | " << job_info.summary.start_time << "\n";
std::cout << "| End Time * | " << job_info.summary.end_time << "\n";
std::cout << "| Total Execution Time (sec) * | "
<< (job_info.summary.end_time - job_info.summary.start_time) << "\n";
std::cout << "+------- Performance Stats --------+"
<< "------------------------------------\n";
std::cout << "| Energy Consumed (Joules) | " << job_info.summary.energy_consumed
<< "\n";
std::cout << "| Power Usage (Watts) | "
<< "Max: " << job_info.summary.power_usage.max_value
<< " Min: " << job_info.summary.power_usage.min_value
<< " Avg: " << job_info.summary.power_usage.average << "\n";
std::cout << "| GPU Clock (MHz) | "
<< "Max: " << job_info.summary.gpu_clock.max_value
<< " Min: " << job_info.summary.gpu_clock.min_value
<< " Avg: " << job_info.summary.gpu_clock.average << "\n";
std::cout << "| GPU Utilization (%) | "
<< "Max: " << job_info.summary.gpu_utilization.max_value
<< " Min: " << job_info.summary.gpu_utilization.min_value
<< " Avg: " << job_info.summary.gpu_utilization.average << "\n";
std::cout << "| Max GPU Memory Used (bytes) * | " << job_info.summary.max_gpu_memory_used
<< "\n";
std::cout << "| Memory Utilization (%) | "
<< "Max: " << job_info.summary.memory_utilization.max_value
<< " Min: " << job_info.summary.memory_utilization.min_value
<< " Avg: " << job_info.summary.memory_utilization.average << "\n";
std::cout << "+----------------------------------+"
<< "------------------------------------\n";
} else {
std::cout << "No data for job stats found." << std::endl;
}
// (3) stop the Slurm job, which will stop the watch
// We do not have to stop the job to get stats. The rdc_job_get_stats can be
// called at any time before stop
result = rdc_job_stop_stats(rdc_handle, job_id);
if (result != RDC_ST_OK) {
std::cout << "Error stop job stats. Return: "
<< rdc_status_string(result);
goto cleanup;
}
// (4) Get the stats
rdc_job_info_t job_info;
result = rdc_job_get_stats(rdc_handle, job_id, &job_info);
if (result == RDC_ST_OK) {
std::cout << "|------- Execution Stats ----------+"
<<"------------------------------------\n";
std::cout << "| Start Time * | "
<< job_info.summary.start_time<< "\n";
std::cout << "| End Time * | "
<< job_info.summary.end_time <<"\n";
std::cout << "| Total Execution Time (sec) * | "
<< (job_info.summary.end_time-job_info.summary.start_time)
<< "\n";
std::cout << "+------- Performance Stats --------+"
<< "------------------------------------\n";
std::cout << "| Energy Consumed (Joules) | "
<< job_info.summary.energy_consumed << "\n";
std::cout << "| Power Usage (Watts) | "
<< "Max: " << job_info.summary.power_usage.max_value
<< " Min: "<< job_info.summary.power_usage.min_value
<< " Avg: "<< job_info.summary.power_usage.average << "\n";
std::cout << "| GPU Clock (MHz) | "
<< "Max: " <<job_info.summary.gpu_clock.max_value
<<" Min: " << job_info.summary.gpu_clock.min_value
<<" Avg: "<< job_info.summary.gpu_clock.average << "\n";
std::cout << "| GPU Utilization (%) | "
<< "Max: " <<job_info.summary.gpu_utilization.max_value
<<" Min: "<< job_info.summary.gpu_utilization.min_value
<<" Avg: "<< job_info.summary.gpu_utilization.average << "\n";
std::cout << "| Max GPU Memory Used (bytes) * | "
<< job_info.summary.max_gpu_memory_used << "\n";
std::cout << "| Memory Utilization (%) | "
<< "Max: " << job_info.summary.memory_utilization.max_value
<<" Min: "<< job_info.summary.memory_utilization.min_value
<<" Avg: "<< job_info.summary.memory_utilization.average << "\n";
std::cout << "+----------------------------------+"
<< "------------------------------------\n";
} else {
std::cout << "No data for job stats found." << std::endl;
}
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
// Cleanup consists of shutting down RDC.
cleanup:
std::cout << "Cleaning up.\n";
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
}
可執行檔 → 一般檔案
+247 -268
查看文件
@@ -51,26 +51,26 @@ extern "C" {
* @brief Error codes returned by rocm_rdc_lib functions
*/
typedef enum {
RDC_ST_OK = 0, //!< Success
RDC_ST_NOT_SUPPORTED, //!< Not supported feature
RDC_ST_MSI_ERROR, //!< The MSI library error
RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library
RDC_ST_INVALID_HANDLER, //!< Invalid handler
RDC_ST_BAD_PARAMETER, //!< A parameter is invalid
RDC_ST_NOT_FOUND, //!< Cannot find the value
RDC_ST_CONFLICT, //!< Conflict with current state
RDC_ST_CLIENT_ERROR, //!< The RDC client error
RDC_ST_ALREADY_EXIST, //!< The item already exists
RDC_ST_MAX_LIMIT, //!< Max limit recording for the object
RDC_ST_INSUFF_RESOURCES, //!< Not enough resources to complete
//!< operation
RDC_ST_FILE_ERROR, //!< Failed to access a file
RDC_ST_NO_DATA, //!< Data was requested,
//!< but none was found
RDC_ST_PERM_ERROR, //!< Insufficient permission to complete
//!< operation
RDC_ST_OK = 0, //!< Success
RDC_ST_NOT_SUPPORTED, //!< Not supported feature
RDC_ST_MSI_ERROR, //!< The MSI library error
RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library
RDC_ST_INVALID_HANDLER, //!< Invalid handler
RDC_ST_BAD_PARAMETER, //!< A parameter is invalid
RDC_ST_NOT_FOUND, //!< Cannot find the value
RDC_ST_CONFLICT, //!< Conflict with current state
RDC_ST_CLIENT_ERROR, //!< The RDC client error
RDC_ST_ALREADY_EXIST, //!< The item already exists
RDC_ST_MAX_LIMIT, //!< Max limit recording for the object
RDC_ST_INSUFF_RESOURCES, //!< Not enough resources to complete
//!< operation
RDC_ST_FILE_ERROR, //!< Failed to access a file
RDC_ST_NO_DATA, //!< Data was requested,
//!< but none was found
RDC_ST_PERM_ERROR, //!< Insufficient permission to complete
//!< operation
// NOTE(review): 0xFFFFFFFF is not representable as int. This header is wrapped
// in extern "C"; strict C (before C23) requires enumerator values to fit in
// int, so confirm that the C compilers in use accept this value.
RDC_ST_UNKNOWN_ERROR = 0xFFFFFFFF //!< Unknown error
RDC_ST_UNKNOWN_ERROR = 0xFFFFFFFF //!< Unknown error
} rdc_status_t;
/**
@@ -79,51 +79,42 @@ typedef enum {
* When run in manual mode, the user needs to periodically call
* rdc_field_update_all for data collection.
*/
// RDC_OPERATION_MODE_AUTO is 0; RDC_OPERATION_MODE_MANUAL implicitly takes the value 1.
typedef enum {
RDC_OPERATION_MODE_AUTO = 0,
RDC_OPERATION_MODE_MANUAL
} rdc_operation_mode_t;
typedef enum { RDC_OPERATION_MODE_AUTO = 0, RDC_OPERATION_MODE_MANUAL } rdc_operation_mode_t;
/**
* @brief Type of GPU group, as passed to rdc_group_gpu_create()
*/
typedef enum {
RDC_GROUP_DEFAULT = 0, //!< All GPUs on the Node
RDC_GROUP_EMPTY //!< Empty group
RDC_GROUP_DEFAULT = 0, //!< All GPUs on the Node
RDC_GROUP_EMPTY //!< Empty group
} rdc_group_type_t;
/**
* @brief The type stored in the field value
*
* NOTE(review): the enumerators are unprefixed (INTEGER, DOUBLE, STRING, BLOB)
* and, being members of a plain enum, are visible at file scope wherever this
* header is included - possible name clashes with other code.
*/
typedef enum {
INTEGER = 0,
DOUBLE,
STRING,
BLOB
} rdc_field_type_t;
typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;
//! ID used to represent an invalid GPU
//! NOTE(review): the value is negative, while rdc_gpu_usage_info_t::gpu_id is a
//! uint32_t documented to hold GPU_ID_INVALID; stored there it becomes 0xFFFFFFFF.
#define GPU_ID_INVALID (-1)
#define GPU_ID_INVALID (-1)
//! Used to specify all GPUs
#define RDC_GROUP_ALL_GPUS (-1000)
#define RDC_GROUP_ALL_GPUS (-1000)
//! Used to specify all stats fields
//! (expands to the same value, -1000, as RDC_GROUP_ALL_GPUS)
#define RDC_JOB_STATS_FIELDS (-1000)
#define RDC_JOB_STATS_FIELDS (-1000)
/**
* @brief The max rdc field string length
*/
#define RDC_MAX_STR_LENGTH 256
#define RDC_MAX_STR_LENGTH 256
/**
* @brief The max entities in a group
*/
#define RDC_GROUP_MAX_ENTITIES 64
#define RDC_GROUP_MAX_ENTITIES 64
/**
* @brief Max number of GPUs supported by RDC
*/
#define RDC_MAX_NUM_DEVICES 16
#define RDC_MAX_NUM_DEVICES 16
/**
* @brief The max fields in a field group
@@ -133,7 +124,7 @@ typedef enum {
/**
* @brief The max number of groups
*/
#define RDC_MAX_NUM_GROUPS 64
#define RDC_MAX_NUM_GROUPS 64
/**
* @brief The max number of the field groups
@@ -144,112 +135,112 @@ typedef enum {
* These enums are used to specify a particular field to be retrieved.
*/
typedef enum {
RDC_FI_INVALID = 0, //!< Invalid field value
RDC_FI_INVALID = 0, //!< Invalid field value
//!< @brief Identifier fields
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
RDC_FI_DEV_NAME, //!< Name of the device
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
RDC_FI_DEV_NAME, //!< Name of the device
/*
* @brief Frequency related fields
*/
RDC_FI_GPU_CLOCK = 100, //!< The current clock for the GPU
RDC_FI_MEM_CLOCK, //!< Clock for the memory
RDC_FI_GPU_CLOCK = 100, //!< The current clock for the GPU
RDC_FI_MEM_CLOCK, //!< Clock for the memory
/*
* @brief Physical monitor fields
*/
RDC_FI_MEMORY_TEMP = 200, //!< Memory temperature for the device
RDC_FI_GPU_TEMP, //!< Current temperature for the device
RDC_FI_POWER_USAGE = 300, //!< Power usage for the device
RDC_FI_MEMORY_TEMP = 200, //!< Memory temperature for the device
RDC_FI_GPU_TEMP, //!< Current temperature for the device
RDC_FI_POWER_USAGE = 300, //!< Power usage for the device
/*
* @brief PCIe related fields
*/
RDC_FI_PCIE_TX = 400, //!< PCIe Tx utilization information
RDC_FI_PCIE_RX, //!< PCIe Rx utilization information
RDC_FI_PCIE_TX = 400, //!< PCIe Tx utilization information
RDC_FI_PCIE_RX, //!< PCIe Rx utilization information
/*
* @brief GPU usage related fields
*/
RDC_FI_GPU_UTIL = 500, //!< GPU Utilization
RDC_FI_GPU_MEMORY_USAGE, //!< Memory usage of the GPU instance
RDC_FI_GPU_MEMORY_TOTAL, //!< Total memory of the GPU instance
RDC_FI_GPU_UTIL = 500, //!< GPU Utilization
RDC_FI_GPU_MEMORY_USAGE, //!< Memory usage of the GPU instance
RDC_FI_GPU_MEMORY_TOTAL, //!< Total memory of the GPU instance
/**
* @brief ECC related fields
*/
RDC_FI_ECC_CORRECT_TOTAL = 600, //!< Accumulated correctable ECC errors
RDC_FI_ECC_UNCORRECT_TOTAL, //!< Accumulated uncorrectable ECC errors
RDC_FI_ECC_CORRECT_TOTAL = 600, //!< Accumulated correctable ECC errors
RDC_FI_ECC_UNCORRECT_TOTAL, //!< Accumulated uncorrectable ECC errors
RDC_FI_ECC_SDMA_SEC, //!< SDMA Single Error Correction
RDC_FI_ECC_SDMA_DED, //!< SDMA Double Error Detection
RDC_FI_ECC_SDMA_SEC, //!< SDMA Single Error Correction
RDC_FI_ECC_SDMA_DED, //!< SDMA Double Error Detection
RDC_FI_ECC_GFX_SEC, //!< GFX Single Error Correction
RDC_FI_ECC_GFX_DED, //!< GFX Double Error Detection
RDC_FI_ECC_GFX_SEC, //!< GFX Single Error Correction
RDC_FI_ECC_GFX_DED, //!< GFX Double Error Detection
RDC_FI_ECC_MMHUB_SEC, //!< MMHUB Single Error Correction
RDC_FI_ECC_MMHUB_DED, //!< MMHUB Double Error Detection
RDC_FI_ECC_MMHUB_SEC, //!< MMHUB Single Error Correction
RDC_FI_ECC_MMHUB_DED, //!< MMHUB Double Error Detection
RDC_FI_ECC_ATHUB_SEC, //!< ATHUB Single Error Correction
RDC_FI_ECC_ATHUB_DED, //!< ATHUB Double Error Detection
RDC_FI_ECC_ATHUB_SEC, //!< ATHUB Single Error Correction
RDC_FI_ECC_ATHUB_DED, //!< ATHUB Double Error Detection
RDC_FI_ECC_BIF_SEC, //!< BIF Single Error Correction
RDC_FI_ECC_BIF_DED, //!< BIF Double Error Detection
RDC_FI_ECC_BIF_SEC, //!< BIF Single Error Correction
RDC_FI_ECC_BIF_DED, //!< BIF Double Error Detection
RDC_FI_ECC_HDP_SEC, //!< HDP Single Error Correction
RDC_FI_ECC_HDP_DED, //!< HDP Double Error Detection
RDC_FI_ECC_HDP_SEC, //!< HDP Single Error Correction
RDC_FI_ECC_HDP_DED, //!< HDP Double Error Detection
RDC_FI_ECC_XGMI_WAFL_SEC, //!< XGMI WAFL Single Error Correction
RDC_FI_ECC_XGMI_WAFL_DED, //!< XGMI WAFL Double Error Detection
RDC_FI_ECC_XGMI_WAFL_SEC, //!< XGMI WAFL Single Error Correction
RDC_FI_ECC_XGMI_WAFL_DED, //!< XGMI WAFL Double Error Detection
RDC_FI_ECC_DF_SEC, //!< DF Single Error Correction
RDC_FI_ECC_DF_DED, //!< DF Double Error Detection
RDC_FI_ECC_DF_SEC, //!< DF Single Error Correction
RDC_FI_ECC_DF_DED, //!< DF Double Error Detection
RDC_FI_ECC_SMN_SEC, //!< SMN Single Error Correction
RDC_FI_ECC_SMN_DED, //!< SMN Double Error Detection
RDC_FI_ECC_SMN_SEC, //!< SMN Single Error Correction
RDC_FI_ECC_SMN_DED, //!< SMN Double Error Detection
RDC_FI_ECC_SEM_SEC, //!< SEM Single Error Correction
RDC_FI_ECC_SEM_DED, //!< SEM Double Error Detection
RDC_FI_ECC_SEM_SEC, //!< SEM Single Error Correction
RDC_FI_ECC_SEM_DED, //!< SEM Double Error Detection
RDC_FI_ECC_MP0_SEC, //!< MP0 Single Error Correction
RDC_FI_ECC_MP0_DED, //!< MP0 Double Error Detection
RDC_FI_ECC_MP0_SEC, //!< MP0 Single Error Correction
RDC_FI_ECC_MP0_DED, //!< MP0 Double Error Detection
RDC_FI_ECC_MP1_SEC, //!< MP1 Single Error Correction
RDC_FI_ECC_MP1_DED, //!< MP1 Double Error Detection
RDC_FI_ECC_MP1_SEC, //!< MP1 Single Error Correction
RDC_FI_ECC_MP1_DED, //!< MP1 Double Error Detection
RDC_FI_ECC_FUSE_SEC, //!< FUSE Single Error Correction
RDC_FI_ECC_FUSE_DED, //!< FUSE Double Error Detection
RDC_FI_ECC_FUSE_SEC, //!< FUSE Single Error Correction
RDC_FI_ECC_FUSE_DED, //!< FUSE Double Error Detection
RDC_FI_ECC_UMC_SEC, //!< UMC Single Error Correction
RDC_FI_ECC_UMC_DED, //!< UMC Double Error Detection
RDC_FI_ECC_UMC_SEC, //!< UMC Single Error Correction
RDC_FI_ECC_UMC_DED, //!< UMC Double Error Detection
/**
* @brief ROC-profiler related fields
*/
RDC_FI_PROF_ELAPSED_CYCLES = 700, //!< Number of elapsed cycles over all SMs
RDC_FI_PROF_ACTIVE_WAVES, //!< Number of Active Waves
RDC_FI_PROF_ACTIVE_CYCLES, //!< Number of Active Cycles
RDC_FI_PROF_CU_OCCUPANCY, //!< Active Waves / maximum active Waves supported
RDC_FI_PROF_CU_UTILIZATION, //!< Total active cycles / Total elapsed cycles
RDC_FI_PROF_FETCH_SIZE, //!< Number of kilobytes fetched from video memory
RDC_FI_PROF_WRITE_SIZE, //!< Number of kilobytes written to video memory
RDC_FI_PROF_FLOPS_16, //!< Number of fp16 OPS / second
RDC_FI_PROF_FLOPS_32, //!< Number of fp32 OPS / second
RDC_FI_PROF_FLOPS_64, //!< Number of fp64 OPS / second
RDC_FI_PROF_GFLOPS_16, //!< Number of fp16 GOPS / second
RDC_FI_PROF_GFLOPS_32, //!< Number of fp32 GOPS / second
RDC_FI_PROF_GFLOPS_64, //!< Number of fp64 GOPS / second
RDC_FI_PROF_MEMR_BW_KBPNS, //!< HBM Read Bandwidth in kilobytes / nanosecond
RDC_FI_PROF_MEMW_BW_KBPNS, //!< HBM Write Bandwidth in kilobytes / nanosecond
RDC_FI_PROF_ELAPSED_CYCLES = 700, //!< Number of elapsed cycles over all SMs
RDC_FI_PROF_ACTIVE_WAVES, //!< Number of Active Waves
RDC_FI_PROF_ACTIVE_CYCLES, //!< Number of Active Cycles
RDC_FI_PROF_CU_OCCUPANCY, //!< Active Waves / maximum active Waves supported
RDC_FI_PROF_CU_UTILIZATION, //!< Total active cycles / Total elapsed cycles
RDC_FI_PROF_FETCH_SIZE, //!< Number of kilobytes fetched from video memory
RDC_FI_PROF_WRITE_SIZE, //!< Number of kilobytes written to video memory
RDC_FI_PROF_FLOPS_16, //!< Number of fp16 OPS / second
RDC_FI_PROF_FLOPS_32, //!< Number of fp32 OPS / second
RDC_FI_PROF_FLOPS_64, //!< Number of fp64 OPS / second
RDC_FI_PROF_GFLOPS_16, //!< Number of fp16 GOPS / second
RDC_FI_PROF_GFLOPS_32, //!< Number of fp32 GOPS / second
RDC_FI_PROF_GFLOPS_64, //!< Number of fp64 GOPS / second
RDC_FI_PROF_MEMR_BW_KBPNS, //!< HBM Read Bandwidth in kilobytes / nanosecond
RDC_FI_PROF_MEMW_BW_KBPNS, //!< HBM Write Bandwidth in kilobytes / nanosecond
/*
* @brief Raw XGMI counter events
*/
RDC_EVNT_XGMI_0_NOP_TX = 1000, //!< NOPs sent to neighbor 0
RDC_EVNT_XGMI_0_REQ_TX, //!< Outgoing requests to
//!< neighbor 0
RDC_EVNT_XGMI_0_RESP_TX, //!< Outgoing responses to
//!< neighbor 0
RDC_EVNT_XGMI_0_NOP_TX = 1000, //!< NOPs sent to neighbor 0
RDC_EVNT_XGMI_0_REQ_TX, //!< Outgoing requests to
//!< neighbor 0
RDC_EVNT_XGMI_0_RESP_TX, //!< Outgoing responses to
//!< neighbor 0
/**
* @brief
*
@@ -265,37 +256,37 @@ typedef enum {
*/
// ie, Throughput = BEATS/time_running 10^9 bytes/sec
RDC_EVNT_XGMI_0_BEATS_TX,
RDC_EVNT_XGMI_1_NOP_TX, //!< NOPs sent to neighbor 1
RDC_EVNT_XGMI_1_REQ_TX, //!< Outgoing requests to
//!< neighbor 1
RDC_EVNT_XGMI_1_RESP_TX, //!< Outgoing responses to
//!< neighbor 1
RDC_EVNT_XGMI_1_BEATS_TX, //!< Data beats sent to
//!< neighbor 1; Each beat
//!< represents 32 bytes
RDC_EVNT_XGMI_1_NOP_TX, //!< NOPs sent to neighbor 1
RDC_EVNT_XGMI_1_REQ_TX, //!< Outgoing requests to
//!< neighbor 1
RDC_EVNT_XGMI_1_RESP_TX, //!< Outgoing responses to
//!< neighbor 1
RDC_EVNT_XGMI_1_BEATS_TX, //!< Data beats sent to
//!< neighbor 1; Each beat
//!< represents 32 bytes
// "Composite" events. These events have additional processing beyond
// the value provided by the rocm_smi library.
RDC_EVNT_XGMI_0_THRPUT = 1500, //!< Transmit throughput to XGMI
//!< neighbor 0 in bytes/sec
RDC_EVNT_XGMI_1_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 1 in bytes/sec
RDC_EVNT_XGMI_2_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 2 in bytes/sec
RDC_EVNT_XGMI_3_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 3 in bytes/sec
RDC_EVNT_XGMI_4_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 4 in bytes/sec
RDC_EVNT_XGMI_5_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 5 in bytes/sec
RDC_EVNT_XGMI_0_THRPUT = 1500, //!< Transmit throughput to XGMI
//!< neighbor 0 in bytes/sec
RDC_EVNT_XGMI_1_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 1 in bytes/sec
RDC_EVNT_XGMI_2_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 2 in bytes/sec
RDC_EVNT_XGMI_3_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 3 in bytes/sec
RDC_EVNT_XGMI_4_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 4 in bytes/sec
RDC_EVNT_XGMI_5_THRPUT, //!< Transmit throughput to XGMI
//!< neighbor 5 in bytes/sec
RDC_EVNT_NOTIF_VMFAULT = 2000, //!< VM page fault
RDC_EVNT_NOTIF_VMFAULT = 2000, //!< VM page fault
RDC_EVNT_NOTIF_FIRST = RDC_EVNT_NOTIF_VMFAULT,
RDC_EVNT_NOTIF_THERMAL_THROTTLE, //!< Clock frequency has decreased
//!< due to temperature rise
RDC_EVNT_NOTIF_PRE_RESET, //!< GPU reset is about to occur
RDC_EVNT_NOTIF_POST_RESET, //!< GPU reset just occurred
RDC_EVNT_NOTIF_THERMAL_THROTTLE, //!< Clock frequency has decreased
//!< due to temperature rise
RDC_EVNT_NOTIF_PRE_RESET, //!< GPU reset is about to occur
RDC_EVNT_NOTIF_POST_RESET, //!< GPU reset just occurred
RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_POST_RESET,
} rdc_field_t;
@@ -304,153 +295,153 @@ typedef enum {
/**
* @brief Handle and ID types used in various rdc calls
*
* NOTE(review): rdc_gpu_group_t and rdc_field_grp_t are both plain uint32_t
* aliases, so the compiler will not flag a GPU group ID passed where a field
* group ID is expected (or vice versa).
*/
typedef void *rdc_handle_t; //!< Handle used for an RDC session
typedef uint32_t rdc_gpu_group_t; //!< GPU Group ID type
typedef uint32_t rdc_field_grp_t; //!< Field group ID type
typedef void* rdc_handle_t; //!< Handle used for an RDC session
typedef uint32_t rdc_gpu_group_t; //!< GPU Group ID type
typedef uint32_t rdc_field_grp_t; //!< Field group ID type
/**
* @brief Represents attributes corresponding to a device
*
* Currently holds only the device name, in a fixed-size buffer of
* RDC_MAX_STR_LENGTH chars.
*/
typedef struct {
char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device.
char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device.
} rdc_device_attributes_t;
/**
* @brief The structure to store the group info
*
* entity_ids has room for RDC_GROUP_MAX_ENTITIES entries; presumably only the
* first `count` entries are meaningful (the example code iterates that way) -
* TODO confirm.
* NOTE(review): `count` is unsigned int here but uint32_t in
* rdc_field_group_info_t.
*/
typedef struct {
unsigned int count; //!< count of GPUs in the group
char group_name[RDC_MAX_STR_LENGTH]; //!< group name
/**
* The list of entities in the group
*/
uint32_t entity_ids[RDC_GROUP_MAX_ENTITIES];
unsigned int count; //!< count of GPUs in the group
char group_name[RDC_MAX_STR_LENGTH]; //!< group name
/**
* The list of entities in the group
*/
uint32_t entity_ids[RDC_GROUP_MAX_ENTITIES];
} rdc_group_info_t;
/**
* @brief The structure to store summary of data
*
* max_value, min_value and average are unsigned 64-bit integers, so a
* fractional average cannot be represented; only standard_deviation is a
* double.
*/
typedef struct {
uint64_t max_value; //!< Maximum value measured
uint64_t min_value; //!< Minimum value measured
uint64_t average; //!< Average value measured
double standard_deviation; //!< The standard deviation
uint64_t max_value; //!< Maximum value measured
uint64_t min_value; //!< Minimum value measured
uint64_t average; //!< Average value measured
double standard_deviation; //!< The standard deviation
} rdc_stats_summary_t;
/**
* @brief The structure to hold the GPU usage information
*
* NOTE(review): gpu_id is unsigned but is documented to hold GPU_ID_INVALID,
* which is defined as (-1); the stored value is therefore 0xFFFFFFFF.
* NOTE(review): the units of start_time/end_time are not stated here; the job
* stats example prints (end_time - start_time) as seconds - TODO confirm.
*/
typedef struct {
uint32_t gpu_id; //!< GPU_ID_INVALID for summary information
uint64_t start_time; //!< The time to start the watching
uint64_t end_time; //!< The time to stop the watching
uint32_t gpu_id; //!< GPU_ID_INVALID for summary information
uint64_t start_time; //!< The time to start the watching
uint64_t end_time; //!< The time to stop the watching
uint64_t energy_consumed; //!< GPU Energy consumed
uint64_t ecc_correct; //!< Correctable errors
uint64_t ecc_uncorrect; //!< Uncorrectable errors
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
rdc_stats_summary_t gpu_clock; //!< GPU Clock speed stats
rdc_stats_summary_t memory_clock; //!< Mem. Clock speed stats
rdc_stats_summary_t gpu_utilization; //!< GPU Utilization stats
rdc_stats_summary_t gpu_temperature; //!< GPU temperature stats
uint64_t energy_consumed; //!< GPU Energy consumed
uint64_t ecc_correct; //!< Correctable errors
uint64_t ecc_uncorrect; //!< Uncorrectable errors
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
rdc_stats_summary_t gpu_clock; //!< GPU Clock speed stats
rdc_stats_summary_t memory_clock; //!< Mem. Clock speed stats
rdc_stats_summary_t gpu_utilization; //!< GPU Utilization stats
rdc_stats_summary_t gpu_temperature; //!< GPU temperature stats
uint64_t max_gpu_memory_used; //!< Maximum GPU memory used
rdc_stats_summary_t memory_utilization; //!< Memory Utilization statistics
} rdc_gpu_usage_info_t; //!< GPU usage statistics
uint64_t max_gpu_memory_used; //!< Maximum GPU memory used
rdc_stats_summary_t memory_utilization; //!< Memory Utilization statistics
} rdc_gpu_usage_info_t; //!< GPU usage statistics
/**
* @brief The structure to hold the job stats
*
* NOTE(review): the gpus array is sized with the literal 16, which equals
* RDC_MAX_NUM_DEVICES; the two would need to be kept in sync by hand.
*/
typedef struct {
uint32_t num_gpus; //!< Number of GPUs used by job
rdc_gpu_usage_info_t summary; //!< Job usage summary statistics
//!< (overall)
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU
uint32_t num_gpus; //!< Number of GPUs used by job
rdc_gpu_usage_info_t summary; //!< Job usage summary statistics
//!< (overall)
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU
} rdc_job_info_t;
/**
* @brief Field value data
*/
typedef union {
int64_t l_int;
double dbl;
char str[RDC_MAX_STR_LENGTH];
int64_t l_int;
double dbl;
char str[RDC_MAX_STR_LENGTH];
} rdc_field_value_data;
/**
* @brief The structure to store the field value
*/
typedef struct {
rdc_field_t field_id; //!< The field id of the value
int status; //!< RDC_ST_OK or error status
uint64_t ts; //!< Timestamp in usec since 1970
rdc_field_type_t type; //!< The field type
rdc_field_value_data value; //!< Value of the field. Value type
//!< depends on the field type.
rdc_field_t field_id; //!< The field id of the value
int status; //!< RDC_ST_OK or error status
uint64_t ts; //!< Timestamp in usec since 1970
rdc_field_type_t type; //!< The field type
rdc_field_value_data value; //!< Value of the field. Value type
//!< depends on the field type.
} rdc_field_value;
/**
* @brief The structure to store the field group info
*/
typedef struct {
uint32_t count; //!< count of fields in the group
char group_name[RDC_MAX_STR_LENGTH]; //!< field group name
/**
* The list of fields in the group
*/
rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP];
uint32_t count; //!< count of fields in the group
char group_name[RDC_MAX_STR_LENGTH]; //!< field group name
/**
* The list of fields in the group
*/
rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP];
} rdc_field_group_info_t;
/**
* @brief The structure to store the job info
*/
typedef struct {
char job_id[RDC_MAX_STR_LENGTH]; //!< job id
rdc_gpu_group_t group_id; //!< group name
uint64_t start_time; //!< job start time
uint64_t stop_time; //!< job stop time
char job_id[RDC_MAX_STR_LENGTH]; //!< job id
rdc_gpu_group_t group_id; //!< group name
uint64_t start_time; //!< job start time
uint64_t stop_time; //!< job stop time
} rdc_job_group_info_t;
/**
* @brief type of diagnostic level
*/
typedef enum {
RDC_DIAG_LVL_INVALID = 0, //!< invalid level
RDC_DIAG_LVL_SHORT, //!< take a few seconds to run
RDC_DIAG_LVL_MED, //!< take less than 2 minutes to run
RDC_DIAG_LVL_LONG //!< take up to 15 minutes to run
RDC_DIAG_LVL_INVALID = 0, //!< invalid level
RDC_DIAG_LVL_SHORT, //!< take a few seconds to run
RDC_DIAG_LVL_MED, //!< take less than 2 minutes to run
RDC_DIAG_LVL_LONG //!< take up to 15 minutes to run
} rdc_diag_level_t;
/**
* @brief type of diagnostic result
*/
typedef enum {
RDC_DIAG_RESULT_PASS, //!< The diagnostic test pass
RDC_DIAG_RESULT_SKIP, //!< The diagnostic test skipped
RDC_DIAG_RESULT_WARN, //!< The diagnostic test has warnings
RDC_DIAG_RESULT_FAIL //!< The diagnostic test fail
RDC_DIAG_RESULT_PASS, //!< The diagnostic test pass
RDC_DIAG_RESULT_SKIP, //!< The diagnostic test skipped
RDC_DIAG_RESULT_WARN, //!< The diagnostic test has warnings
RDC_DIAG_RESULT_FAIL //!< The diagnostic test fail
} rdc_diag_result_t;
/**
* @brief The test cases to run
*/
typedef enum {
RDC_DIAG_TEST_FIRST = 0,
//!< The diagnostic test pass
RDC_DIAG_COMPUTE_PROCESS = RDC_DIAG_TEST_FIRST,
RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready
RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory
RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology
RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range
RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS
RDC_DIAG_TEST_FIRST = 0,
//!< The diagnostic test pass
RDC_DIAG_COMPUTE_PROCESS = RDC_DIAG_TEST_FIRST,
RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready
RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory
RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology
RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range
RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS
} rdc_diag_test_cases_t;
/**
* @brief The maximum test cases to run
*/
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1)
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1)
/**
* @brief The maximum length of the diagnostic messages
@@ -461,39 +452,39 @@ typedef enum {
* @brief details of the diagnostic errors
*/
typedef struct {
char msg[MAX_DIAG_MSG_LENGTH]; //!< The test result details
uint32_t code; //!< The low level error code
char msg[MAX_DIAG_MSG_LENGTH]; //!< The test result details
uint32_t code; //!< The low level error code
} rdc_diag_detail_t;
/**
* @brief details of the per gpu diagnostic results
*/
typedef struct {
uint32_t gpu_index; //!< The GPU index
rdc_diag_detail_t gpu_result; //!< The detail results
uint32_t gpu_index; //!< The GPU index
rdc_diag_detail_t gpu_result; //!< The detail results
} rdc_diag_per_gpu_result_t;
/**
* @brief The diagnostic results for all GPUs
*/
typedef struct {
rdc_diag_result_t status; //!< The diagnostic result
rdc_diag_detail_t details; //!< The summary details
rdc_diag_test_cases_t test_case; //!< The test case to run
rdc_diag_result_t status; //!< The diagnostic result
rdc_diag_detail_t details; //!< The summary details
rdc_diag_test_cases_t test_case; //!< The test case to run
uint32_t per_gpu_result_count; //!< How many gpu_results
//!< Result details
rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES];
uint32_t per_gpu_result_count; //!< How many gpu_results
//!< Result details
rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES];
char info[MAX_DIAG_MSG_LENGTH]; //!< Detail information
char info[MAX_DIAG_MSG_LENGTH]; //!< Detail information
} rdc_diag_test_result_t;
/**
* @brief The diagnostic responses for test cases
*/
typedef struct {
uint32_t results_count;
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
uint32_t results_count;
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
} rdc_diag_response_t;
/**
@@ -524,17 +515,17 @@ rdc_status_t rdc_shutdown();
* rdc_field_update_all() when op_mode is RDC_OPERATION_MODE_MANUAL, which
* tells RDC to collect the stats.
*
* @param[in] op_mode Operation modes. When RDC_OPERATION_MODE_AUTO, RDC schedules
* background task to collect the stats. When RDC_OPERATION_MODE_MANUAL, the user
* needs to call rdc_field_update_all() periodically.
* @param[in] op_mode Operation modes. When RDC_OPERATION_MODE_AUTO, RDC
* schedules background task to collect the stats. When
* RDC_OPERATION_MODE_MANUAL, the user needs to call rdc_field_update_all()
* periodically.
*
* @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon
* successful call, the value will contain the handler for following API calls.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode,
rdc_handle_t* p_rdc_handle);
rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, rdc_handle_t* p_rdc_handle);
/**
* @brief Stop embedded RDC agent.
@@ -573,8 +564,8 @@ rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle);
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_connect(const char *ipAndPort, rdc_handle_t* p_rdc_handle,
const char* root_ca, const char* client_cert, const char* client_key);
rdc_status_t rdc_connect(const char* ipAndPort, rdc_handle_t* p_rdc_handle, const char* root_ca,
const char* client_cert, const char* client_key);
/**
* @brief Disconnect from rdcd daemon.
@@ -606,8 +597,8 @@ rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle);
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id, const char job_id[64], uint64_t update_freq);
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
const char job_id[64], uint64_t update_freq);
/**
* @brief Get the stats of the job using the job id.
@@ -624,8 +615,8 @@ rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle,
const char job_id[64], rdc_job_info_t* p_job_info);
rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, const char job_id[64],
rdc_job_info_t* p_job_info);
/**
* @brief Request RDC to stop watching the stats of the job
@@ -640,8 +631,7 @@ rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle,
const char job_id[64]);
rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, const char job_id[64]);
/**
* @brief Request RDC to stop tracking the job given by job_id
@@ -684,8 +674,7 @@ rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle);
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle,
uint32_t wait_for_update);
rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, uint32_t wait_for_update);
/**
* @brief Get indexes corresponding to all the devices on the system.
@@ -704,7 +693,7 @@ rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle,
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count);
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count);
/**
* @brief Gets device attributes corresponding to the gpu_index.
@@ -720,8 +709,8 @@ rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle,
uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr);
rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr);
/**
* @brief Create a group that contains multiple GPUs
@@ -744,9 +733,8 @@ rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle,
rdc_group_type_t type, const char* group_name,
rdc_gpu_group_t* p_rdc_group_id);
rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, rdc_group_type_t type,
const char* group_name, rdc_gpu_group_t* p_rdc_group_id);
/**
* @brief Add a GPU to the group
@@ -761,8 +749,8 @@ rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id, uint32_t gpu_index);
rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
uint32_t gpu_index);
/**
* @brief Get information about a GPU group
@@ -780,8 +768,8 @@ rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info);
rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info);
/**
* @brief Used to get information about all GPU groups in the system.
@@ -797,8 +785,8 @@ rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id_list[], uint32_t* count);
rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id_list[],
uint32_t* count);
/**
* @brief Destroy GPU group represented by p_rdc_group_id
@@ -811,8 +799,7 @@ rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t p_rdc_group_id);
rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id);
/**
* @brief create a group of fields
@@ -834,9 +821,9 @@ rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle,
uint32_t num_field_ids, rdc_field_t* field_ids,
const char* field_group_name, rdc_field_grp_t* rdc_field_group_id);
rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, uint32_t num_field_ids,
rdc_field_t* field_ids, const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id);
/**
* @brief Get information about a field group
@@ -854,9 +841,8 @@ rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle,
rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info);
rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info);
/**
* @brief Used to get information about all field groups in the system.
@@ -873,7 +859,7 @@ rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle,
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_field_get_all_ids(rdc_handle_t p_rdc_handle,
rdc_field_grp_t field_group_id_list[], uint32_t* count);
rdc_field_grp_t field_group_id_list[], uint32_t* count);
/**
* @brief Destroy field group represented by rdc_field_group_id
@@ -886,8 +872,7 @@ rdc_status_t rdc_group_field_get_all_ids(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
rdc_field_grp_t rdc_field_group_id);
rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id);
/**
* @brief Request that RDC start recording updates for a given field
@@ -911,9 +896,9 @@ rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
uint64_t update_freq, double max_keep_age, uint32_t max_keep_samples);
rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples);
/**
* @brief Request the latest cached value of a field for a GPU
@@ -930,8 +915,8 @@ rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle,
uint32_t gpu_index, rdc_field_t field, rdc_field_value* value);
rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
rdc_field_t field, rdc_field_value* value);
/**
* @brief Request the cached history of a field's values for a GPU
@@ -954,9 +939,9 @@ rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle,
uint32_t gpu_index, rdc_field_t field, uint64_t since_time_stamp,
uint64_t *next_since_time_stamp, rdc_field_value* value);
rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
rdc_field_t field, uint64_t since_time_stamp,
uint64_t* next_since_time_stamp, rdc_field_value* value);
/**
* @brief Stop record updates for a given field collection.
@@ -971,8 +956,8 @@ rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id);
rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id);
/**
* @brief Run the diagnostic test cases
@@ -991,11 +976,8 @@ rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle,
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_diagnostic_run(
rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response);
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_level_t level, rdc_diag_response_t* response);
/**
* @brief Run one diagnostic test case
@@ -1012,11 +994,8 @@ rdc_status_t rdc_diagnostic_run(
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_test_case_run(
rdc_handle_t p_rdc_handle,
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result);
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result);
/**
* @brief Get a description of a provided RDC error status
+28 -30
查看文件
@@ -22,50 +22,48 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_RDCCACHEMANAGER_H_
#define INCLUDE_RDC_LIB_RDCCACHEMANAGER_H_
#include <memory>
#include <utility>
#include <string>
#include <vector>
#include <map>
#include "rdc_lib/rdc_common.h"
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
class RdcCacheManager {
public:
virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index,
rdc_field_t field, rdc_field_value* value) = 0;
virtual rdc_status_t rdc_field_get_value_since(uint32_t gpu_index,
rdc_field_t field, uint64_t since_time_stamp,
uint64_t *next_since_time_stamp, rdc_field_value* value) = 0;
virtual rdc_status_t rdc_update_cache(uint32_t gpu_index,
const rdc_field_value& value) = 0;
virtual rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id,
uint64_t max_keep_samples, double max_keep_age) = 0;
virtual std::string get_cache_stats() = 0;
virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
rdc_field_value* value) = 0;
virtual rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field,
uint64_t since_time_stamp,
uint64_t* next_since_time_stamp,
rdc_field_value* value) = 0;
virtual rdc_status_t rdc_update_cache(uint32_t gpu_index, const rdc_field_value& value) = 0;
virtual rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id,
uint64_t max_keep_samples, double max_keep_age) = 0;
virtual std::string get_cache_stats() = 0;
virtual rdc_status_t rdc_job_get_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauges,
rdc_job_info_t* p_job_info) = 0;
virtual rdc_status_t rdc_job_start_stats(const char job_id[64],
const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo,
const rdc_gpu_gauges_t& gpu_gauges) = 0;
virtual rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index,
const std::string& job_id, const rdc_field_value& value) = 0;
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
virtual rdc_status_t rdc_job_get_stats(const char job_id[64], const rdc_gpu_gauges_t& gpu_gauges,
rdc_job_info_t* p_job_info) = 0;
virtual rdc_status_t rdc_job_start_stats(const char job_id[64], const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo,
const rdc_gpu_gauges_t& gpu_gauges) = 0;
virtual rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id,
const rdc_field_value& value) = 0;
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
virtual ~RdcCacheManager() {}
virtual ~RdcCacheManager() {}
};
typedef std::shared_ptr<RdcCacheManager> RdcCacheManagerPtr;
} // namespace rdc
} // namespace amd
+14 -19
查看文件
@@ -23,6 +23,7 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_
#include <memory>
#include "rdc/rdc.h"
#include "rdc_lib/RdcDiagnosticLibInterface.h"
@@ -31,33 +32,27 @@ namespace rdc {
class RdcDiagnostic {
public:
// get support test cases
virtual rdc_status_t rdc_diag_test_cases_query(
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) = 0;
// get support test cases
virtual rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) = 0;
// Run a specific test case
virtual rdc_status_t rdc_test_case_run(
rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) = 0;
// Run a specific test case
virtual rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count, rdc_diag_test_result_t* result) = 0;
// Run multiple test cases
virtual rdc_status_t rdc_diagnostic_run(
const rdc_group_info_t& gpus,
rdc_diag_level_t level,
rdc_diag_response_t* response) = 0;
// Run multiple test cases
virtual rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
rdc_diag_response_t* response) = 0;
virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0;
virtual rdc_status_t rdc_diag_destroy() = 0;
virtual rdc_status_t rdc_diag_init(uint64_t flags) = 0;
virtual rdc_status_t rdc_diag_destroy() = 0;
virtual ~RdcDiagnostic() {}
virtual ~RdcDiagnostic() {}
};
typedef std::shared_ptr<RdcDiagnostic> RdcDiagnosticPtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTIC_H_
+6 -12
查看文件
@@ -25,29 +25,23 @@ THE SOFTWARE.
// The diagnostic interface for libraries, for example, RAS.
#include <rdc/rdc.h>
extern "C" {
// The library will implement below function
// Which test cases are supported in the library
rdc_status_t rdc_diag_test_cases_query(
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count);
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count);
// Run a specific test case
rdc_status_t rdc_diag_test_case_run(
rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t rdc_diag_init(uint64_t flags);
rdc_status_t rdc_diag_destroy();
}
#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_
#endif // INCLUDE_RDC_LIB_RDCDIAGNOSTICLIBINTERFACE_H_
+3 -3
查看文件
@@ -25,6 +25,7 @@ THE SOFTWARE.
#include <exception>
#include <string>
#include "rdc/rdc.h"
namespace amd {
@@ -32,8 +33,8 @@ namespace rdc {
class RdcException : public std::exception {
public:
RdcException(rdc_status_t error, const std::string description) :
err_(error), desc_(description) {}
RdcException(rdc_status_t error, const std::string description)
: err_(error), desc_(description) {}
rdc_status_t error_code() const noexcept { return err_; }
const char* what() const noexcept override { return desc_.c_str(); }
@@ -46,4 +47,3 @@ class RdcException : public std::exception {
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCEXCEPTION_H_
+18 -24
查看文件
@@ -23,39 +23,33 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_RDCGROUPSETTINGS_H_
#include <memory>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
class RdcGroupSettings {
public:
virtual rdc_status_t rdc_group_gpu_create(const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_destroy(
rdc_gpu_group_t p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_add(
rdc_gpu_group_t groupId, uint32_t gpu_index) = 0;
virtual rdc_status_t rdc_group_gpu_get_info(
rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) = 0;
virtual rdc_status_t rdc_group_get_all_ids(
rdc_gpu_group_t group_id_list[], uint32_t* count) = 0;
virtual rdc_status_t rdc_group_gpu_create(const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) = 0;
virtual rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) = 0;
virtual rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) = 0;
virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids,
const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) = 0;
virtual rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) = 0;
virtual rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) = 0;
virtual rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
uint32_t* count) = 0;
virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
rdc_field_t* field_ids, const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) = 0;
virtual rdc_status_t rdc_group_field_destroy(
rdc_field_grp_t rdc_field_group_id) = 0;
virtual rdc_status_t rdc_group_field_get_info(
rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) = 0;
virtual rdc_status_t rdc_group_field_get_all_ids(
rdc_field_grp_t field_group_id_list[], uint32_t* count) = 0;
virtual ~RdcGroupSettings() {}
virtual ~RdcGroupSettings() {}
};
typedef std::shared_ptr<RdcGroupSettings> RdcGroupSettingsPtr;
+49 -58
查看文件
@@ -22,8 +22,8 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_RDCHANDLER_H_
#define INCLUDE_RDC_LIB_RDCHANDLER_H_
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
@@ -31,70 +31,61 @@ namespace rdc {
// Interface class
class RdcHandler {
public:
// Job API
virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
const char job_id[64], uint64_t update_freq) = 0;
virtual rdc_status_t rdc_job_get_stats(const char jobId[64],
rdc_job_info_t* p_job_info)= 0;
virtual rdc_status_t rdc_job_stop_stats(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
// Job API
virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64],
uint64_t update_freq) = 0;
virtual rdc_status_t rdc_job_get_stats(const char jobId[64], rdc_job_info_t* p_job_info) = 0;
virtual rdc_status_t rdc_job_stop_stats(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
// Discovery API
virtual rdc_status_t rdc_device_get_all(
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) = 0;
virtual rdc_status_t rdc_device_get_attributes(uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr) = 0;
// Discovery API
virtual rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES],
uint32_t* count) = 0;
virtual rdc_status_t rdc_device_get_attributes(uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr) = 0;
// Group API
virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
const char* group_name, rdc_gpu_group_t* p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
uint32_t gpu_index) = 0;
virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
rdc_field_t* field_ids, const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) = 0;
virtual rdc_status_t rdc_group_field_get_info(
rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) = 0;
virtual rdc_status_t rdc_group_gpu_get_info(
rdc_gpu_group_t p_rdc_group_id, rdc_group_info_t* p_rdc_group_info) = 0;
virtual rdc_status_t rdc_group_get_all_ids(
rdc_gpu_group_t group_id_list[], uint32_t* count) = 0;
virtual rdc_status_t rdc_group_field_get_all_ids(
rdc_field_grp_t field_group_id_list[], uint32_t* count) = 0;
virtual rdc_status_t rdc_group_gpu_destroy(
rdc_gpu_group_t p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_field_destroy(
rdc_field_grp_t rdc_field_group_id) = 0;
// Group API
virtual rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) = 0;
virtual rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids,
const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) = 0;
virtual rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) = 0;
virtual rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) = 0;
virtual rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) = 0;
virtual rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
uint32_t* count) = 0;
virtual rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) = 0;
virtual rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) = 0;
// Field API
virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples) = 0;
virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index,
rdc_field_t field, rdc_field_value* value) = 0;
virtual rdc_status_t rdc_field_get_value_since(uint32_t gpu_index,
rdc_field_t field, uint64_t since_time_stamp,
uint64_t *next_since_time_stamp, rdc_field_value* value) = 0;
virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) = 0;
// Field API
virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) = 0;
virtual rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
rdc_field_value* value) = 0;
virtual rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field,
uint64_t since_time_stamp,
uint64_t* next_since_time_stamp,
rdc_field_value* value) = 0;
virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) = 0;
// Diagnostic API
virtual rdc_status_t rdc_diagnostic_run(
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response) = 0;
// Diagnostic API
virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
rdc_diag_response_t* response) = 0;
virtual rdc_status_t rdc_test_case_run(
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) = 0;
virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) = 0;
// Control API
virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0;
// Control API
virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0;
virtual ~RdcHandler(){}
virtual ~RdcHandler() {}
};
} // namespace rdc
+40 -43
查看文件
@@ -23,78 +23,75 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_RDCLIBRARYLOADER_H_
#include <dlfcn.h>
#include <string.h>
#include <map>
#include <mutex> // NOLINT(build/c++11)
#include <mutex> // NOLINT(build/c++11)
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
class RdcLibraryLoader {
public:
RdcLibraryLoader();
RdcLibraryLoader();
rdc_status_t load(const char* filename);
rdc_status_t load(const char* filename);
template<typename T> rdc_status_t load_symbol(T* func_handler,
const char* func_name);
template <typename T>
rdc_status_t load_symbol(T* func_handler, const char* func_name);
template<typename T> rdc_status_t load(const char* filename,
T* func_make_handler);
template <typename T>
rdc_status_t load(const char* filename, T* func_make_handler);
rdc_status_t unload();
rdc_status_t unload();
~RdcLibraryLoader();
~RdcLibraryLoader();
private:
void* libHandler_;
std::mutex library_mutex_;
void* libHandler_;
std::mutex library_mutex_;
};
template<typename T> rdc_status_t RdcLibraryLoader::load_symbol(T* func_handler,
const char* func_name) {
if (!libHandler_) {
RDC_LOG(RDC_ERROR, "Must load the library before load the symbol");
return RDC_ST_FAIL_LOAD_MODULE;
}
template <typename T>
rdc_status_t RdcLibraryLoader::load_symbol(T* func_handler, const char* func_name) {
if (!libHandler_) {
RDC_LOG(RDC_ERROR, "Must load the library before load the symbol");
return RDC_ST_FAIL_LOAD_MODULE;
}
if (!func_handler || !func_name) {
return RDC_ST_FAIL_LOAD_MODULE;
}
if (!func_handler || !func_name) {
return RDC_ST_FAIL_LOAD_MODULE;
}
std::lock_guard<std::mutex> guard(library_mutex_);
std::lock_guard<std::mutex> guard(library_mutex_);
*reinterpret_cast<void**>(func_handler) =
dlsym(libHandler_, func_name);
if (*func_handler == nullptr) {
char* error = dlerror();
RDC_LOG(RDC_ERROR, "RdcLibraryLoader: Fail to load the symbol "
<< func_name << ": " << error);
return RDC_ST_FAIL_LOAD_MODULE;
}
*reinterpret_cast<void**>(func_handler) = dlsym(libHandler_, func_name);
if (*func_handler == nullptr) {
char* error = dlerror();
RDC_LOG(RDC_ERROR, "RdcLibraryLoader: Fail to load the symbol " << func_name << ": " << error);
return RDC_ST_FAIL_LOAD_MODULE;
}
return RDC_ST_OK;
return RDC_ST_OK;
}
template <typename T>
rdc_status_t RdcLibraryLoader::load(const char* filename, T* func_make_handler) {
if (filename == nullptr || func_make_handler == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
template<typename T> rdc_status_t RdcLibraryLoader::load(const char* filename,
T* func_make_handler) {
if (filename == nullptr || func_make_handler == nullptr) {
return RDC_ST_FAIL_LOAD_MODULE;
}
rdc_status_t status = load(filename);
if (status != RDC_ST_OK) {
return status;
}
rdc_status_t status = load(filename);
if (status != RDC_ST_OK) {
return status;
}
return load_symbol(func_make_handler, "make_handler");
return load_symbol(func_make_handler, "make_handler");
}
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCLIBRARYLOADER_H_
+11 -17
查看文件
@@ -21,39 +21,33 @@ THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_RDCLOGGER_H_
#define INCLUDE_RDC_LIB_RDCLOGGER_H_
#include <chrono> // NOLINT
#include <iostream>
#include <string>
#include <chrono> // NOLINT
namespace amd {
namespace rdc {
class RdcLogger {
public:
explicit RdcLogger(std::ostream& os);
explicit RdcLogger(std::ostream& os);
static RdcLogger& getLogger() {
static RdcLogger logger(std::cout);
return logger;
}
static RdcLogger& getLogger() {
static RdcLogger logger(std::cout);
return logger;
}
bool should_log(uint32_t severity) {
return log_level_ >= severity;
}
bool should_log(uint32_t severity) { return log_level_ >= severity; }
std::ostream& get_ostream() {
return os_;
}
std::ostream& get_ostream() { return os_; }
std::string get_log_header(uint32_t severity,
const char* file, int line);
std::string get_log_header(uint32_t severity, const char* file, int line);
private:
std::ostream& os_;
uint32_t log_level_;
std::ostream& os_;
uint32_t log_level_;
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCLOGGER_H_
+12 -12
查看文件
@@ -24,26 +24,26 @@ THE SOFTWARE.
#include <memory>
#include <vector>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
#include "rdc/rdc.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
class RdcMetricFetcher {
class RdcMetricFetcher {
public:
virtual rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t delete_rsmi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t delete_rsmi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t fetch_smi_field(uint32_t gpu_index,
rdc_field_t field_id, rdc_field_value* value) = 0;
virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) = 0;
virtual rdc_status_t bulk_fetch_smi_fields(
rdc_gpu_field_t* fields, uint32_t fields_count,
std::vector<rdc_gpu_field_value_t>& results) = 0; // NOLINT
virtual ~RdcMetricFetcher() {}
virtual rdc_status_t bulk_fetch_smi_fields(
rdc_gpu_field_t* fields, uint32_t fields_count,
std::vector<rdc_gpu_field_value_t>& results) = 0; // NOLINT
virtual ~RdcMetricFetcher() {}
};
typedef std::shared_ptr<RdcMetricFetcher> RdcMetricFetcherPtr;
+2 -3
查看文件
@@ -29,8 +29,8 @@ namespace rdc {
class RdcMetricsUpdater {
public:
virtual void start() = 0;
virtual void stop() = 0;
virtual void start() = 0;
virtual void stop() = 0;
};
typedef std::shared_ptr<RdcMetricsUpdater> RdcMetricsUpdaterPtr;
@@ -38,5 +38,4 @@ typedef std::shared_ptr<RdcMetricsUpdater> RdcMetricsUpdaterPtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCMETRICSUPDATER_H_
+5 -5
查看文件
@@ -23,18 +23,19 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_RDCMODULEMGR_H_
#include <memory>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
class RdcModuleMgr {
public:
virtual RdcTelemetryPtr get_telemetry_module() = 0;
virtual RdcDiagnosticPtr get_diagnostic_module() = 0;
virtual RdcTelemetryPtr get_telemetry_module() = 0;
virtual RdcDiagnosticPtr get_diagnostic_module() = 0;
};
typedef std::shared_ptr<RdcModuleMgr> RdcModuleMgrPtr;
@@ -42,5 +43,4 @@ typedef std::shared_ptr<RdcModuleMgr> RdcModuleMgrPtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCMODULEMGR_H_
+11 -13
查看文件
@@ -24,8 +24,9 @@ THE SOFTWARE.
#include <memory>
#include <vector>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
@@ -33,24 +34,22 @@ namespace rdc {
extern const uint32_t kMaxRSMIEvents;
typedef struct {
uint32_t gpu_id;
rdc_field_value field;
uint32_t gpu_id;
rdc_field_value field;
} rdc_evnt_notification_t;
class RdcNotification {
public:
virtual bool is_notification_event(rdc_field_t field) const = 0;
virtual bool is_notification_event(rdc_field_t field) const = 0;
virtual rdc_status_t
set_listen_events(const std::vector<RdcFieldKey> fk_arr) = 0;
virtual rdc_status_t set_listen_events(const std::vector<RdcFieldKey> fk_arr) = 0;
// Blocking
virtual rdc_status_t
listen(rdc_evnt_notification_t *events, uint32_t *num_events,
uint32_t timeout_ms) = 0;
// Blocking
virtual rdc_status_t listen(rdc_evnt_notification_t* events, uint32_t* num_events,
uint32_t timeout_ms) = 0;
virtual rdc_status_t stop_listening(uint32_t gpu_id) = 0;
virtual ~RdcNotification() {}
virtual rdc_status_t stop_listening(uint32_t gpu_id) = 0;
virtual ~RdcNotification() {}
};
typedef std::shared_ptr<RdcNotification> RdcNotificationPtr;
@@ -59,4 +58,3 @@ typedef std::shared_ptr<RdcNotification> RdcNotificationPtr;
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCNOTIFICATION_H_
可執行檔 → 一般檔案
+4 -5
查看文件
@@ -24,9 +24,10 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_RDCRdcPerfTimer_H_
#include <stdint.h>
#include <iostream>
#include <vector>
#include <string>
#include <vector>
/// \file
/// Timer related class.
@@ -37,9 +38,9 @@ class RdcPerfTimer {
private:
struct Timer {
std::string name; /* < name name of time object*/
uint64_t _freq; /* < _freq frequency*/
uint64_t _freq; /* < _freq frequency*/
uint64_t _clocks; /* < _clocks number of ticks at end*/
uint64_t _start; /* < _start start point ticks*/
uint64_t _start; /* < _start start point ticks*/
};
std::vector<Timer*> _timers; /*< _timers vector to Timer objects */
@@ -80,9 +81,7 @@ class RdcPerfTimer {
uint64_t MeasureTSCFreqHz();
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCRdcPerfTimer_H_
+14 -14
查看文件
@@ -23,6 +23,7 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_RDCTELEMETRY_H_
#include <memory>
#include "rdc/rdc.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
@@ -31,27 +32,26 @@ namespace rdc {
class RdcTelemetry {
public:
// Get the supported field ids
virtual rdc_status_t rdc_telemetry_fields_query(
uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) = 0;
// Get the supported field ids
virtual rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) = 0;
// Fetch
virtual rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count, rdc_field_value_f callback,
void* user_data) = 0;
// Fetch
virtual rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count,
rdc_field_value_f callback,
void* user_data) = 0;
virtual rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) = 0;
virtual rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) = 0;
virtual rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) = 0;
virtual rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) = 0;
virtual ~RdcTelemetry() {}
virtual ~RdcTelemetry() {}
};
typedef std::shared_ptr<RdcTelemetry> RdcTelemetryPtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_RDCTELEMETRY_H_
+13 -18
查看文件
@@ -24,45 +24,40 @@ THE SOFTWARE.
// The telemetry interface for libraries, for example, RAS.
#include <rdc/rdc.h>
#include <cstdint>
extern "C" {
// Structure to keep both the GPU index and the field value
typedef struct {
uint32_t gpu_index;
rdc_field_value field_value;
uint32_t gpu_index;
rdc_field_value field_value;
} rdc_gpu_field_value_t;
typedef struct {
uint32_t gpu_index;
rdc_field_t field_id;
uint32_t gpu_index;
rdc_field_t field_id;
} rdc_gpu_field_t;
#define MAX_NUM_FIELDS 8192
typedef rdc_status_t(*rdc_field_value_f)(rdc_gpu_field_value_t* values,
uint32_t num_values, void* user_data);
typedef rdc_status_t (*rdc_field_value_f)(rdc_gpu_field_value_t* values, uint32_t num_values,
void* user_data);
// The library will implement the functions below
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count);
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count);
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count, rdc_field_value_f callback,
void* user_data);
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count,
rdc_field_value_f callback, void* user_data);
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count);
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count);
rdc_status_t rdc_module_init(uint64_t flags);
rdc_status_t rdc_module_destroy();
}
#endif // INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
#endif // INCLUDE_RDC_LIB_RDCTELEMETRYLIBINTERFACE_H_
+17 -17
查看文件
@@ -24,33 +24,33 @@ THE SOFTWARE.
#include <memory>
#include <vector>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
class RdcWatchTable {
public:
virtual rdc_status_t rdc_field_update_all() = 0;
virtual rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) = 0;
virtual rdc_status_t rdc_field_update_all() = 0;
virtual rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) = 0;
virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id,
const char job_id[64], uint64_t update_freq,
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
virtual rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64],
uint64_t update_freq,
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) = 0;
virtual rdc_status_t rdc_job_remove(const char job_id[64]) = 0;
virtual rdc_status_t rdc_job_remove_all() = 0;
virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples) = 0;
virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) = 0;
virtual rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) = 0;
virtual rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) = 0;
virtual ~RdcWatchTable() {}
virtual ~RdcWatchTable() {}
};
typedef std::shared_ptr<RdcWatchTable> RdcWatchTablePtr;
+55 -60
查看文件
@@ -22,15 +22,15 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_
#include <map>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <string>
#include <vector>
#include <map>
#include "rdc/rdc.h"
#include "rdc_lib/RdcCacheManager.h"
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
namespace amd {
namespace rdc {
@@ -39,89 +39,84 @@ namespace rdc {
// types and arrays (no pointers). If a pointer is added, make sure to update
// any code that copies this structure.
struct RdcCacheEntry {
uint64_t last_time;
rdc_field_type_t type;
rdc_field_value_data value;
uint64_t last_time;
rdc_field_type_t type;
rdc_field_value_data value;
};
typedef std::map<RdcFieldKey, std::vector<RdcCacheEntry>> RdcCacheSamples;
struct FieldSummaryStats {
int64_t max_value;
int64_t min_value;
int64_t total_value;
int64_t max_value;
int64_t min_value;
int64_t total_value;
// Use Welford's algorithm to calculate the standard deviation.
// https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
// https://www.johndcook.com/blog/standard_deviation/
double old_m;
double old_s;
double new_m;
double new_s;
// Use Welford's algorithm to calculate the standard deviation.
// https://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
// https://www.johndcook.com/blog/standard_deviation/
double old_m;
double old_s;
double new_m;
double new_s;
uint64_t last_time;
uint64_t count;
uint64_t last_time;
uint64_t count;
};
struct GpuSummaryStats {
uint64_t energy_consumed;
uint64_t energy_last_time;
uint64_t ecc_correct_init; // Init counter when job starts
uint64_t ecc_uncorrect_init; // Init counter when job starts
std::map<uint32_t, FieldSummaryStats> field_summaries;
uint64_t energy_consumed;
uint64_t energy_last_time;
uint64_t ecc_correct_init; // Init counter when job starts
uint64_t ecc_uncorrect_init; // Init counter when job starts
std::map<uint32_t, FieldSummaryStats> field_summaries;
};
// Per job entry
struct RdcJobStatsCacheEntry {
uint64_t start_time;
uint64_t end_time;
std::map<uint32_t, GpuSummaryStats> gpu_stats;
uint64_t start_time;
uint64_t end_time;
std::map<uint32_t, GpuSummaryStats> gpu_stats;
};
// <job_id, job_stats>
typedef std::map<std::string, RdcJobStatsCacheEntry> RdcJobStatsCache;
class RdcCacheManagerImpl: public RdcCacheManager {
class RdcCacheManagerImpl : public RdcCacheManager {
public:
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index,
rdc_field_t field, rdc_field_value* value) override;
rdc_status_t rdc_field_get_value_since(uint32_t gpu_index,
rdc_field_t field, uint64_t since_time_stamp,
uint64_t *next_since_time_stamp, rdc_field_value* value) override;
rdc_status_t rdc_update_cache(uint32_t gpu_index,
const rdc_field_value& value) override;
rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id,
uint64_t max_keep_samples, double max_keep_age) override;
std::string get_cache_stats() override;
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
rdc_field_value* value) override;
rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field,
uint64_t since_time_stamp, uint64_t* next_since_time_stamp,
rdc_field_value* value) override;
rdc_status_t rdc_update_cache(uint32_t gpu_index, const rdc_field_value& value) override;
rdc_status_t evict_cache(uint32_t gpu_index, rdc_field_t field_id, uint64_t max_keep_samples,
double max_keep_age) override;
std::string get_cache_stats() override;
rdc_status_t rdc_job_get_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauges,
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_start_stats(const char job_id[64],
const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo,
const rdc_gpu_gauges_t& gpu_gauges) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_update_job_stats(uint32_t gpu_index,
const std::string& job_id,
const rdc_field_value& value) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
rdc_status_t rdc_job_get_stats(const char job_id[64], const rdc_gpu_gauges_t& gpu_gauges,
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_start_stats(const char job_id[64], const rdc_group_info_t& group,
const rdc_field_group_info_t& finfo,
const rdc_gpu_gauges_t& gpu_gauges) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_update_job_stats(uint32_t gpu_index, const std::string& job_id,
const rdc_field_value& value) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
private:
void set_summary(const FieldSummaryStats & stats,
rdc_stats_summary_t& gpu, rdc_stats_summary_t& summary, // NOLINT
unsigned int adjuster);
void set_average_summary(
rdc_stats_summary_t& summary, uint32_t num_gpus); // NOLINT
RdcCacheSamples cache_samples_;
RdcJobStatsCache cache_jobs_;
std::mutex cache_mutex_;
void set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu,
rdc_stats_summary_t& summary, // NOLINT
unsigned int adjuster);
void set_average_summary(rdc_stats_summary_t& summary,
uint32_t num_gpus); // NOLINT
RdcCacheSamples cache_samples_;
RdcJobStatsCache cache_jobs_;
std::mutex cache_mutex_;
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCCACHEMANAGERIMPL_H_
+21 -28
查看文件
@@ -22,10 +22,11 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_
#define INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_
#include <map>
#include <list>
#include <vector>
#include <map>
#include <memory>
#include <vector>
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
@@ -35,37 +36,30 @@ namespace rdc {
class RdcDiagnosticModule : public RdcDiagnostic {
public:
rdc_status_t rdc_diag_test_cases_query(
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(
rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(
const rdc_group_info_t& gpus,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
explicit RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher);
explicit RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher);
private:
//< Helper function to dispatch fields to module
void get_fields_for_module(
rdc_gpu_field_t* fields,
uint32_t fields_count,
std::map<RdcDiagnosticPtr, std::vector<rdc_gpu_field_t>>
& fields_in_module,
std::vector<rdc_gpu_field_value_t>& unsupport_fields); // NOLINT
std::list<RdcDiagnosticPtr> diagnostic_modules_;
std::map<rdc_diag_test_cases_t, RdcDiagnosticPtr> testcases_to_module_;
//< Helper function to dispatch fields to module
void get_fields_for_module(
rdc_gpu_field_t* fields, uint32_t fields_count,
std::map<RdcDiagnosticPtr, std::vector<rdc_gpu_field_t>>& fields_in_module,
std::vector<rdc_gpu_field_value_t>& unsupport_fields); // NOLINT
std::list<RdcDiagnosticPtr> diagnostic_modules_;
std::map<rdc_diag_test_cases_t, RdcDiagnosticPtr> testcases_to_module_;
};
typedef std::shared_ptr<RdcDiagnosticModule> RdcDiagnosticModulePtr;
@@ -73,5 +67,4 @@ typedef std::shared_ptr<RdcDiagnosticModule> RdcDiagnosticModulePtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_DIAGNOSTICMODULE_H_
+63 -74
查看文件
@@ -23,102 +23,91 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_IMPL_RDCEMBEDDEDHANDLER_H_
#include <future> // NOLINT(build/c++11)
#include "rdc_lib/RdcHandler.h"
#include "rdc_lib/RdcGroupSettings.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcCacheManager.h"
#include "rdc_lib/RdcGroupSettings.h"
#include "rdc_lib/RdcHandler.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcMetricsUpdater.h"
#include "rdc_lib/RdcWatchTable.h"
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/RdcWatchTable.h"
namespace amd {
namespace rdc {
class RdcEmbeddedHandler: public RdcHandler {
class RdcEmbeddedHandler : public RdcHandler {
public:
// Job API
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
const char job_id[64], uint64_t update_freq) override;
rdc_status_t rdc_job_get_stats(const char jobId[64],
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64]) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
// Job API
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64],
uint64_t update_freq) override;
rdc_status_t rdc_job_get_stats(const char jobId[64], rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64]) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
// Discovery API
rdc_status_t rdc_device_get_all(
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override;
rdc_status_t rdc_device_get_attributes(uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr) override;
// Discovery API
rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES],
uint32_t* count) override;
rdc_status_t rdc_device_get_attributes(uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr) override;
// Group API
rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
uint32_t gpu_index) override;
rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
rdc_field_t* field_ids, const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) override;
rdc_status_t rdc_group_field_get_info(
rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) override;
rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) override;
rdc_status_t rdc_group_get_all_ids(
rdc_gpu_group_t group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_field_get_all_ids(
rdc_field_grp_t field_group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_gpu_destroy(
rdc_gpu_group_t p_rdc_group_id) override;
rdc_status_t rdc_group_field_destroy(
rdc_field_grp_t rdc_field_group_id) override;
// Group API
rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) override;
rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids,
const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) override;
rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) override;
rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) override;
rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
uint32_t* count) override;
rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) override;
rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) override;
// Field API
rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples) override;
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index,
rdc_field_t field, rdc_field_value* value) override;
rdc_status_t rdc_field_get_value_since(uint32_t gpu_index,
rdc_field_t field, uint64_t since_time_stamp,
uint64_t *next_since_time_stamp, rdc_field_value* value) override;
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) override;
// Diagnostic API
rdc_status_t rdc_diagnostic_run(
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_test_case_run(
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) override;
// Field API
rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) override;
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
rdc_field_value* value) override;
rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field,
uint64_t since_time_stamp, uint64_t* next_since_time_stamp,
rdc_field_value* value) override;
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) override;
// Control API
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
// Control API
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
~RdcEmbeddedHandler();
explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode);
~RdcEmbeddedHandler();
private:
rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges);
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcMetricFetcherPtr metric_fetcher_;
RdcModuleMgrPtr rdc_module_mgr_;
RdcNotificationPtr rdc_notif_;
RdcWatchTablePtr watch_table_;
RdcMetricsUpdaterPtr metrics_updater_;
std::future<void> updater_;
rdc_status_t get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges);
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcMetricFetcherPtr metric_fetcher_;
RdcModuleMgrPtr rdc_module_mgr_;
RdcNotificationPtr rdc_notif_;
RdcWatchTablePtr watch_table_;
RdcMetricsUpdaterPtr metrics_updater_;
std::future<void> updater_;
};
} // namespace rdc
} // namespace amd
extern "C" {
amd::rdc::RdcHandler *make_handler(rdc_operation_mode_t op_mode);
amd::rdc::RdcHandler* make_handler(rdc_operation_mode_t op_mode);
}
#endif // INCLUDE_RDC_LIB_IMPL_RDCEMBEDDEDHANDLER_H_
+26 -31
查看文件
@@ -22,52 +22,47 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
#include <memory>
#include <map>
#include <mutex> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include "rdc_lib/RdcGroupSettings.h"
namespace amd {
namespace rdc {
class RdcGroupSettingsImpl: public RdcGroupSettings {
class RdcGroupSettingsImpl : public RdcGroupSettings {
public:
rdc_status_t rdc_group_gpu_create(
const char* group_name, rdc_gpu_group_t* p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_destroy(
rdc_gpu_group_t p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
uint32_t gpu_index) override;
rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) override;
rdc_status_t rdc_group_get_all_ids(
rdc_gpu_group_t group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_gpu_create(const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) override;
rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) override;
rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
rdc_field_t* field_ids, const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) override;
rdc_status_t rdc_group_field_destroy(
rdc_field_grp_t rdc_field_group_id) override;
rdc_status_t rdc_group_field_get_info(
rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) override;
rdc_status_t rdc_group_field_get_all_ids(
rdc_field_grp_t field_group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids,
const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) override;
rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) override;
rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) override;
rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
uint32_t* count) override;
RdcGroupSettingsImpl();
RdcGroupSettingsImpl();
private:
std::map<rdc_gpu_group_t, rdc_group_info_t> gpu_group_;
std::map<rdc_field_grp_t, rdc_field_group_info_t> field_group_;
uint32_t cur_group_id_ = 1;
uint32_t cur_field_group_id_ = 0;
std::mutex group_mutex_;
std::mutex field_group_mutex_;
std::map<rdc_gpu_group_t, rdc_group_info_t> gpu_group_;
std::map<rdc_field_grp_t, rdc_field_group_info_t> field_group_;
uint32_t cur_group_id_ = 1;
uint32_t cur_field_group_id_ = 0;
std::mutex group_mutex_;
std::mutex field_group_mutex_;
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCGROUPSETTINGSIMPL_H_
+41 -42
查看文件
@@ -22,12 +22,13 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCMETRICFETCHERIMPL_H_
#include <mutex> // NOLINT(build/c++11)
#include <future> // NOLINT(build/c++11)
#include <memory>
#include <condition_variable> // NOLINT(build/c++11)
#include <future> // NOLINT(build/c++11)
#include <map>
#include <memory>
#include <mutex> // NOLINT(build/c++11)
#include <queue>
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/rdc_common.h"
#include "rocm_smi/rocm_smi.h"
@@ -38,9 +39,9 @@ namespace rdc {
//!< Some metrics, like PCIe throughput, may take a second to retrieve. The
//!< MetricValue will cache those metrics for asynchronous retrieval.
struct MetricValue {
uint64_t cache_ttl;
uint64_t last_time;
rdc_field_value value;
uint64_t cache_ttl;
uint64_t last_time;
rdc_field_value value;
};
// This union represents any RSMI handles that require initialization and/or
@@ -49,56 +50,54 @@ struct MetricValue {
// underlying raw event, then only one FieldRSMIData should be created,
// and it should be used by both events.
struct FieldRSMIData {
union {
rsmi_event_handle_t evt_handle;
};
union {
rsmi_counter_value_t counter_val;
};
~FieldRSMIData() {}
FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0}{}
union {
rsmi_event_handle_t evt_handle;
};
union {
rsmi_counter_value_t counter_val;
};
~FieldRSMIData() {}
FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0} {}
};
//!< The data structure to store the async fetch task
class RdcMetricFetcherImpl;
struct MetricTask {
RdcFieldKey field;
std::function<void(RdcMetricFetcherImpl&, RdcFieldKey)> task;
RdcFieldKey field;
std::function<void(RdcMetricFetcherImpl&, RdcFieldKey)> task;
};
class RdcMetricFetcherImpl: public RdcMetricFetcher {
class RdcMetricFetcherImpl : public RdcMetricFetcher {
public:
rdc_status_t fetch_smi_field(uint32_t gpu_index,
rdc_field_t field_id, rdc_field_value* value) override;
rdc_status_t bulk_fetch_smi_fields(
rdc_gpu_field_t* fields, uint32_t fields_count,
std::vector<rdc_gpu_field_value_t>& results) override; // NOLINT
RdcMetricFetcherImpl();
~RdcMetricFetcherImpl();
rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) override;
rdc_status_t bulk_fetch_smi_fields(
rdc_gpu_field_t* fields, uint32_t fields_count,
std::vector<rdc_gpu_field_value_t>& results) override; // NOLINT
RdcMetricFetcherImpl();
~RdcMetricFetcherImpl();
rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) override;
rdc_status_t delete_rsmi_handle(RdcFieldKey fk) override;
rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) override;
rdc_status_t delete_rsmi_handle(RdcFieldKey fk) override;
private:
std::shared_ptr<FieldRSMIData> get_rsmi_data(RdcFieldKey key);
std::shared_ptr<FieldRSMIData> get_rsmi_data(RdcFieldKey key);
uint64_t now();
void get_ecc_error(uint32_t gpu_index,
rdc_field_t field_id, rdc_field_value* value);
uint64_t now();
void get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
//!< return true if starting async_get
bool async_get_pcie_throughput(uint32_t gpu_index,
rdc_field_t field_id, rdc_field_value* value);
void get_pcie_throughput(const RdcFieldKey& key);
//!< return true if starting async_get
bool async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
void get_pcie_throughput(const RdcFieldKey& key);
//!< Async metric retrieval
std::map<RdcFieldKey, MetricValue> async_metrics_;
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>> rsmi_data_;
std::queue<MetricTask> updated_tasks_;
std::mutex task_mutex_;
std::future<void> updater_; // keep the future of updater
std::condition_variable cv_;
std::atomic<bool> task_started_;
//!< Async metric retrieval
std::map<RdcFieldKey, MetricValue> async_metrics_;
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>> rsmi_data_;
std::queue<MetricTask> updated_tasks_;
std::mutex task_mutex_;
std::future<void> updater_; // keep the future of updater
std::condition_variable cv_;
std::atomic<bool> task_started_;
};
rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi);
+12 -10
查看文件
@@ -24,24 +24,26 @@ THE SOFTWARE.
#include <future> // NOLINT(build/c++11)
#include <memory>
#include "rdc_lib/RdcMetricsUpdater.h"
#include "rdc_lib/RdcWatchTable.h"
namespace amd {
namespace rdc {
class RdcMetricsUpdaterImpl: public RdcMetricsUpdater {
class RdcMetricsUpdaterImpl : public RdcMetricsUpdater {
public:
void start() override;
void stop() override;
explicit RdcMetricsUpdaterImpl(const RdcWatchTablePtr& watch_table,
const uint32_t check_frequency);
void start() override;
void stop() override;
explicit RdcMetricsUpdaterImpl(const RdcWatchTablePtr& watch_table,
const uint32_t check_frequency);
private:
RdcWatchTablePtr watch_table_;
std::atomic<bool> started_;
std::future<void> updater_; // keep the future of updater
std::future<void> notif_updater_; // keep the future of notif updater
const uint32_t _check_frequency; // Check frequency in milliseconds
RdcWatchTablePtr watch_table_;
std::atomic<bool> started_;
std::future<void> updater_; // keep the future of updater
std::future<void> notif_updater_; // keep the future of notif updater
const uint32_t _check_frequency; // Check frequency in milliseconds
};
} // namespace rdc
+8 -8
查看文件
@@ -36,17 +36,17 @@ namespace rdc {
class RdcModuleMgrImpl : public RdcModuleMgr {
public:
RdcTelemetryPtr get_telemetry_module() override;
RdcDiagnosticPtr get_diagnostic_module() override;
explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher);
RdcTelemetryPtr get_telemetry_module() override;
RdcDiagnosticPtr get_diagnostic_module() override;
explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher);
private:
// Function module
RdcTelemetryPtr rdc_telemetry_module_;
RdcDiagnosticPtr rdc_diagnostic_module_;
// Function module
RdcTelemetryPtr rdc_telemetry_module_;
RdcDiagnosticPtr rdc_diagnostic_module_;
// Domain module
RdcMetricFetcherPtr fetcher_;
// Domain module
RdcMetricFetcherPtr fetcher_;
};
} // namespace rdc
+14 -18
查看文件
@@ -22,40 +22,36 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_
#include <memory>
#include <vector>
#include <map>
#include <memory>
#include <mutex>
#include <vector>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
class RdcNotificationImpl : public RdcNotification {
public:
RdcNotificationImpl();
~RdcNotificationImpl();
RdcNotificationImpl();
~RdcNotificationImpl();
bool is_notification_event(rdc_field_t field) const override;
rdc_status_t set_listen_events(
const std::vector<RdcFieldKey> fk_arr) override;
// Blocking
rdc_status_t listen(rdc_evnt_notification_t *events,
uint32_t *num_events, uint32_t timeout_ms) override;
rdc_status_t stop_listening(uint32_t gpu_id) override;
bool is_notification_event(rdc_field_t field) const override;
rdc_status_t set_listen_events(const std::vector<RdcFieldKey> fk_arr) override;
// Blocking
rdc_status_t listen(rdc_evnt_notification_t* events, uint32_t* num_events,
uint32_t timeout_ms) override;
rdc_status_t stop_listening(uint32_t gpu_id) override;
private:
std::map<uint32_t, uint64_t> gpu_evnt_notif_masks_;
std::mutex notif_mutex_;
std::map<uint32_t, uint64_t> gpu_evnt_notif_masks_;
std::mutex notif_mutex_;
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCNOTIFICATIONIMPL_H_
+35 -45
查看文件
@@ -22,76 +22,66 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_
#define INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_
#include <map>
#include <list>
#include <vector>
#include <memory>
#include <algorithm>
#include <list>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/RdcLibraryLoader.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/RdcDiagnostic.h"
namespace amd {
namespace rdc {
class RdcRasLib: public RdcTelemetry, public RdcDiagnostic {
class RdcRasLib : public RdcTelemetry, public RdcDiagnostic {
public:
// get support field ids
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) override;
// get support field ids
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) override;
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count, rdc_field_value_f callback,
void* user_data) override;
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count,
rdc_field_value_f callback, void* user_data) override;
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_diag_test_cases_query(
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(
rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(
const rdc_group_info_t& gpus,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
explicit RdcRasLib();
RdcRasLib();
~RdcRasLib();
~RdcRasLib();
private:
RdcLibraryLoader lib_loader_;
rdc_status_t (*fields_value_get_)(rdc_gpu_field_t*,
uint32_t, rdc_field_value_f, void*);
rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*);
RdcLibraryLoader lib_loader_;
rdc_status_t (*fields_value_get_)(rdc_gpu_field_t*, uint32_t, rdc_field_value_f, void*);
rdc_status_t (*fields_query_)(uint32_t[MAX_NUM_FIELDS], uint32_t*);
rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t);
rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t);
rdc_status_t (*fields_watch_)(rdc_gpu_field_t*, uint32_t);
rdc_status_t (*fields_unwatch_)(rdc_gpu_field_t*, uint32_t);
rdc_status_t (*rdc_module_init_)(uint64_t);
rdc_status_t (*rdc_module_destroy_)();
rdc_status_t (*rdc_module_init_)(uint64_t);
rdc_status_t (*rdc_module_destroy_)();
};
typedef std::shared_ptr<RdcRasLib> RdcRasLibPtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCRASLIB_H_
+28 -43
查看文件
@@ -34,63 +34,48 @@ namespace rdc {
class RdcRocpLib : public RdcTelemetry {
public:
/* Telemetry */
/* Telemetry */
// get support field ids
rdc_status_t rdc_telemetry_fields_query(
uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) override;
// get support field ids
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) override;
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(
rdc_gpu_field_t* fields,
uint32_t fields_count,
rdc_field_value_f callback,
void* user_data) override;
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count,
rdc_field_value_f callback, void* user_data) override;
rdc_status_t rdc_telemetry_fields_watch(
rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_unwatch(
rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) override;
explicit RdcRocpLib(const char* lib_name);
explicit RdcRocpLib(const char* lib_name);
~RdcRocpLib();
~RdcRocpLib();
private:
RdcLibraryLoader lib_loader_;
RdcLibraryLoader lib_loader_;
rdc_status_t (*telemetry_fields_query_)(
uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count);
rdc_status_t (*telemetry_fields_query_)(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count);
rdc_status_t (*telemetry_fields_value_get_)(
rdc_gpu_field_t* fields,
uint32_t fields_count,
rdc_field_value_f callback,
void* user_data);
rdc_status_t (*telemetry_fields_value_get_)(rdc_gpu_field_t* fields, uint32_t fields_count,
rdc_field_value_f callback, void* user_data);
rdc_status_t (*telemetry_fields_watch_)(
rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t (*telemetry_fields_watch_)(rdc_gpu_field_t* fields, uint32_t fields_count);
rdc_status_t (*telemetry_fields_unwatch_)(
rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t (*telemetry_fields_unwatch_)(rdc_gpu_field_t* fields, uint32_t fields_count);
/**
* @brief Extract current ROCM_PATH from library or the environment
*/
std::string get_rocm_path();
/**
* @brief Extract current ROCM_PATH from library or the environment
*/
std::string get_rocm_path();
/**
* @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by
* librocmtools
*/
rdc_status_t set_rocmtools_path();
/**
* @brief Set ROCMTOOLS_METRICS_PATH environment variable needed by
* librocmtools
*/
rdc_status_t set_rocmtools_path();
};
using RdcRocpLibPtr = std::shared_ptr<RdcRocpLib>;
+21 -27
查看文件
@@ -22,48 +22,42 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCROCRLIB_H_
#define INCLUDE_RDC_LIB_IMPL_RDCROCRLIB_H_
#include <vector>
#include <memory>
#include "rdc_lib/RdcLibraryLoader.h"
#include <vector>
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/RdcLibraryLoader.h"
namespace amd {
namespace rdc {
class RdcRocrLib : public RdcDiagnostic {
public:
rdc_status_t rdc_diag_test_cases_query(
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(
rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(
const rdc_group_info_t& gpus,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
explicit RdcRocrLib();
RdcRocrLib();
~RdcRocrLib();
~RdcRocrLib();
private:
RdcLibraryLoader lib_loader_;
rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t,
uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
rdc_diag_test_result_t*);
rdc_status_t (*diag_test_cases_query_)(
rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
rdc_status_t (*diag_init_)(uint64_t);
rdc_status_t (*diag_destroy_)();
RdcLibraryLoader lib_loader_;
rdc_status_t (*test_case_run_)(rdc_diag_test_cases_t, uint32_t[RDC_MAX_NUM_DEVICES], uint32_t,
rdc_diag_test_result_t*);
rdc_status_t (*diag_test_cases_query_)(rdc_diag_test_cases_t[MAX_TEST_CASES], uint32_t*);
rdc_status_t (*diag_init_)(uint64_t);
rdc_status_t (*diag_destroy_)();
};
typedef std::shared_ptr<RdcRocrLib> RdcRocrLibPtr;
+17 -26
查看文件
@@ -23,8 +23,9 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_IMPL_RDCSMIDIAGNOSTICIMPL_H_
#include <memory>
#include <string>
#include "rdc_lib/rdc_common.h"
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
@@ -32,35 +33,25 @@ namespace rdc {
class RdcSmiDiagnosticImpl {
public:
RdcSmiDiagnosticImpl();
RdcSmiDiagnosticImpl();
rdc_status_t check_rsmi_process_info(
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_rsmi_topo_info(
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_rsmi_param_info(
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
private:
rdc_diag_result_t check_temperature_level(uint32_t gpu_index
, rsmi_temperature_type_t type
, char msg[MAX_DIAG_MSG_LENGTH]
, char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
std::string get_temperature_string(
rsmi_temperature_type_t type) const;
rdc_diag_result_t check_temperature_level(uint32_t gpu_index, rsmi_temperature_type_t type,
char msg[MAX_DIAG_MSG_LENGTH],
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
std::string get_temperature_string(rsmi_temperature_type_t type) const;
rdc_diag_result_t check_voltage_level(uint32_t gpu_index
, rsmi_voltage_type_t type
, char msg[MAX_DIAG_MSG_LENGTH]
, char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
std::string get_voltage_string(
rsmi_voltage_type_t type) const;
rdc_diag_result_t check_voltage_level(uint32_t gpu_index, rsmi_voltage_type_t type,
char msg[MAX_DIAG_MSG_LENGTH],
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
std::string get_voltage_string(rsmi_voltage_type_t type) const;
};
typedef std::shared_ptr<RdcSmiDiagnosticImpl> RdcSmiDiagnosticPtr;
+26 -32
查看文件
@@ -22,11 +22,12 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCSMILIB_H_
#define INCLUDE_RDC_LIB_IMPL_RDCSMILIB_H_
#include <vector>
#include <memory>
#include <vector>
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/RdcDiagnostic.h"
#include "rdc_lib/impl/RdcSmiDiagnosticImpl.h"
namespace amd {
@@ -34,45 +35,38 @@ namespace rdc {
class RdcSmiLib : public RdcTelemetry, public RdcDiagnostic {
public:
// get support field ids
rdc_status_t rdc_telemetry_fields_query(
uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) override;
// get support field ids
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) override;
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count, rdc_field_value_f callback,
void* user_data) override;
// Fetch
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count,
rdc_field_value_f callback, void* user_data) override;
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) override;
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count) override;
rdc_status_t rdc_diag_test_cases_query(
rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
uint32_t* test_case_count) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(
rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
// Run a specific test case
rdc_status_t rdc_test_case_run(rdc_diag_test_cases_t test_case,
uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result) override;
rdc_status_t rdc_diagnostic_run(
const rdc_group_info_t& gpus,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
rdc_status_t rdc_diag_init(uint64_t flags) override;
rdc_status_t rdc_diag_destroy() override;
explicit RdcSmiLib(const RdcMetricFetcherPtr& mf);
explicit RdcSmiLib(const RdcMetricFetcherPtr& mf);
private:
RdcMetricFetcherPtr metric_fetcher_;
bool bulk_fetch_enabled_;
RdcSmiDiagnosticPtr smi_diag_;
RdcMetricFetcherPtr metric_fetcher_;
bool bulk_fetch_enabled_;
RdcSmiDiagnosticPtr smi_diag_;
};
typedef std::shared_ptr<RdcSmiLib> RdcSmiLibPtr;
+56 -69
查看文件
@@ -22,98 +22,85 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_
#define INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_
#include <grpcpp/grpcpp.h>
#include <memory>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc_lib/RdcHandler.h"
namespace amd {
namespace rdc {
class RdcStandaloneHandler: public RdcHandler {
class RdcStandaloneHandler : public RdcHandler {
public:
// Job RdcAPI
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId,
const char job_id[64], uint64_t update_freq) override;
rdc_status_t rdc_job_get_stats(const char jobId[64],
rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64]) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
// Job RdcAPI
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64],
uint64_t update_freq) override;
rdc_status_t rdc_job_get_stats(const char jobId[64], rdc_job_info_t* p_job_info) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64]) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
// Discovery RdcAPI
rdc_status_t rdc_device_get_all(
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) override;
rdc_status_t rdc_device_get_attributes(uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr) override;
// Discovery RdcAPI
rdc_status_t rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES],
uint32_t* count) override;
rdc_status_t rdc_device_get_attributes(uint32_t gpu_index,
rdc_device_attributes_t* p_rdc_attr) override;
// Group RdcAPI
rdc_status_t rdc_group_gpu_create(rdc_group_type_t type,
const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId,
uint32_t gpu_index) override;
rdc_status_t rdc_group_field_create(uint32_t num_field_ids,
rdc_field_t* field_ids, const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) override;
rdc_status_t rdc_group_field_get_info(
rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) override;
rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) override;
rdc_status_t rdc_group_get_all_ids(
rdc_gpu_group_t group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_field_get_all_ids(
rdc_field_grp_t field_group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_gpu_destroy(
rdc_gpu_group_t p_rdc_group_id) override;
rdc_status_t rdc_group_field_destroy(
rdc_field_grp_t rdc_field_group_id) override;
// Group RdcAPI
rdc_status_t rdc_group_gpu_create(rdc_group_type_t type, const char* group_name,
rdc_gpu_group_t* p_rdc_group_id) override;
rdc_status_t rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) override;
rdc_status_t rdc_group_field_create(uint32_t num_field_ids, rdc_field_t* field_ids,
const char* field_group_name,
rdc_field_grp_t* rdc_field_group_id) override;
rdc_status_t rdc_group_field_get_info(rdc_field_grp_t rdc_field_group_id,
rdc_field_group_info_t* field_group_info) override;
rdc_status_t rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
rdc_group_info_t* p_rdc_group_info) override;
rdc_status_t rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[], uint32_t* count) override;
rdc_status_t rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
uint32_t* count) override;
rdc_status_t rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) override;
rdc_status_t rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) override;
// Field RdcAPI
rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples) override;
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index,
rdc_field_t field, rdc_field_value* value) override;
rdc_status_t rdc_field_get_value_since(uint32_t gpu_index,
rdc_field_t field, uint64_t since_time_stamp,
uint64_t *next_since_time_stamp, rdc_field_value* value) override;
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) override;
// Diagnostic API
rdc_status_t rdc_diagnostic_run(
rdc_gpu_group_t group_id,
rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_test_case_run(
rdc_gpu_group_t group_id,
rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) override;
// Field RdcAPI
rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) override;
rdc_status_t rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
rdc_field_value* value) override;
rdc_status_t rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field,
uint64_t since_time_stamp, uint64_t* next_since_time_stamp,
rdc_field_value* value) override;
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
// Diagnostic API
rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level,
rdc_diag_response_t* response) override;
rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case,
rdc_diag_test_result_t* result) override;
// Control RdcAPI
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
// Control RdcAPI
rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override;
explicit RdcStandaloneHandler(const char* ip_and_port,
const char* root_ca, const char* client_cert, const char* client_key);
explicit RdcStandaloneHandler(const char* ip_and_port, const char* root_ca,
const char* client_cert, const char* client_key);
private:
// Helper function to handle the error
rdc_status_t error_handle(::grpc::Status status, uint32_t rdc_status);
// Helper function to handle the error
rdc_status_t error_handle(::grpc::Status status, uint32_t rdc_status);
bool copy_gpu_usage_info(
const ::rdc::GpuUsageInfo& src,
rdc_gpu_usage_info_t* target);
bool copy_gpu_usage_info(const ::rdc::GpuUsageInfo& src, rdc_gpu_usage_info_t* target);
std::unique_ptr<::rdc::RdcAPI::Stub> stub_;
std::unique_ptr<::rdc::RdcAPI::Stub> stub_;
};
} // namespace rdc
} // namespace amd
extern "C" {
amd::rdc::RdcHandler *make_handler(const char* ip_port,
const char* root_ca, const char* client_cert, const char* client_key);
amd::rdc::RdcHandler* make_handler(const char* ip_port, const char* root_ca,
const char* client_cert, const char* client_key);
}
#endif // INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_
+18 -23
查看文件
@@ -22,45 +22,41 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_
#define INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_
#include <map>
#include <list>
#include <vector>
#include <map>
#include <memory>
#include <vector>
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rdc_lib/RdcMetricFetcher.h"
namespace amd {
namespace rdc {
class RdcTelemetryModule : public RdcTelemetry {
public:
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count, rdc_field_value_f callback,
void* user_data);
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count,
rdc_field_value_f callback, void* user_data);
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count);
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count);
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count);
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count);
RdcTelemetryModule(RdcMetricFetcherPtr fetcher);
explicit RdcTelemetryModule(RdcMetricFetcherPtr fetcher);
private:
//< Helper function to dispatch fields to module
void get_fields_for_module(
rdc_gpu_field_t* fields,
uint32_t fields_count,
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>>
& fields_in_module,
std::vector<rdc_gpu_field_value_t>& unsupport_fields); // NOLINT
std::list<RdcTelemetryPtr> telemetry_modules_;
std::map<uint32_t, RdcTelemetryPtr> fields_id_module_;
//< Helper function to dispatch fields to module
void get_fields_for_module(
rdc_gpu_field_t* fields, uint32_t fields_count,
std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>>& fields_in_module,
std::vector<rdc_gpu_field_value_t>& unsupport_fields); // NOLINT
std::list<RdcTelemetryPtr> telemetry_modules_;
std::map<uint32_t, RdcTelemetryPtr> fields_id_module_;
};
typedef std::shared_ptr<RdcTelemetryModule> RdcTelemetryModulePtr;
@@ -68,5 +64,4 @@ typedef std::shared_ptr<RdcTelemetryModule> RdcTelemetryModulePtr;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCTELEMETRYMODULE_H_
+70 -78
查看文件
@@ -22,19 +22,20 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_
#define INCLUDE_RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_
#include <string>
#include <atomic>
#include <map>
#include <vector>
#include <utility>
#include <memory>
#include <mutex> // NOLINT
#include <atomic>
#include "rdc_lib/RdcWatchTable.h"
#include "rdc_lib/RdcGroupSettings.h"
#include <string>
#include <utility>
#include <vector>
#include "rdc_lib/RdcCacheManager.h"
#include "rdc_lib/RdcGroupSettings.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/RdcWatchTable.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
@@ -42,104 +43,95 @@ namespace rdc {
//!< The settings for a field or a group of field in the watch table.
struct FieldSettings {
uint64_t update_freq;
uint32_t max_keep_samples;
double max_keep_age;
bool is_watching;
uint64_t last_update_time;
uint64_t update_freq;
uint32_t max_keep_samples;
double max_keep_age;
bool is_watching;
uint64_t last_update_time;
};
struct JobWatchTableEntry {
uint32_t group_id;
std::vector<RdcFieldKey> fields; //< store fields for faster query
uint32_t group_id;
std::vector<RdcFieldKey> fields; //< store fields for faster query
};
class RdcWatchTableImpl : public RdcWatchTable {
public:
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id,
const char job_id[64], uint64_t update_freq,
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
rdc_status_t rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64],
uint64_t update_freq,
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_job_stop_stats(const char job_id[64],
const rdc_gpu_gauges_t& gpu_gauge) override;
rdc_status_t rdc_job_remove(const char job_id[64]) override;
rdc_status_t rdc_job_remove_all() override;
rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id, uint64_t update_freq,
double max_keep_age, uint32_t max_keep_samples) override;
rdc_status_t rdc_field_watch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
uint64_t update_freq, double max_keep_age,
uint32_t max_keep_samples) override;
//!< rdc_field_unwatch() will not remove the entry from watch_table.
//!< The unwatched entry is still kept until the max_keep_age of the entry
//!< is reached, which will be handled in the clean_up() function.
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id) override;
//!< rdc_field_unwatch() will not remove the entry from watch_table.
//!< The unwatched entry is still kept until the max_keep_age of the entry
//!< is reached, which will be handled in the clean_up() function.
rdc_status_t rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id) override;
//!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will
//!< call this function periodically. Instead of providing other APIs to
//!< cleanup the cache, this function will update and cleanup the cache.
//!<
//!< This function may be called very frequently, and the cache cleanup
//!< is expensive. Internally, this function will throttle the cleanup to
//!< once per second.
rdc_status_t rdc_field_update_all() override;
rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override;
//!< When the RDC is running as RDC_OPERATION_MODE_MANUAL, the user will
//!< call this function periodically. Instead of providing other APIs to
//!< cleanup the cache, this function will update and cleanup the cache.
//!<
//!< This function may be called very frequently, and the cache cleanup
//!< is expensive. Internally, this function will throttle the cleanup to
//!< once per second.
rdc_status_t rdc_field_update_all() override;
rdc_status_t rdc_field_listen_notif(uint32_t timeout_ms) override;
RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
const RdcCacheManagerPtr& cache_mgr,
const RdcModuleMgrPtr& module_mgr,
const RdcNotificationPtr& notif);
RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings, const RdcCacheManagerPtr& cache_mgr,
const RdcModuleMgrPtr& module_mgr, const RdcNotificationPtr& notif);
private:
//!< Helper function to Update the fields_in_table when unwatch tables
rdc_status_t update_field_in_table_when_unwatch(
const RdcFieldGroupKey& entry);
//!< Helper function to Update the fields_in_table when unwatch tables
rdc_status_t update_field_in_table_when_unwatch(const RdcFieldGroupKey& entry);
//!< Helper function to clean up the watch table and cache
void clean_up();
//!< Helper function to clean up the watch table and cache
void clean_up();
//!< Helper function for debug information in watch table and cache
void debug_status();
//!< Helper function for debug information in watch table and cache
void debug_status();
//!< Helper function to get the fields using the group and the field group.
rdc_status_t get_fields_from_group(rdc_gpu_group_t group_id,
rdc_field_grp_t field_group_id,
std::vector<RdcFieldKey> & fields); // NOLINT
//!< Helper function to get the fields using the group and the field group.
rdc_status_t get_fields_from_group(rdc_gpu_group_t group_id, rdc_field_grp_t field_group_id,
std::vector<RdcFieldKey>& fields); // NOLINT
bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
std::string& job_id) const; // NOLINT
bool is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
std::string& job_id) const; // NOLINT
rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t *events,
uint32_t num_events);
//!< The function will be pass as the callback for bulk fetch
static rdc_status_t handle_fields(rdc_gpu_field_value_t* values,
uint32_t num_values, void* user_data);
rdc_status_t rdc_notif_update_cache(rdc_evnt_notification_t* events, uint32_t num_events);
//!< The function will be pass as the callback for bulk fetch
static rdc_status_t handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
void* user_data);
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcModuleMgrPtr rdc_module_mgr_;
RdcNotificationPtr notifications_;
RdcGroupSettingsPtr group_settings_;
RdcCacheManagerPtr cache_mgr_;
RdcModuleMgrPtr rdc_module_mgr_;
RdcNotificationPtr notifications_;
//!< The watch table to store the watch settings.
std::map<RdcFieldGroupKey, FieldSettings> watch_table_;
//!< The watch table to store the watch settings.
std::map<RdcFieldGroupKey, FieldSettings> watch_table_;
//!< <job_id, gpu_group_id> pairs
std::map<std::string, JobWatchTableEntry> job_watch_table_;
//!< <job_id, gpu_group_id> pairs
std::map<std::string, JobWatchTableEntry> job_watch_table_;
//!< The settings for each field can be deduced from watch_table. But every
//!< rdc_field_update_all() call needs to deduce them. To improve the
//!< performance, the fields_to_watch_ is used to track the field settings.
//!< Those settings will only be updated when watching or unwatching.
std::map<RdcFieldKey, FieldSettings> fields_to_watch_;
//!< The settings for each field can be deduced from watch_table. But every
//!< rdc_field_update_all() call needs to deduce them. To improve the
//!< performance, the fields_to_watch_ is used to track the field settings.
//!< Those settings will only be updated when watching or unwatching.
std::map<RdcFieldKey, FieldSettings> fields_to_watch_;
//!< The last clean up time
std::atomic<uint64_t> last_cleanup_time_;
std::mutex watch_mutex_;
//!< The last clean up time
std::atomic<uint64_t> last_cleanup_time_;
std::mutex watch_mutex_;
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCWATCHTABLEIMPL_H_
-1
查看文件
@@ -34,4 +34,3 @@ rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi);
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
+12 -13
查看文件
@@ -28,18 +28,18 @@ THE SOFTWARE.
#include "rdc/rdc.h"
// Log levels, lowest number first (presumably the values passed as the
// debug_level argument of RDC_LOG - confirm against RdcLogger).
#define RDC_ERROR 0
#define RDC_INFO 1
#define RDC_DEBUG 2
// Writes one log record through the logger returned by
// amd::rdc::RdcLogger::getLogger(), but only when should_log(debug_level) is true.
// The record is the header built from debug_level, __FILE__ and __LINE__ followed
// by msg, and std::endl terminates (and flushes) it.
//
// msg is pasted into an ostream insertion chain, so it may itself contain `<<`
// (e.g. RDC_LOG(RDC_INFO, "x=" << x)); for that reason it is not parenthesized.
// The macro-local reference has an RDC_LOG-specific name so that a variable called
// `logger` at the call site, if mentioned inside msg, is not captured by the macro.
#define RDC_LOG(debug_level, msg) \
  do { \
    auto& rdc_log_logger_ = amd::rdc::RdcLogger::getLogger(); \
    if (rdc_log_logger_.should_log((debug_level))) { \
      rdc_log_logger_.get_ostream() \
          << rdc_log_logger_.get_log_header((debug_level), __FILE__, __LINE__) << msg \
          << std::endl; \
    } \
  } while (0)
//! The key to identify the field with <gpu_id, field_id>:
//! `first` is the GPU id, `second` is the field id.
typedef std::pair<uint32_t, rdc_field_t> RdcFieldKey;
@@ -64,7 +64,6 @@ typedef std::map<RdcFieldKey, uint64_t> rdc_gpu_gauges_t;
*
* @retval Return a pointer to the destination string.
*/
char* strncpy_with_null(char* dest, const char* src, size_t n);
#endif // INCLUDE_RDC_LIB_RDC_COMMON_H_
+56 -58
查看文件
@@ -23,6 +23,7 @@ THE SOFTWARE.
#ifndef RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
#define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
#include <rocmtools.h>
#include <chrono>
#include <cstdint>
#include <cstdio>
@@ -30,6 +31,7 @@ THE SOFTWARE.
#include <string>
#include <typeinfo>
#include <unordered_map>
#include "rdc/rdc.h"
namespace amd {
@@ -62,73 +64,69 @@ static const std::unordered_map<rdc_field_t, const char*> counter_map_k = {
/// Common interface for RocP tests and samples
///
/// Keeps one rocmtools session per <gpu, field> pair in `sessions`.
/// NOTE(review): the copy constructor is defaulted while copy assignment and both
/// move operations are deleted - confirm that copying is really intended.
class RdcRocpBase {
  /// Key of the sessions map: <gpu index, field id>.
  typedef std::pair<uint32_t, rdc_field_t> pair_gpu_field_t;
  /// Book-keeping for a single session: its id plus start/stop time stamps.
  typedef struct session_info_t {
    rocmtools_session_id_t id{};
    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> start_time;
    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> stop_time;
  } session_info_t;

 public:
  RdcRocpBase();
  RdcRocpBase(const RdcRocpBase&) = default;
  RdcRocpBase(RdcRocpBase&&) = delete;
  RdcRocpBase& operator=(const RdcRocpBase&) = delete;
  RdcRocpBase& operator=(RdcRocpBase&&) = delete;
  ~RdcRocpBase();

  /**
   * @brief Lookup ROCProfiler counter
   *
   * @param[in] gpu_field An existing field already added to sessions dictionary
   * @param[out] value A pointer that will be populated with returned value
   *
   * @return Status as rdc_status_t. NOTE(review): this comment used to name
   * ::ROCMTOOLS_STATUS_SUCCESS, which is a rocmtools status rather than an
   * rdc_status_t value - confirm the success value in the implementation.
   */
  rdc_status_t rocp_lookup(pair_gpu_field_t gpu_field, double* value);

  /**
   * @brief Create ROCmTools session responsible for monitoring a given
   * field
   *
   * @details While rocmtools supports multiple fields per ID - it has a
   * limit to how many counters it can query internally.
   * To avoid concerning ourselves with said limit, we limit each session to
   * 1 field.
   * In the future this can be optimized to allow for multiple fields per
   * session.
   *
   * @param[in] gpu_field A field to start monitoring
   *
   * @return Status as rdc_status_t (see the note on rocp_lookup()).
   */
  rdc_status_t create_session(pair_gpu_field_t gpu_field);

  /**
   * @brief Destroy ROCmTools session responsible for monitoring a given
   * field
   *
   * @param[in] gpu_field A field to stop monitoring
   *
   * @return Status as rdc_status_t (see the note on rocp_lookup()).
   */
  rdc_status_t destroy_session(pair_gpu_field_t gpu_field);

 private:
  /// One session per monitored <gpu, field> pair.
  std::map<pair_gpu_field_t, session_info_t> sessions;

  /**
   * @brief Convert from rocmtools status into RDC status
   */
  rdc_status_t Rocp2RdcError(rocmtools_status_t rocm_status);
};
} // namespace rdc
+9 -12
查看文件
@@ -22,8 +22,8 @@ THE SOFTWARE.
#ifndef RDC_MODULES_RDC_ROCR_COMPUTEQUEUETEST_H_
#define RDC_MODULES_RDC_ROCR_COMPUTEQUEUETEST_H_
#include "rdc_modules/rdc_rocr/TestBase.h"
#include "hsa/hsa.h"
#include "rdc_modules/rdc_rocr/TestBase.h"
namespace amd {
namespace rdc {
@@ -65,11 +65,10 @@ typedef struct BinarySearch {
// Other items we need to populate AQL packet
uint64_t kernel_object;
uint32_t group_segment_size; ///< Kernel group seg size
uint32_t private_segment_size; ///< Kernel private seg size
uint32_t group_segment_size; ///< Kernel group seg size
uint32_t private_segment_size; ///< Kernel private seg size
} BinarySearch;
class ComputeQueueTest : public TestBase {
public:
explicit ComputeQueueTest(uint32_t gpu_index);
@@ -101,14 +100,12 @@ class ComputeQueueTest : public TestBase {
hsa_status_t LoadKernelFromObjFile(BinarySearch* bs);
hsa_status_t Run(BinarySearch* bs);
hsa_status_t CleanUp(BinarySearch* bs);
// Helpers that are only declared here; each one operates on the BinarySearch
// state and/or the HSA handles passed in.
void PopulateAQLPacket(BinarySearch const* bs, hsa_kernel_dispatch_packet_t* aql);
hsa_status_t AgentMemcpy(void* dst, const void* src, size_t size, hsa_agent_t dst_ag,
                         hsa_agent_t src_ag);
hsa_status_t AllocAndSetKernArgs(BinarySearch* bs, void* args, size_t arg_size,
                                 void** aql_buf_ptr);
void WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql, hsa_queue_t* q);
};
} // namespace rdc
+3 -7
查看文件
@@ -23,9 +23,8 @@ THE SOFTWARE.
#ifndef RDC_MODULES_RDC_ROCR_MEMORYACCESS_H_
#define RDC_MODULES_RDC_ROCR_MEMORYACCESS_H_
#include "rdc_modules/rdc_rocr/TestBase.h"
#include "hsa/hsa.h"
#include "rdc_modules/rdc_rocr/TestBase.h"
namespace amd {
namespace rdc {
@@ -51,18 +50,15 @@ class MemoryAccessTest : public TestBase {
// @Brief: Display information about what this test does
virtual void DisplayTestInfo(void);
// @Brief: This test verifies that the CPU is able to read and write GPU memory
void CPUAccessToGPUMemoryTest(void);
// @Brief: This test verifies that the GPU is able to read and write CPU memory
void GPUAccessToCPUMemoryTest(void);
private:
// Overload of the public CPUAccessToGPUMemoryTest(void) that takes the agents and
// the memory pool explicitly.
void CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent,
                              hsa_amd_memory_pool_t pool);
void GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent);
};
+2 -3
查看文件
@@ -22,8 +22,8 @@ THE SOFTWARE.
#ifndef RDC_MODULES_RDC_ROCR_MEMORYTEST_H_
#define RDC_MODULES_RDC_ROCR_MEMORYTEST_H_
#include "rdc_modules/rdc_rocr/TestBase.h"
#include "hsa/hsa.h"
#include "rdc_modules/rdc_rocr/TestBase.h"
namespace amd {
namespace rdc {
@@ -54,8 +54,7 @@ class MemoryTest : public TestBase {
hsa_status_t TestAllocate(hsa_amd_memory_pool_t pool, size_t sz);
private:
hsa_status_t MaxSingleAllocationTest(hsa_agent_t ag, hsa_amd_memory_pool_t pool);
};
} // namespace rdc
-1
查看文件
@@ -24,5 +24,4 @@ THE SOFTWARE.
#include "rdc/rdc.h"
#include "rdc_lib/RdcDiagnosticLibInterface.h"
#endif // RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_
+75 -165
查看文件
@@ -24,10 +24,12 @@ THE SOFTWARE.
#define RDC_MODULES_RDC_ROCR_RDCROCRBASE_H_
#include <stdint.h>
#include <stdio.h>
#include <string>
#include "rdc_lib/RdcPerfTimer.h"
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
#include "rdc_lib/RdcPerfTimer.h"
namespace amd {
namespace rdc {
@@ -41,226 +43,134 @@ class RdcRocrBase {
///< Setters and Getters
void set_gpu_device1(hsa_agent_t in_dev) {
gpu_device1_.handle = in_dev.handle;
}
hsa_agent_t* gpu_device1(void) {
return &gpu_device1_;
}
void set_gpu_device1(hsa_agent_t in_dev) { gpu_device1_.handle = in_dev.handle; }
hsa_agent_t* gpu_device1(void) { return &gpu_device1_; }
void set_cpu_device(hsa_agent_t in_dev) {
cpu_device_.handle = in_dev.handle;
}
hsa_agent_t* cpu_device(void) {
return &cpu_device_;
}
void set_cpu_device(hsa_agent_t in_dev) { cpu_device_.handle = in_dev.handle; }
hsa_agent_t* cpu_device(void) { return &cpu_device_; }
void set_kernel_file_name(const char* in_file_name) {
kernel_file_name_ = in_file_name;
}
std::string const kernel_file_name(void) const {
return kernel_file_name_;
}
void set_kernel_file_name(const char* in_file_name) { kernel_file_name_ = in_file_name; }
std::string const kernel_file_name(void) const { return kernel_file_name_; }
void set_kernel_name(std::string in_kernel_name) {
kernel_name_ = in_kernel_name;
}
std::string const kernel_name(void) const {
return kernel_name_;
}
void set_kernel_name(std::string in_kernel_name) { kernel_name_ = in_kernel_name; }
std::string const kernel_name(void) const { return kernel_name_; }
void set_agent_name(std::string in_agent_name) {
agent_name_ = in_agent_name;
}
void set_agent_name(std::string in_agent_name) { agent_name_ = in_agent_name; }
std::string const get_agent_name(void) const {
return agent_name_;
}
std::string const get_agent_name(void) const { return agent_name_; }
void set_kernel_object(uint64_t in_kernel_object) {
kernel_object_ = in_kernel_object;
}
uint64_t kernel_object(void) const {
return kernel_object_;
}
void set_kernel_object(uint64_t in_kernel_object) { kernel_object_ = in_kernel_object; }
uint64_t kernel_object(void) const { return kernel_object_; }
void set_profile(hsa_profile_t in_prof) {
profile_ = in_prof;
}
hsa_profile_t profile(void) const {
return profile_;
}
void set_profile(hsa_profile_t in_prof) { profile_ = in_prof; }
hsa_profile_t profile(void) const { return profile_; }
uint32_t private_segment_size(void) const {
return private_segment_size_;
}
void set_private_segment_size(uint32_t sz) {
private_segment_size_ = sz;
}
uint32_t private_segment_size(void) const { return private_segment_size_; }
void set_private_segment_size(uint32_t sz) { private_segment_size_ = sz; }
void set_group_segment_size(uint32_t sz) {
group_segment_size_ = sz;
}
uint32_t group_segment_size(void) const {
return group_segment_size_;
}
void set_group_segment_size(uint32_t sz) { group_segment_size_ = sz; }
uint32_t group_segment_size(void) const { return group_segment_size_; }
void set_group_size(uint32_t sz) {
group_size_ = sz;
}
uint32_t group_size(void) const {
return group_size_;
}
void set_group_size(uint32_t sz) { group_size_ = sz; }
uint32_t group_size(void) const { return group_size_; }
void set_main_queue(hsa_queue_t* q) {
main_queue_ = q;
}
hsa_queue_t* main_queue(void) const {
return main_queue_;
}
void set_main_queue(hsa_queue_t* q) { main_queue_ = q; }
hsa_queue_t* main_queue(void) const { return main_queue_; }
hsa_kernel_dispatch_packet_t& aql(void) {
return aql_;
}
hsa_kernel_dispatch_packet_t& aql(void) { return aql_; }
void set_num_iteration(int num) {
num_iteration_ = num;
}
uint32_t num_iteration(void) const {
return num_iteration_;
}
void set_num_iteration(int num) { num_iteration_ = num; }
uint32_t num_iteration(void) const { return num_iteration_; }
hsa_amd_memory_pool_t& device_pool(void) {
return device_pool_;
}
hsa_amd_memory_pool_t& device_pool(void) { return device_pool_; }
hsa_amd_memory_pool_t& cpu_pool(void) {
return cpu_pool_;
}
hsa_amd_memory_pool_t& cpu_pool(void) { return cpu_pool_; }
hsa_amd_memory_pool_t& kern_arg_pool(void) {
return kern_arg_pool_;
}
hsa_amd_memory_pool_t& kern_arg_pool(void) { return kern_arg_pool_; }
void set_kernarg_size(uint32_t sz) {
kernarg_size_ = sz;
}
uint32_t kernarg_size(void) const {
return kernarg_size_;
}
void set_kernarg_size(uint32_t sz) { kernarg_size_ = sz; }
uint32_t kernarg_size(void) const { return kernarg_size_; }
void set_kernarg_align(uint32_t align) {
kernarg_align_ = align;
}
uint32_t kernarg_align(void) const {
return kernarg_align_;
}
void set_kernarg_align(uint32_t align) { kernarg_align_ = align; }
uint32_t kernarg_align(void) const { return kernarg_align_; }
void* kernarg_buffer(void) const {
return kernarg_buffer_;
}
void set_kernarg_buffer(void* buffer) {
kernarg_buffer_ = buffer;
}
void* kernarg_buffer(void) const { return kernarg_buffer_; }
void set_kernarg_buffer(void* buffer) { kernarg_buffer_ = buffer; }
int32_t requires_profile(void) const {
return requires_profile_;
}
int32_t requires_profile(void) const { return requires_profile_; }
char* orig_hsa_enable_interrupt() const {
return orig_hsa_enable_interrupt_;
}
char* orig_hsa_enable_interrupt() const { return orig_hsa_enable_interrupt_; }
bool enable_interrupt() const {
return enable_interrupt_;
}
bool enable_interrupt() const { return enable_interrupt_; }
void set_title(std::string name) {
title_ = name;
}
std::string title(void) const {
return title_;
}
void set_title(std::string name) { title_ = name; }
std::string title(void) const { return title_; }
RdcPerfTimer* hsa_timer(void) {
return &hsa_timer_;
}
RdcPerfTimer* hsa_timer(void) { return &hsa_timer_; }
void set_verbosity(uint32_t v) {
verbosity_ = v;
}
uint32_t verbosity(void) const {
return verbosity_;
}
void set_verbosity(uint32_t v) { verbosity_ = v; }
uint32_t verbosity(void) const { return verbosity_; }
void set_monitor_verbosity(uint32_t m) {
monitor_verbosity_ = m;
}
uint32_t monitor_verbosity(void) const {
return monitor_verbosity_;
}
void set_monitor_verbosity(uint32_t m) { monitor_verbosity_ = m; }
uint32_t monitor_verbosity(void) const { return monitor_verbosity_; }
protected:
// Setters for requires_profile_ and enable_interrupt_; the matching getters are
// requires_profile() and enable_interrupt().
void set_requires_profile(int32_t reqd_prof) { requires_profile_ = reqd_prof; }

void set_enable_interrupt(bool doEnable) { enable_interrupt_ = doEnable; }
private:
uint64_t num_iteration_; ///< Number of times to execute test
uint64_t num_iteration_; ///< Number of times to execute test
hsa_queue_t* main_queue_; ///< AQL queue used for packets
hsa_queue_t* main_queue_; ///< AQL queue used for packets
hsa_agent_t gpu_device1_; ///< Handle to first GPU found
hsa_agent_t gpu_device1_; ///< Handle to first GPU found
hsa_agent_t cpu_device_; ///< Handle to CPU
hsa_agent_t cpu_device_; ///< Handle to CPU
hsa_amd_memory_pool_t device_pool_; ///< Memory pool on gpu pool list
hsa_amd_memory_pool_t device_pool_; ///< Memory pool on gpu pool list
hsa_amd_memory_pool_t cpu_pool_; ///< Memory pool on cpu pool list
hsa_amd_memory_pool_t cpu_pool_; ///< Memory pool on cpu pool list
hsa_amd_memory_pool_t kern_arg_pool_; ///< Memory pool suitable for args
hsa_amd_memory_pool_t kern_arg_pool_; ///< Memory pool suitable for args
uint64_t kernel_object_; ///< Handle to kernel code
uint64_t kernel_object_; ///< Handle to kernel code
std::string kernel_file_name_; ///< Code object file name
std::string kernel_file_name_; ///< Code object file name
std::string kernel_name_; ///< Kernel name
std::string kernel_name_; ///< Kernel name
std::string agent_name_; ///< Agent name
std::string agent_name_; ///< Agent name
hsa_kernel_dispatch_packet_t aql_; ///< Kernel dispatch packet
hsa_kernel_dispatch_packet_t aql_; ///< Kernel dispatch packet
uint32_t group_segment_size_; ///< Kernel group seg size
uint32_t group_segment_size_; ///< Kernel group seg size
uint32_t kernarg_size_; ///< Kernarg memory size
uint32_t kernarg_size_; ///< Kernarg memory size
uint32_t kernarg_align_; ///< Alignment for kern argument memory
uint32_t kernarg_align_; ///< Alignment for kern argument memory
void* kernarg_buffer_; ///< Unaligned allocated kernel arg. buffer
void* kernarg_buffer_; ///< Unaligned allocated kernel arg. buffer
hsa_profile_t profile_; ///< Device profile.
hsa_profile_t profile_; ///< Device profile.
uint32_t group_size_; ///< Number of work items in one group
uint32_t group_size_; ///< Number of work items in one group
uint32_t private_segment_size_; ///< Kernel private seg size
uint32_t private_segment_size_; ///< Kernel private seg size
int32_t requires_profile_; ///< Profile required by test (-1 if no req.)
int32_t requires_profile_; ///< Profile required by test (-1 if no req.)
char* orig_hsa_enable_interrupt_; ///< Orig. value of HSA_ENABLE_INTERRUPT
char* orig_hsa_enable_interrupt_; ///< Orig. value of HSA_ENABLE_INTERRUPT
bool enable_interrupt_; ///< Whether to enable/disable interrupts for test
bool enable_interrupt_; ///< Whether to enable/disable interrupts for test
std::string title_; ///< Displayed title of test
std::string title_; ///< Displayed title of test
uint32_t verbosity_; ///< How much additional output to produce
uint32_t verbosity_; ///< How much additional output to produce
uint32_t monitor_verbosity_; ///< verbose or not
uint32_t monitor_verbosity_; ///< verbose or not
RdcPerfTimer hsa_timer_; ///< Timer to be used for timing parts of test
RdcPerfTimer hsa_timer_; ///< Timer to be used for timing parts of test
};
} // namespace rdc
+6 -5
查看文件
@@ -22,9 +22,10 @@ THE SOFTWARE.
#ifndef RDC_MODULES_RDC_ROCR_TESTBASE_H_
#define RDC_MODULES_RDC_ROCR_TESTBASE_H_
#include <string>
#include <memory>
#include <string>
#include <vector>
#include "rdc_modules/rdc_rocr/RdcRocrBase.h"
namespace amd {
@@ -36,7 +37,7 @@ class TestBase : public RdcRocrBase {
virtual ~TestBase(void);
// Output verbosity levels, in increasing order starting at 0.
enum VerboseLevel { VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS };
// @Brief: Before run the core measure codes, do something to set up
// i.e. init runtime, prepare packet...
@@ -54,12 +55,12 @@ class TestBase : public RdcRocrBase {
// @Brief: Display information about the test
virtual void DisplayTestInfo(void);
// Read-only access to the stored strings. The getters return references to the
// members, so the references are valid only as long as this object is alive.
const std::string& description() const { return description_; }
void set_description(std::string d);

const std::string& get_gpu_info() const { return gpu_info_; }
const std::string& get_per_gpu_info() const { return per_gpu_info_; }
hsa_status_t FindGPUIndex(hsa_agent_t agent, void* data);
// Return the agent by GPU index in rocm_smi
+15 -19
查看文件
@@ -26,10 +26,11 @@ THE SOFTWARE.
/// \file
/// Prototypes of utility functions that act on RdcRocrBase objects.
#include "rdc_modules/rdc_rocr/RdcRocrBase.h"
#include <string>
#include "rdc_modules/rdc_rocr/common.h"
#include "hsa/hsa.h"
#include "rdc_modules/rdc_rocr/RdcRocrBase.h"
#include "rdc_modules/rdc_rocr/common.h"
namespace amd {
namespace rdc {
@@ -58,8 +59,7 @@ hsa_status_t SetDefaultAgents(RdcRocrBase* test);
/// \param[in] do_profile [Optional] Specify whether profiled queue should
/// be created
/// \returns HSA_STATUS_SUCCESS if no errors encountered
hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, uint32_t num_pkts = 0);
/// This function sets some reasonable default values for an AQL packet.
/// Override any field as necessary after calling this function.
@@ -68,18 +68,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue,
/// \param[inout] aql Caller provided pointer to aql packet that will be
/// populated
/// \returns Appropriate hsa_status_t
hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, hsa_kernel_dispatch_packet_t* aql);
/// This function writes all of the aql packet fields to the queue besides
/// "setup" and "header". This assumes all the aql fields have been set
/// appropriately.
/// \param[in] test Test containing the queue and aql packet to be written.
/// \returns Pointer to dispatch packet in queue that was written to
hsa_kernel_dispatch_packet_t* WriteAQLToQueue(RdcRocrBase* test, uint64_t* ind);
/// NOTE(review): undocumented in this header; judging by the signature it writes
/// aql_pkt into queue at position indx - confirm against the implementation.
void WriteAQLToQueueLoc(hsa_queue_t* queue, uint64_t indx, hsa_kernel_dispatch_packet_t* aql_pkt);
/// This function writes the first 32 bits of an aql packet to the provided
/// aql packet. This function is meant to be called immediately before
/// ringing door_bell signal.
@@ -89,9 +87,9 @@ void WriteAQLToQueueLoc(hsa_queue_t *queue, uint64_t indx,
/// be written
/// \returns void
inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
                                  hsa_kernel_dispatch_packet_t* queue_packet) {
  // Build the 32-bit word in unsigned arithmetic. Without the casts both uint16_t
  // operands are promoted to (signed) int, and a setup value with bit 15 set would
  // be shifted into the sign bit.
  const uint32_t header_and_setup =
      static_cast<uint32_t>(header) | (static_cast<uint32_t>(setup) << 16);
  // header occupies the low 16 bits and setup the high 16 bits of the first word of
  // the packet; the store uses release ordering.
  __atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet), header_and_setup,
                   __ATOMIC_RELEASE);
}
/// Perform common operations to clean up after executing a test. Specifically,
@@ -121,8 +119,7 @@ bool CheckProfile(RdcRocrBase const* test);
/// \param arg_size Size of the kernel arg data (including padding) to be
/// written
/// \returns HSA_STATUS_SUCCESS if no errors
hsa_status_t AllocAndSetKernArgs(RdcRocrBase* test, void* args, size_t arg_size);
/// Verify that the machine running the test has the required profile.
/// This function will verify that the execution machine meets any specific
@@ -149,8 +146,9 @@ hsa_status_t SetPoolsTypical(RdcRocrBase* test);
/// \param[in] test Test that has handles to cpu and gpu agents that can own
/// either source or destination of fill
/// \returns HSA_STATUS_SUCCESS if no errors
hsa_status_t hsa_memory_fill_workaround_gen(void* ptr, uint32_t value, size_t count,
                                            hsa_agent_t dst_ag, hsa_agent_t src_ag,
                                            RdcRocrBase* test);
/// Get the library directory which is loaded by current process.
/// It will search /proc/self/maps for it.
@@ -162,11 +160,9 @@ std::string get_app_dir();
// Search multiple folder for the hsaco file
// Return empty if cannot find it.
std::string search_hsaco_full_path(const char* hsaco_file_name, const char* agent_name);
} // namespace rdc
} // namespace amd
#endif // RDC_MODULES_RDC_ROCR_BASE_ROCR_UTILS_H_
+35 -40
查看文件
@@ -28,12 +28,13 @@ THE SOFTWARE.
#include <stdio.h>
#include <string.h>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <string>
#include <memory>
#include <string>
#include <vector>
#include "hsa/hsa.h"
#include "hsa/hsa_ext_amd.h"
@@ -45,13 +46,13 @@ namespace rdc {
#define ALIGNED_(x) __declspec(align(x))
#else
#if defined(__GNUC__)
#define ALIGNED_(x) __attribute__((aligned(x)))
#endif // __GNUC__
#endif // _MSC_VER
// Turns its whole argument list into one string literal (preprocessor
// stringification of __VA_ARGS__).
#define MULTILINE(...) #__VA_ARGS__
// Evaluates to the result of comparing a and b with ==. Despite the name it does
// not abort or report anything. Both arguments are parenthesized so that operands
// containing lower-precedence operators (e.g. `x & mask`) compare as written.
#define ASSERT_EQ(a, b) ((a) == (b))
void SetEnv(const char* env_var_name, const char* env_var_value);
intptr_t AlignDown(intptr_t value, size_t alignment);
@@ -66,39 +67,35 @@ void* AlignUp(void* value, size_t alignment);
// related calls, and is later used for reference when displaying the
// information.
typedef struct pool_info_t_ {
  // NOTE(review): the fields presumably mirror the HSA_AMD_MEMORY_POOL_INFO_*
  // attributes of a pool - confirm against AcquirePoolInfo().
  uint32_t segment;
  size_t size;
  bool alloc_allowed;
  size_t alloc_granule;
  size_t alloc_alignment;
  bool accessible_by_all;
  uint32_t global_flag;
  uint64_t aggregate_alloc_max;

  /// Member-wise equality: true only when every field of `a` equals the
  /// corresponding field of this object. Declared const so that const objects can
  /// appear on the left-hand side of ==.
  inline bool operator==(const pool_info_t_& a) const {
    return a.segment == segment && a.size == size && a.alloc_allowed == alloc_allowed &&
           a.alloc_granule == alloc_granule && a.alloc_alignment == alloc_alignment &&
           a.accessible_by_all == accessible_by_all &&
           a.aggregate_alloc_max == aggregate_alloc_max && a.global_flag == global_flag;
  }
} pool_info_t;
/// An agent together with a list of memory pools.
struct agent_pools_t {
  hsa_agent_t agent;
  std::vector<hsa_amd_memory_pool_t> pools;
};
/// Fill in the pool_info_t structure for the provided pool.
/// \param[in] pool Pool for which information will be retrieved
/// \param[out] pool_i Pointer to structure where pool info will be stored
/// \returns HSA_STATUS_SUCCESS if no errors are encountered.
hsa_status_t AcquirePoolInfo(hsa_amd_memory_pool_t pool, pool_info_t* pool_i);
/// If the provided agent is associated with a GPU, return that agent through
/// output parameter. This function is meant to be the call-back function used
@@ -128,7 +125,7 @@ hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data);
/// \param[out] data If agent is associated with a CPU, this pointer will point
/// to the agent upon return
/// \returns HSA_STATUS_SUCCESS if no errors are encountered.
hsa_status_t IterateCPUAgents(hsa_agent_t agent, void* data);
/// If the provided agent is associated with a GPU, return that agent through
/// output parameter. This function is meant to be the call-back function used
@@ -137,7 +134,7 @@ hsa_status_t IterateCPUAgents(hsa_agent_t agent, void *data);
/// \param[out] data If agent is associated with a GPU, this pointer will point
/// to the agent upon return
/// \returns HSA_STATUS_SUCCESS if no errors are encountered.
hsa_status_t IterateGPUAgents(hsa_agent_t agent, void* data);
/// Find a GLOBAL memory pool. By this, we mean not a kernel args pool.
/// This function is meant to be the call-back function used
@@ -163,7 +160,6 @@ hsa_status_t GetGlobalMemoryPool(hsa_amd_memory_pool_t pool, void* data);
/// -else return an appropriate error code for any error encountered
hsa_status_t GetKernArgMemoryPool(hsa_amd_memory_pool_t pool, void* data);
/// Find a "standard" pool. By this, we mean not a kernel args pool.
/// The pool found will have the following properties:
/// HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL: Don't care
@@ -201,16 +197,14 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data);
/// \param[in] pool_i Pool information to dump
/// \param[in] indent Number of spaces to indent output.
/// \returns hsa_status_t HSA_STATUS_SUCCESS if no errors
hsa_status_t DumpMemoryPoolInfo(const pool_info_t* pool_i, uint32_t indent = 0);
/// Dump information about a provided pointer to STDOUT.
/// \param[in] ptr Pointer about which information is dumped.
/// \returns HSA_STATUS_SUCCESS if there are no errors
hsa_status_t DumpPointerInfo(void* ptr);
/// NOTE(review): presumably appends one agent_pools_t entry per discovered agent to
/// \p agent_pools - confirm against the implementation, which is not visible here.
/// \param[out] agent_pools Vector receiving shared pointers to agent/pool records
/// \returns hsa_status_t status code of the operation
hsa_status_t GetAgentPools(std::vector<std::shared_ptr<agent_pools_t>>* agent_pools);
void throw_if_error(hsa_status_t err, const std::string& msg = "");
@@ -219,10 +213,11 @@ void throw_if_skip(const std::string& msg);
// Custom exception thrown when a test has to be skipped.
class SkipException : public std::exception {
 public:
  /// \param[in] msg Reason for skipping. The text is copied; a null pointer is treated as an
  /// empty message instead of being handed to std::string (which would be undefined behavior).
  explicit SkipException(const char* msg) : _msg(msg != nullptr ? msg : "") {}

  /// \returns the stored skip reason; the pointer stays valid while this object is alive.
  const char* what() const noexcept override { return _msg.c_str(); }

 private:
  std::string _msg;
};
} // namespace rdc
+252 -286
查看文件
@@ -21,404 +21,370 @@ THE SOFTWARE.
*/
#include <dlfcn.h>
#include <string.h>
#include <map>
#include "common/rdc_fields_supported.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcHandler.h"
#include "rdc_lib/RdcLibraryLoader.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLibraryLoader.h"
static amd::rdc::RdcLibraryLoader rdc_lib_loader;
// Library initialization entry point. The argument is ignored and the call always
// reports RDC_ST_OK.
rdc_status_t rdc_init(uint64_t) { return RDC_ST_OK; }
// Unloads whichever backend library the file-level loader currently holds and returns
// the loader's status.
rdc_status_t rdc_shutdown() { return rdc_lib_loader.unload(); }
// Loads librdc_client.so and asks its factory function for a handler connected to ipAddress.
// The three certificate arguments are forwarded to the factory unchanged.
// Returns RDC_ST_FAIL_LOAD_MODULE when ipAddress or p_rdc_handle is null, the loader's status
// when loading fails (in which case *p_rdc_handle is set to nullptr), and RDC_ST_OK otherwise.
rdc_status_t rdc_connect(const char* ipAddress, rdc_handle_t* p_rdc_handle, const char* root_ca,
                         const char* client_cert, const char* client_key) {
  using MakeHandlerFn = amd::rdc::RdcHandler* (*)(const char*, const char*, const char*,
                                                  const char*);
  MakeHandlerFn func_make_handler = nullptr;

  if (!ipAddress || !p_rdc_handle) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  const rdc_status_t status = rdc_lib_loader.load("librdc_client.so", &func_make_handler);
  if (status != RDC_ST_OK) {
    *p_rdc_handle = nullptr;
    return status;
  }

  *p_rdc_handle =
      static_cast<rdc_handle_t>(func_make_handler(ipAddress, root_ca, client_cert, client_key));
  return RDC_ST_OK;
}
// Destroys the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, RDC_ST_OK otherwise.
// The handle is passed by value, so the caller's copy cannot be reset here and must not be
// used again after this call.
rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  delete static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return RDC_ST_OK;
}
// Loads librdc.so and asks its factory function for an embedded handler running in op_mode.
// Returns RDC_ST_FAIL_LOAD_MODULE when p_rdc_handle is null, the loader's status when loading
// fails (in which case *p_rdc_handle is set to nullptr), and RDC_ST_OK otherwise.
rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, rdc_handle_t* p_rdc_handle) {
  using MakeHandlerFn = amd::rdc::RdcHandler* (*)(rdc_operation_mode_t);
  MakeHandlerFn func_make_handler = nullptr;

  if (!p_rdc_handle) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  const rdc_status_t status = rdc_lib_loader.load("librdc.so", &func_make_handler);
  if (status != RDC_ST_OK) {
    *p_rdc_handle = nullptr;
    return status;
  }

  *p_rdc_handle = static_cast<rdc_handle_t>(func_make_handler(op_mode));
  return RDC_ST_OK;
}
// Destroys the embedded handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, RDC_ST_OK otherwise.
// The handle is passed by value, so the caller's copy cannot be reset here and must not be
// used again after this call.
rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  delete static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return RDC_ST_OK;
}
// Forwards to RdcHandler::rdc_field_update_all on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, uint32_t wait_for_update) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_field_update_all(wait_for_update);
}
// Forwards to RdcHandler::rdc_job_get_stats on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, const char job_id[64],
                               rdc_job_info_t* p_job_info) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_job_get_stats(job_id, p_job_info);
}
// Forwards to RdcHandler::rdc_job_start_stats on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, rdc_gpu_group_t groupId,
                                 const char job_id[64], uint64_t update_freq) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_job_start_stats(groupId, job_id, update_freq);
}
// Forwards to RdcHandler::rdc_job_remove on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, const char job_id[64]) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_job_remove(job_id);
}
// Forwards to RdcHandler::rdc_job_remove_all on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_job_remove_all();
}
// Forwards to RdcHandler::rdc_job_stop_stats on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, const char job_id[64]) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_job_stop_stats(job_id);
}
// Forwards to RdcHandler::rdc_group_gpu_create on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, rdc_group_type_t type,
                                  const char* group_name, rdc_gpu_group_t* p_rdc_group_id) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_gpu_create(type, group_name, p_rdc_group_id);
}
// Forwards to RdcHandler::rdc_group_gpu_add on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t groupId,
                               uint32_t gpuIndex) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_gpu_add(groupId, gpuIndex);
}
// Forwards to RdcHandler::rdc_device_get_all on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle or count is null, otherwise the handler's
// status.
rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
                                uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
  if (!p_rdc_handle || !count) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_device_get_all(gpu_index_list, count);
}
// Forwards to RdcHandler::rdc_device_get_attributes on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle or p_rdc_attr is null, otherwise the
// handler's status.
rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
                                       rdc_device_attributes_t* p_rdc_attr) {
  if (!p_rdc_handle || !p_rdc_attr) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_device_get_attributes(gpu_index, p_rdc_attr);
}
// Forwards to RdcHandler::rdc_group_field_create on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle or any of the three pointer arguments is
// null, otherwise the handler's status.
rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, uint32_t num_field_ids,
                                    rdc_field_t* field_ids, const char* field_group_name,
                                    rdc_field_grp_t* rdc_field_group_id) {
  if (!p_rdc_handle || !field_ids || !field_group_name || !rdc_field_group_id) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_field_create(num_field_ids, field_ids, field_group_name,
                                         rdc_field_group_id);
}
// Forwards to RdcHandler::rdc_group_field_get_info on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id,
                                      rdc_field_group_info_t* field_group_info) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_field_get_info(rdc_field_group_id, field_group_info);
}
// Forwards to RdcHandler::rdc_group_gpu_get_info on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle or p_rdc_group_info is null, otherwise the
// handler's status.
rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id,
                                    rdc_group_info_t* p_rdc_group_info) {
  if (!p_rdc_handle || !p_rdc_group_info) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_gpu_get_info(p_rdc_group_id, p_rdc_group_info);
}
// Forwards to RdcHandler::rdc_group_get_all_ids on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle or count is null, otherwise the handler's
// status.
rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id_list[],
                                   uint32_t* count) {
  if (!p_rdc_handle || !count) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_get_all_ids(group_id_list, count);
}
// Forwards to RdcHandler::rdc_group_field_get_all_ids on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle or count is null, otherwise the handler's
// status.
rdc_status_t rdc_group_field_get_all_ids(rdc_handle_t p_rdc_handle,
                                         rdc_field_grp_t field_group_id_list[], uint32_t* count) {
  if (!p_rdc_handle || !count) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_field_get_all_ids(field_group_id_list, count);
}
// Forwards to RdcHandler::rdc_field_watch on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                             rdc_field_grp_t field_group_id, uint64_t update_freq,
                             double max_keep_age, uint32_t max_keep_samples) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_field_watch(group_id, field_group_id, update_freq, max_keep_age,
                                  max_keep_samples);
}
// Forwards to RdcHandler::rdc_field_get_latest_value on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle or value is null, otherwise the handler's
// status.
rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
                                        rdc_field_t field, rdc_field_value* value) {
  if (!p_rdc_handle || !value) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_field_get_latest_value(gpu_index, field, value);
}
// Forwards to RdcHandler::rdc_field_get_value_since on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER when the handle, next_since_time_stamp or value is null,
// otherwise the handler's status.
rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
                                       rdc_field_t field, uint64_t since_time_stamp,
                                       uint64_t* next_since_time_stamp, rdc_field_value* value) {
  if (!p_rdc_handle || !next_since_time_stamp || !value) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_field_get_value_since(gpu_index, field, since_time_stamp,
                                            next_since_time_stamp, value);
}
// Forwards to RdcHandler::rdc_field_unwatch on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                               rdc_field_grp_t field_group_id) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_field_unwatch(group_id, field_group_id);
}
// Forwards to RdcHandler::rdc_group_gpu_destroy on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_gpu_destroy(p_rdc_group_id);
}
// Forwards to RdcHandler::rdc_group_field_destroy on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle,
                                     rdc_field_grp_t rdc_field_group_id) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_group_field_destroy(rdc_field_group_id);
}
// Forwards to RdcHandler::rdc_diagnostic_run on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                                rdc_diag_level_t level, rdc_diag_response_t* response) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_diagnostic_run(group_id, level, response);
}
// Forwards to RdcHandler::rdc_test_case_run on the handler behind p_rdc_handle.
// Returns RDC_ST_INVALID_HANDLER for a null handle, otherwise the handler's status.
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
                               rdc_diag_test_cases_t test_case, rdc_diag_test_result_t* result) {
  if (!p_rdc_handle) {
    return RDC_ST_INVALID_HANDLER;
  }

  auto* handler = static_cast<amd::rdc::RdcHandler*>(p_rdc_handle);
  return handler->rdc_test_case_run(group_id, test_case, result);
}
// Maps a status code to a static, human-readable description.
// Codes without an explicit case yield "Unknown". The returned pointer refers to a string
// literal and must not be freed.
const char* rdc_status_string(rdc_status_t result) {
  switch (result) {
    case RDC_ST_OK:
      return "Success";
    case RDC_ST_NOT_SUPPORTED:
      return "Not supported";
    case RDC_ST_FAIL_LOAD_MODULE:
      return "Fail to load module";
    case RDC_ST_INVALID_HANDLER:
      return "Invalid handler";
    case RDC_ST_NOT_FOUND:
      return "Cannot find the value";
    case RDC_ST_BAD_PARAMETER:
      return "Invalid parameters";
    case RDC_ST_MSI_ERROR:
      return "SMI error";
    case RDC_ST_MAX_LIMIT:
      return "The max limit reached";
    case RDC_ST_CONFLICT:
      return "Conflict with current state";
    case RDC_ST_ALREADY_EXIST:
      return "The value already exists";
    case RDC_ST_CLIENT_ERROR:
      return "RDC Client error";
    case RDC_ST_INSUFF_RESOURCES:
      return "Not enough resources to complete operation";
    case RDC_ST_FILE_ERROR:
      return "Failed to access a file";
    case RDC_ST_NO_DATA:
      return "Data was requested, but none was found";
    case RDC_ST_PERM_ERROR:
      return "Insufficient permission to complete operation";
    case RDC_ST_UNKNOWN_ERROR:
      return "Unknown error";
    default:
      return "Unknown";
  }
}
// Maps a diagnostic result code to a static, human-readable label.
// Codes without an explicit case yield "Unknown". The returned pointer refers to a string
// literal and must not be freed.
const char* rdc_diagnostic_result_string(rdc_diag_result_t result) {
  switch (result) {
    case RDC_DIAG_RESULT_PASS:
      return "Pass";
    case RDC_DIAG_RESULT_SKIP:
      return "Skip";
    case RDC_DIAG_RESULT_WARN:
      return "Warn";
    case RDC_DIAG_RESULT_FAIL:
      return "Fail";
    default:
      return "Unknown";
  }
}
// Returns the label registered for field_id in the field description table.
// An id that is not in the table yields "Unknown" (matching the fallback of the other
// *_string helpers) instead of dereferencing the table's end iterator.
// NOTE(review): the returned pointer refers to storage owned by the table entry - presumably
// valid for the life of the process; confirm against get_field_id_description_from_id().
const char* field_id_string(rdc_field_t field_id) {
  amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id();

  const auto found = field_id_to_descript.find(field_id);
  if (found == field_id_to_descript.end()) {
    return "Unknown";
  }
  return found->second.label.c_str();
}
// Looks up a field id by name through amd::rdc::get_field_id_from_name.
// Returns the id written by the lookup when it reports success, RDC_FI_INVALID otherwise.
rdc_field_t get_field_id_from_name(const char* name) {
  rdc_field_t value;
  if (amd::rdc::get_field_id_from_name(name, &value)) {
    return value;
  }
  return RDC_FI_INVALID;
}
// Copies at most n - 1 characters of src into dest and always null-terminates the result.
//   dest - destination buffer with room for at least n bytes
//   src  - source C string; a null src produces an empty string in dest
//   n    - size of dest in bytes, including the terminator
// Returns dest. When n is 0 or dest is null, nothing is written.
char* strncpy_with_null(char* dest, const char* src, size_t n) {
  if (n == 0 || dest == nullptr) {
    return dest;
  }
  if (src == nullptr) {
    dest[0] = '\0';
    return dest;
  }

  // strncpy leaves dest unterminated when src has n - 1 or more characters, so the last byte
  // is set explicitly.
  strncpy(dest, src, n - 1);
  dest[n - 1] = '\0';
  return dest;
}
+22 -25
查看文件
@@ -25,40 +25,37 @@ THE SOFTWARE.
namespace amd {
namespace rdc {

// Starts with no library loaded.
RdcLibraryLoader::RdcLibraryLoader() : libHandler_(nullptr) {}

// Opens the shared library `filename` with dlopen(RTLD_LAZY), replacing any library that is
// already loaded.
// Returns RDC_ST_FAIL_LOAD_MODULE when filename is null or dlopen fails, RDC_ST_OK otherwise.
rdc_status_t RdcLibraryLoader::load(const char* filename) {
  if (filename == nullptr) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  // The previous library is closed while holding the mutex, so the check of libHandler_ and
  // its replacement happen in one critical section. unload() cannot be called here because it
  // takes the same non-recursive mutex.
  std::lock_guard<std::mutex> guard(library_mutex_);
  if (libHandler_) {
    dlclose(libHandler_);
    libHandler_ = nullptr;
  }

  libHandler_ = dlopen(filename, RTLD_LAZY);
  if (!libHandler_) {
    // dlerror() may return a null pointer; streaming a null char* is undefined behavior.
    const char* error = dlerror();
    RDC_LOG(RDC_ERROR,
            "Fail to open " << filename << ": " << (error != nullptr ? error : "unknown error"));
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return RDC_ST_OK;
}

// Closes the loaded library, if any, under the loader mutex. Always returns RDC_ST_OK.
rdc_status_t RdcLibraryLoader::unload() {
  std::lock_guard<std::mutex> guard(library_mutex_);
  if (libHandler_) {
    dlclose(libHandler_);
    libHandler_ = nullptr;
  }
  return RDC_ST_OK;
}

// Releases the library on destruction.
RdcLibraryLoader::~RdcLibraryLoader() { unload(); }

}  // namespace rdc
}  // namespace amd
+38 -38
查看文件
@@ -20,59 +20,59 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/RdcLogger.h"
#include <stdlib.h>
#include <string.h>
#include <sstream>
#include <chrono> // NOLINT
#include <iomanip>
#include <iostream>
#include <chrono> // NOLINT
#include <sstream>
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
RdcLogger::RdcLogger(std::ostream& os):
os_(os) {
char* verbose = getenv("RDC_LOG");
if (verbose == nullptr) {
log_level_ = RDC_ERROR;
} else if (strcmp(verbose, "DEBUG") == 0) {
log_level_ = RDC_DEBUG;
} else if (strcmp(verbose, "INFO") == 0) {
log_level_ = RDC_INFO;
} else {
log_level_ = RDC_ERROR;
}
RdcLogger::RdcLogger(std::ostream& os) : os_(os) {
char* verbose = getenv("RDC_LOG");
if (verbose == nullptr) {
log_level_ = RDC_ERROR;
} else if (strcmp(verbose, "DEBUG") == 0) {
log_level_ = RDC_DEBUG;
} else if (strcmp(verbose, "INFO") == 0) {
log_level_ = RDC_INFO;
} else {
log_level_ = RDC_ERROR;
}
}
std::string RdcLogger::get_log_header(uint32_t severity,
const char* file, int line) {
std::stringstream strstream;
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>
(std::chrono::system_clock::now().time_since_epoch()).count();
strstream << std::fixed << std::setprecision(3) << (ms/1000.0) << " ";
if (severity == RDC_DEBUG) {
strstream << "DEBUG ";
} else if (severity == RDC_INFO) {
strstream << "INFO ";
} else {
strstream << "ERROR ";
}
std::string RdcLogger::get_log_header(uint32_t severity, const char* file, int line) {
std::stringstream strstream;
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
strstream << std::fixed << std::setprecision(3) << (ms / 1000.0) << " ";
if (severity == RDC_DEBUG) {
strstream << "DEBUG ";
} else if (severity == RDC_INFO) {
strstream << "INFO ";
} else {
strstream << "ERROR ";
}
// extract out the file path as it may be very long.
if (file != nullptr) {
std::string file_str(file);
auto found = file_str.find_last_of("/");
if (found != std::string::npos) {
file_str = file_str.substr(found+1);
}
strstream << file_str << "(" << line << "): ";
// extract out the file path as it may be very long.
if (file != nullptr) {
std::string file_str(file);
auto found = file_str.find_last_of("/");
if (found != std::string::npos) {
file_str = file_str.substr(found + 1);
}
strstream << file_str << "(" << line << "): ";
}
return strstream.str();
return strstream.str();
}
} // namespace rdc
} // namespace amd
+347 -378
查看文件
@@ -20,464 +20,433 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcCacheManagerImpl.h"
#include <sys/time.h>
#include <cmath>
#include <ctime>
#include <sstream>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Returns the first cached sample of (gpu_index, field_id) whose timestamp is at or after
// since_time_stamp.
//   next_since_time_stamp - receives the timestamp of the following sample, or the returned
//                           sample's timestamp + 1 when it is the last one; set to
//                           since_time_stamp when no sample qualifies
//   value                 - receives timestamp, type, payload and field id of the sample
// Returns RDC_ST_BAD_PARAMETER for null output pointers, RDC_ST_NOT_FOUND when the field has no
// samples or none is recent enough, RDC_ST_OK otherwise.
rdc_status_t RdcCacheManagerImpl::rdc_field_get_value_since(uint32_t gpu_index,
                                                            rdc_field_t field_id,
                                                            uint64_t since_time_stamp,
                                                            uint64_t* next_since_time_stamp,
                                                            rdc_field_value* value) {
  if (!next_since_time_stamp || !value) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(cache_mutex_);
  RdcFieldKey field{gpu_index, field_id};
  auto cache_samples_ite = cache_samples_.find(field);
  if (cache_samples_ite == cache_samples_.end() || cache_samples_ite->second.size() == 0) {
    return RDC_ST_NOT_FOUND;
  }

  // TODO(bill_liu): Optimize it using the binary search
  // Bind by reference: the samples are only read, so copying the container under the lock
  // is unnecessary.
  const auto& cache_values = cache_samples_ite->second;
  for (auto cache_value = cache_values.begin(); cache_value != cache_values.end(); ++cache_value) {
    if (cache_value->last_time < since_time_stamp) {
      continue;
    }

    // Move to the next potential timestamp.
    auto next_iter = std::next(cache_value);
    if (next_iter != cache_values.end()) {
      *next_since_time_stamp = next_iter->last_time;
    } else {
      // Last item: point one tick into the future so that it is not returned again.
      *next_since_time_stamp = cache_value->last_time + 1;
    }

    value->ts = cache_value->last_time;
    value->type = cache_value->type;
    if (value->type == STRING) {
      strncpy_with_null(value->value.str, cache_value->value.str, RDC_MAX_STR_LENGTH);
    } else {
      value->value.l_int = cache_value->value.l_int;
    }
    value->field_id = field_id;
    return RDC_ST_OK;
  }

  *next_since_time_stamp = since_time_stamp;
  return RDC_ST_NOT_FOUND;
}
// Trims the cached samples of (gpu_index, field_id).
//   max_keep_samples - the oldest samples are dropped until at most this many remain
//   max_keep_age     - samples older than this many seconds are dropped; the comparison is
//                      done in milliseconds against gettimeofday()
// Returns RDC_ST_NOT_FOUND when the field has no samples, RDC_ST_OK otherwise.
rdc_status_t RdcCacheManagerImpl::evict_cache(uint32_t gpu_index, rdc_field_t field_id,
                                              uint64_t max_keep_samples, double max_keep_age) {
  std::lock_guard<std::mutex> guard(cache_mutex_);

  RdcFieldKey field{gpu_index, field_id};
  auto cache_samples_ite = cache_samples_.find(field);
  if (cache_samples_ite == cache_samples_.end() || cache_samples_ite->second.size() == 0) {
    return RDC_ST_NOT_FOUND;
  }

  // Check max_keep_samples. The sizes are compared before subtracting so the unsigned
  // difference can neither wrap around nor be truncated.
  auto& cache_values = cache_samples_ite->second;
  if (cache_values.size() > max_keep_samples) {
    const auto excess = cache_values.size() - max_keep_samples;
    cache_values.erase(cache_values.begin(), cache_values.begin() + excess);
  }

  // Check max_keep_age. Samples are stored oldest first, so the scan stops at the first sample
  // that is still young enough and everything before it is removed with a single erase.
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  const uint64_t now = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
  auto first_kept = cache_values.begin();
  while (first_kept != cache_values.end() &&
         !(first_kept->last_time + max_keep_age * 1000 >= now)) {
    ++first_kept;
  }
  cache_values.erase(cache_values.begin(), first_kept);

  return RDC_ST_OK;
}
// Copies the most recently cached sample of (gpu_index, field_id) into *value.
// Returns RDC_ST_BAD_PARAMETER when value is null, RDC_ST_NOT_FOUND when the field has no
// samples, RDC_ST_OK otherwise.
rdc_status_t RdcCacheManagerImpl::rdc_field_get_latest_value(uint32_t gpu_index,
                                                             rdc_field_t field_id,
                                                             rdc_field_value* value) {
  if (!value) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(cache_mutex_);
  RdcFieldKey field{gpu_index, field_id};
  auto cache_samples_ite = cache_samples_.find(field);
  if (cache_samples_ite == cache_samples_.end() || cache_samples_ite->second.size() == 0) {
    return RDC_ST_NOT_FOUND;
  }

  // The newest sample is the last element of the per-field container.
  auto& cache_value = cache_samples_ite->second.back();
  value->ts = cache_value.last_time;
  value->type = cache_value.type;
  value->value = cache_value.value;
  value->field_id = field_id;

  return RDC_ST_OK;
}
std::string RdcCacheManagerImpl::get_cache_stats() {
std::stringstream strstream;
std::lock_guard<std::mutex> guard(cache_mutex_);
std::stringstream strstream;
std::lock_guard<std::mutex> guard(cache_mutex_);
strstream << "Cache samples:";
auto cache_samples_ite = cache_samples_.begin();
for (; cache_samples_ite != cache_samples_.end(); cache_samples_ite++) {
strstream << "<" << cache_samples_ite->first.first << ","
<< cache_samples_ite->first.second << ":"
<< cache_samples_ite->second.size() << "> ";
}
strstream << "Cache samples:";
auto cache_samples_ite = cache_samples_.begin();
for (; cache_samples_ite != cache_samples_.end(); cache_samples_ite++) {
strstream << "<" << cache_samples_ite->first.first << "," << cache_samples_ite->first.second
<< ":" << cache_samples_ite->second.size() << "> ";
}
strstream <<" Job caches:";
auto job_ite = cache_jobs_.begin();
for ( ; job_ite != cache_jobs_.end(); job_ite++ ) {
strstream << "<" << job_ite->first << ":"
<< job_ite->second.gpu_stats.size() << "> ";
}
strstream << " Job caches:";
auto job_ite = cache_jobs_.begin();
for (; job_ite != cache_jobs_.end(); job_ite++) {
strstream << "<" << job_ite->first << ":" << job_ite->second.gpu_stats.size() << "> ";
}
return strstream.str();
return strstream.str();
}
// Appends one sample to the list cached for (gpu_index, value.field_id),
// creating the list when the key has not been seen before.
// Always returns RDC_ST_OK.
rdc_status_t RdcCacheManagerImpl::rdc_update_cache(uint32_t gpu_index,
                                                   const rdc_field_value& value) {
  // The entry and key only depend on the arguments, so build them before locking.
  RdcCacheEntry sample;
  sample.type = value.type;
  sample.value = value.value;
  sample.last_time = value.ts;
  const RdcFieldKey key{gpu_index, value.field_id};

  const std::lock_guard<std::mutex> lock(cache_mutex_);
  const auto found = cache_samples_.find(key);
  if (found != cache_samples_.end()) {
    found->second.push_back(sample);
    return RDC_ST_OK;
  }

  // First sample for this key: start a new one-element list.
  cache_samples_.insert({key, std::vector<RdcCacheEntry>(1, sample)});
  return RDC_ST_OK;
}
// Drops the cached statistics of one job. The status is RDC_ST_OK whether or
// not an entry for job_id was present.
rdc_status_t RdcCacheManagerImpl::rdc_job_remove(const char job_id[64]) {
  {
    const std::lock_guard<std::mutex> lock(cache_mutex_);
    cache_jobs_.erase(job_id);
  }
  return RDC_ST_OK;
}
// Drops the cached statistics of every job. Always returns RDC_ST_OK.
rdc_status_t RdcCacheManagerImpl::rdc_job_remove_all() {
  {
    const std::lock_guard<std::mutex> lock(cache_mutex_);
    cache_jobs_.clear();
  }
  return RDC_ST_OK;
}
// Folds one field sample into the running statistics of (job_id, gpu_index,
// value.field_id).
//
// Returns RDC_ST_NOT_FOUND when the job, the GPU within the job, or the field
// summary within the GPU is not being tracked; RDC_ST_OK otherwise.
//
// The first sample of a field seeds count/min/max/total and the running-mean
// state. Later samples update min/max/total/count and the running mean and
// sum of squared deviations (https://www.johndcook.com/blog/standard_deviation/).
// RDC_FI_POWER_USAGE samples additionally accumulate energy_consumed.
rdc_status_t RdcCacheManagerImpl::rdc_update_job_stats(uint32_t gpu_index,
                                                       const std::string& job_id,
                                                       const rdc_field_value& value) {
  std::lock_guard<std::mutex> guard(cache_mutex_);
  auto job_iter = cache_jobs_.find(job_id);
  if (job_iter == cache_jobs_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  auto gpu_iter = job_iter->second.gpu_stats.find(gpu_index);
  if (gpu_iter == job_iter->second.gpu_stats.end()) {
    return RDC_ST_NOT_FOUND;
  }

  auto fsummary = gpu_iter->second.field_summaries.find(value.field_id);
  if (fsummary == gpu_iter->second.field_summaries.end()) {
    return RDC_ST_NOT_FOUND;
  }

  auto& gpu_stats = gpu_iter->second;
  auto& stats = fsummary->second;
  const auto sample = value.value.l_int;
  const bool is_power_sample = (value.field_id == RDC_FI_POWER_USAGE);

  if (stats.count == 0) {  // first item
    stats.count = 1;
    stats.max_value = sample;
    stats.min_value = sample;
    stats.total_value = sample;
    stats.last_time = value.ts;
    if (is_power_sample) {
      // Start of the first integration interval for energy accounting.
      gpu_stats.energy_last_time = value.ts;
    }

    // https://www.johndcook.com/blog/standard_deviation/
    stats.old_s = 0;
    stats.old_m = stats.new_m = sample;
    return RDC_ST_OK;
  }

  if (is_power_sample) {
    const uint64_t time_elapsed = value.ts - gpu_stats.energy_last_time;
    // Original note: stored in cache as microseconds and microwatts.
    // NOTE(review): the divisor is 1000 * 1e6, which suggests ts is in
    // milliseconds and power in microwatts — confirm the units.
    gpu_stats.energy_consumed += (time_elapsed * sample) / (1000.0 * 1000000);
    // Advance the interval start so the next sample integrates only over the
    // time since this one, not since the first sample of the job.
    gpu_stats.energy_last_time = value.ts;
  }

  stats.max_value = std::max(stats.max_value, static_cast<int64_t>(sample));
  stats.min_value = std::min(stats.min_value, static_cast<int64_t>(sample));
  stats.total_value += sample;
  stats.last_time = value.ts;
  stats.count++;

  // https://www.johndcook.com/blog/standard_deviation/
  stats.new_m = stats.old_m + (sample - stats.old_m) / stats.count;
  stats.new_s = stats.old_s + (sample - stats.old_m) * (sample - stats.new_m);
  stats.old_m = stats.new_m;
  stats.old_s = stats.new_s;

  return RDC_ST_OK;
}
void RdcCacheManagerImpl::set_summary(const FieldSummaryStats & stats,
rdc_stats_summary_t & gpu, rdc_stats_summary_t& summary,
unsigned int adjuster) {
if (stats.count == 0) {
gpu.min_value = std::numeric_limits<uint64_t>::max();
gpu.max_value = gpu.average = 0;
return;
}
void RdcCacheManagerImpl::set_summary(const FieldSummaryStats& stats, rdc_stats_summary_t& gpu,
rdc_stats_summary_t& summary, unsigned int adjuster) {
if (stats.count == 0) {
gpu.min_value = std::numeric_limits<uint64_t>::max();
gpu.max_value = gpu.average = 0;
return;
}
gpu.max_value = stats.max_value / adjuster;
gpu.min_value = stats.min_value / adjuster;
gpu.average = stats.total_value / stats.count / adjuster;
summary.max_value = std::max(summary.max_value, gpu.max_value);
summary.min_value = std::min(summary.min_value, gpu.min_value);
//< save total for future average calculation.
summary.average += gpu.average;
gpu.max_value = stats.max_value / adjuster;
gpu.min_value = stats.min_value / adjuster;
gpu.average = stats.total_value / stats.count / adjuster;
summary.max_value = std::max(summary.max_value, gpu.max_value);
summary.min_value = std::min(summary.min_value, gpu.min_value);
//< save total for future average calculation.
summary.average += gpu.average;
//< calculate the sample variance
gpu.standard_deviation = std::sqrt((stats.count > 1)
? stats.new_s/(stats.count - 1) : 0.0)/adjuster;
summary.standard_deviation += gpu.standard_deviation;
//< calculate the sample variance
gpu.standard_deviation =
std::sqrt((stats.count > 1) ? stats.new_s / (stats.count - 1) : 0.0) / adjuster;
summary.standard_deviation += gpu.standard_deviation;
}
// Fills *p_job_info with the statistics recorded for job jobId.
//
//   jobId      - id of the job to report on.
//   gpu_gauges - current gauge readings; consulted for the ECC totals of a job
//                that is still running and for RDC_FI_GPU_MEMORY_TOTAL.
//   p_job_info - output: job-wide summary plus one entry per GPU, stored at
//                index gpus[<gpu index>].
//
// Returns RDC_ST_BAD_PARAMETER when jobId or p_job_info is null, or when
// gpu_gauges has no RDC_FI_GPU_MEMORY_TOTAL for one of the job's GPUs (the
// output is partially filled in that case); RDC_ST_NOT_FOUND when the job is
// unknown; RDC_ST_OK otherwise.
rdc_status_t RdcCacheManagerImpl::rdc_job_get_stats(const char jobId[64],
                                                    const rdc_gpu_gauges_t& gpu_gauges,
                                                    rdc_job_info_t* p_job_info) {
  // Both pointers are used below; reject null instead of crashing.
  if (jobId == nullptr || p_job_info == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(cache_mutex_);
  auto job_stats = cache_jobs_.find(jobId);
  if (job_stats == cache_jobs_.end()) {
    return RDC_ST_NOT_FOUND;
  }
  const auto& job = job_stats->second;

  //< Init the summary info
  const bool is_job_stopped = (job.end_time != 0);
  RDC_LOG(RDC_DEBUG, "rdc_job_get_stats for job " << jobId);
  auto& summary_info = p_job_info->summary;
  summary_info.start_time = job.start_time;
  if (job.end_time == 0) {
    // The job is still running: report the current time as its end.
    summary_info.end_time = time(nullptr);
  } else {
    summary_info.end_time = job.end_time;
  }

  summary_info.energy_consumed = 0;
  summary_info.max_gpu_memory_used = 0;
  summary_info.ecc_correct = 0;
  summary_info.ecc_uncorrect = 0;

  // Starting point for every merged summary: min starts at the largest value
  // so the first std::min() in set_summary replaces it.
  const rdc_stats_summary_t empty_summary = {0, std::numeric_limits<uint64_t>::max(), 0, 0};
  summary_info.power_usage = empty_summary;
  summary_info.pcie_tx = empty_summary;
  summary_info.pcie_rx = empty_summary;
  summary_info.gpu_temperature = empty_summary;
  summary_info.memory_clock = empty_summary;
  summary_info.gpu_clock = empty_summary;
  summary_info.gpu_utilization = empty_summary;
  summary_info.memory_utilization = empty_summary;

  p_job_info->num_gpus = job.gpu_stats.size();

  //< Populate information for each GPU
  for (const auto& gpu_entry : job.gpu_stats) {
    const auto gpu_index = gpu_entry.first;
    const auto& gpu_stats = gpu_entry.second;

    auto& gpu_info = p_job_info->gpus[gpu_index];
    gpu_info.start_time = summary_info.start_time;
    gpu_info.end_time = summary_info.end_time;
    gpu_info.energy_consumed = gpu_stats.energy_consumed;
    summary_info.energy_consumed += gpu_info.energy_consumed;

    // Correctable ECC errors. For a stopped job ecc_correct_init is reported
    // as-is; for a running job it is subtracted from the current gauge.
    if (is_job_stopped) {
      gpu_info.ecc_correct = gpu_stats.ecc_correct_init;
      summary_info.ecc_correct += gpu_info.ecc_correct;
    } else {
      const auto correctable = gpu_gauges.find({gpu_index, RDC_FI_ECC_CORRECT_TOTAL});
      if (correctable != gpu_gauges.end()) {
        gpu_info.ecc_correct = correctable->second - gpu_stats.ecc_correct_init;
        summary_info.ecc_correct += gpu_info.ecc_correct;
      } else {
        gpu_info.ecc_correct = 0;
      }
    }

    // Uncorrectable ECC errors, same scheme.
    if (is_job_stopped) {
      gpu_info.ecc_uncorrect = gpu_stats.ecc_uncorrect_init;
      summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect;
    } else {
      const auto uncorrectable = gpu_gauges.find({gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL});
      if (uncorrectable != gpu_gauges.end()) {
        gpu_info.ecc_uncorrect = uncorrectable->second - gpu_stats.ecc_uncorrect_init;
        summary_info.ecc_uncorrect += gpu_info.ecc_uncorrect;
      } else {
        gpu_info.ecc_uncorrect = 0;
      }
    }

    const auto total_memory = gpu_gauges.find({gpu_index, RDC_FI_GPU_MEMORY_TOTAL});
    if (total_memory == gpu_gauges.end()) {
      RDC_LOG(RDC_ERROR, "Cannot find the total memory");
      return RDC_ST_BAD_PARAMETER;
    }
    const uint64_t tmemory = total_memory->second;

    // The last argument of set_summary is the divisor applied to raw values.
    for (const auto& field_entry : gpu_stats.field_summaries) {
      const auto& field_stats = field_entry.second;
      if (field_entry.first == RDC_FI_POWER_USAGE) {
        set_summary(field_stats, gpu_info.power_usage, summary_info.power_usage, 1000000);
      } else if (field_entry.first == RDC_FI_GPU_MEMORY_USAGE) {
        // Dividing by total/100 expresses memory usage relative to total memory.
        set_summary(field_stats, gpu_info.memory_utilization, summary_info.memory_utilization,
                    tmemory / 100);
        gpu_info.max_gpu_memory_used = field_stats.max_value;
        summary_info.max_gpu_memory_used =
            std::max(summary_info.max_gpu_memory_used, gpu_info.max_gpu_memory_used);
      } else if (field_entry.first == RDC_FI_GPU_CLOCK) {
        set_summary(field_stats, gpu_info.gpu_clock, summary_info.gpu_clock, 1000000);
      } else if (field_entry.first == RDC_FI_GPU_UTIL) {
        set_summary(field_stats, gpu_info.gpu_utilization, summary_info.gpu_utilization, 1);
      } else if (field_entry.first == RDC_FI_GPU_TEMP) {
        set_summary(field_stats, gpu_info.gpu_temperature, summary_info.gpu_temperature, 1000);
      } else if (field_entry.first == RDC_FI_MEM_CLOCK) {
        set_summary(field_stats, gpu_info.memory_clock, summary_info.memory_clock, 1000000);
      } else if (field_entry.first == RDC_FI_PCIE_TX) {
        set_summary(field_stats, gpu_info.pcie_tx, summary_info.pcie_tx, 1024 * 1024);
      } else if (field_entry.first == RDC_FI_PCIE_RX) {
        set_summary(field_stats, gpu_info.pcie_rx, summary_info.pcie_rx, 1024 * 1024);
      }
    }
  }

  // Set the average of the summary: set_summary accumulated per-GPU sums.
  set_average_summary(summary_info.power_usage, p_job_info->num_gpus);
  set_average_summary(summary_info.gpu_clock, p_job_info->num_gpus);
  set_average_summary(summary_info.gpu_utilization, p_job_info->num_gpus);
  set_average_summary(summary_info.memory_utilization, p_job_info->num_gpus);
  set_average_summary(summary_info.pcie_tx, p_job_info->num_gpus);
  set_average_summary(summary_info.pcie_rx, p_job_info->num_gpus);
  set_average_summary(summary_info.gpu_temperature, p_job_info->num_gpus);
  set_average_summary(summary_info.memory_clock, p_job_info->num_gpus);

  return RDC_ST_OK;
}
// Turns the per-GPU sums accumulated in summary.average and
// summary.standard_deviation into means over num_gpus GPUs.
// With num_gpus == 0 there is nothing to average, so the summary is left
// unchanged rather than dividing by zero.
void RdcCacheManagerImpl::set_average_summary(rdc_stats_summary_t& summary, uint32_t num_gpus) {
  if (num_gpus == 0) {
    return;
  }
  summary.average = summary.average / num_gpus;
  summary.standard_deviation = summary.standard_deviation / num_gpus;
}
// Begins statistics collection for job_id.
//
// Builds a fresh cache entry with one GPU record per id in ginfo and, inside
// each, one zeroed field summary per id in finfo. The current ECC totals from
// gpu_gauges (0 when a gauge is missing) are stored as the baseline for the
// job. Any previous entry for the same job id is replaced.
// Always returns RDC_ST_OK.
rdc_status_t RdcCacheManagerImpl::rdc_job_start_stats(const char job_id[64],
                                                      const rdc_group_info_t& ginfo,
                                                      const rdc_field_group_info_t& finfo,
                                                      const rdc_gpu_gauges_t& gpu_gauges) {
  RdcJobStatsCacheEntry job_entry;
  job_entry.start_time = std::time(nullptr);
  job_entry.end_time = 0;  // 0 marks the job as still running

  for (unsigned int gpu_pos = 0; gpu_pos < ginfo.count; ++gpu_pos) {  // GPUs
    const auto gpu_index = ginfo.entity_ids[gpu_pos];

    GpuSummaryStats gpu_stats;
    gpu_stats.energy_consumed = 0;
    gpu_stats.energy_last_time = 0;

    for (unsigned int field_pos = 0; field_pos < finfo.count; ++field_pos) {  // init fields
      FieldSummaryStats field_stats;
      field_stats.count = 0;
      field_stats.max_value = field_stats.min_value = field_stats.total_value = 0;
      gpu_stats.field_summaries.insert({finfo.field_ids[field_pos], field_stats});
    }

    gpu_stats.ecc_correct_init = 0;
    const auto correctable = gpu_gauges.find({gpu_index, RDC_FI_ECC_CORRECT_TOTAL});
    if (correctable != gpu_gauges.end()) {
      gpu_stats.ecc_correct_init = correctable->second;
    }

    gpu_stats.ecc_uncorrect_init = 0;
    const auto uncorrectable = gpu_gauges.find({gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL});
    if (uncorrectable != gpu_gauges.end()) {
      gpu_stats.ecc_uncorrect_init = uncorrectable->second;
    }

    job_entry.gpu_stats.insert({gpu_index, gpu_stats});
  }

  const std::lock_guard<std::mutex> lock(cache_mutex_);
  // Remove the old stats if it exists
  cache_jobs_.erase(job_id);
  cache_jobs_.insert({job_id, job_entry});

  return RDC_ST_OK;
}
// Marks job_id as finished: records the end time and, for every GPU of the
// job whose ECC gauge is present in gpu_gauges, replaces the stored baseline
// with (current gauge - baseline).
//
// Returns RDC_ST_NOT_FOUND when the job is unknown, RDC_ST_OK otherwise.
// Note: after this call ecc_*_init holds the difference rather than the
// baseline, so a second call for the same job would subtract again.
rdc_status_t RdcCacheManagerImpl::rdc_job_stop_stats(const char job_id[64],
                                                     const rdc_gpu_gauges_t& gpu_gauges) {
  const std::lock_guard<std::mutex> lock(cache_mutex_);
  const auto job = cache_jobs_.find(job_id);
  if (job == cache_jobs_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  job->second.end_time = std::time(nullptr);

  // update the ecc errors
  for (auto& gpu_entry : job->second.gpu_stats) {
    const auto gpu_index = gpu_entry.first;
    auto& gpu_stats = gpu_entry.second;

    const auto correctable = gpu_gauges.find({gpu_index, RDC_FI_ECC_CORRECT_TOTAL});
    if (correctable != gpu_gauges.end()) {
      gpu_stats.ecc_correct_init = correctable->second - gpu_stats.ecc_correct_init;
    }

    const auto uncorrectable = gpu_gauges.find({gpu_index, RDC_FI_ECC_UNCORRECT_TOTAL});
    if (uncorrectable != gpu_gauges.end()) {
      gpu_stats.ecc_uncorrect_init = uncorrectable->second - gpu_stats.ecc_uncorrect_init;
    }
  }

  return RDC_ST_OK;
}
} // namespace rdc
+87 -93
查看文件
@@ -20,132 +20,126 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcDiagnosticModule.h"
#include <functional>
#include <map>
#include <memory>
#include <vector>
#include <functional>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
namespace amd {
namespace rdc {
// Collects the test cases supported by every registered diagnostic module.
//
// Each module writes its cases into test_cases starting at the slot after the
// cases gathered so far; modules whose query does not return RDC_ST_OK are
// skipped. *test_case_count receives the total number of cases written.
// Returns RDC_ST_BAD_PARAMETER when test_case_count is null, RDC_ST_OK otherwise.
rdc_status_t RdcDiagnosticModule::rdc_diag_test_cases_query(
    rdc_diag_test_cases_t test_cases[MAX_TEST_CASES], uint32_t* test_case_count) {
  if (test_case_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  *test_case_count = 0;
  for (const auto& diag_module : diagnostic_modules_) {
    uint32_t reported = 0;
    const rdc_status_t status =
        diag_module->rdc_diag_test_cases_query(&(test_cases[*test_case_count]), &reported);
    if (status != RDC_ST_OK) {
      continue;
    }
    *test_case_count += reported;
  }

  return RDC_ST_OK;
}
// Runs one diagnostic test case by forwarding it to the module registered for
// that test case.
//
// Returns RDC_ST_BAD_PARAMETER when result is null. When no module is
// registered for test_case, the result is marked RDC_DIAG_RESULT_SKIP with the
// message "Not implemented" and RDC_ST_NOT_SUPPORTED is returned. Otherwise
// the status of the module's own rdc_test_case_run is returned.
rdc_status_t RdcDiagnosticModule::rdc_test_case_run(rdc_diag_test_cases_t test_case,
                                                    uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
                                                    uint32_t gpu_count,
                                                    rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const auto owner = testcases_to_module_.find(test_case);
  if (owner != testcases_to_module_.end()) {
    return owner->second->rdc_test_case_run(test_case, gpu_index, gpu_count, result);
  }

  // No module implements this test case.
  result->status = RDC_DIAG_RESULT_SKIP;
  strncpy_with_null(result->info, "Not implemented", MAX_DIAG_MSG_LENGTH);
  return RDC_ST_NOT_SUPPORTED;
}
// Runs the set of diagnostic test cases selected by level on the GPUs in gpus
// and stores one result per test case in response->diag_info, in run order.
//
// At RDC_DIAG_LVL_SHORT and above five test cases are run; below that none.
// The status returned by each individual test run is not inspected here; the
// outcome is whatever the run wrote into its diag_info slot.
// Returns RDC_ST_BAD_PARAMETER when response is null, RDC_ST_OK otherwise.
rdc_status_t RdcDiagnosticModule::rdc_diagnostic_run(const rdc_group_info_t& gpus,
                                                     rdc_diag_level_t level,
                                                     rdc_diag_response_t* response) {
  if (response == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::vector<rdc_diag_test_cases_t> planned_tests;
  if (level >= RDC_DIAG_LVL_SHORT) {  // Short run and above
    planned_tests = {RDC_DIAG_COMPUTE_PROCESS, RDC_DIAG_NODE_TOPOLOGY, RDC_DIAG_GPU_PARAMETERS,
                     RDC_DIAG_COMPUTE_QUEUE, RDC_DIAG_SYS_MEM_CHECK};
  }

  // rdc_test_case_run takes a non-const id array, hence the cast.
  uint32_t* const gpu_ids = const_cast<uint32_t*>(gpus.entity_ids);

  response->results_count = 0;
  unsigned int slot_index = 0;
  for (const rdc_diag_test_cases_t test_case : planned_tests) {
    auto& slot = response->diag_info[slot_index];
    slot.test_case = test_case;
    rdc_test_case_run(test_case, gpu_ids, gpus.count, &slot);
    response->results_count++;
    ++slot_index;
  }

  return RDC_ST_OK;
}
// Forwards rdc_diag_init(flag) to every registered diagnostic module.
// The modules' return values are not inspected; always returns RDC_ST_OK.
rdc_status_t RdcDiagnosticModule::rdc_diag_init(uint64_t flag) {
  for (const auto& diag_module : diagnostic_modules_) {
    diag_module->rdc_diag_init(flag);
  }
  return RDC_ST_OK;
}
// Forwards rdc_diag_destroy() to every registered diagnostic module.
// The modules' return values are not inspected; always returns RDC_ST_OK.
rdc_status_t RdcDiagnosticModule::RdcDiagnosticModule::rdc_diag_destroy() {
  for (const auto& diag_module : diagnostic_modules_) {
    diag_module->rdc_diag_destroy();
  }
  return RDC_ST_OK;
}
// Creates the SMI, RAS and ROCr diagnostic back ends, registers them (in the
// order SMI, ROCr, RAS) and then records which back end reports each test case.
RdcDiagnosticModule::RdcDiagnosticModule(RdcMetricFetcherPtr& fetcher) {
  const RdcSmiLibPtr smi_module = std::make_shared<RdcSmiLib>(fetcher);
  const RdcRasLibPtr ras_module = std::make_shared<RdcRasLib>();
  const RdcRocrLibPtr rocr_module = std::make_shared<RdcRocrLib>();

  if (smi_module) {
    diagnostic_modules_.push_back(smi_module);
  }
  if (rocr_module) {
    diagnostic_modules_.push_back(rocr_module);
  }
  if (ras_module) {
    diagnostic_modules_.push_back(ras_module);
  }

  for (const auto& diag_module : diagnostic_modules_) {
    rdc_diag_test_cases_t supported[MAX_TEST_CASES];
    uint32_t supported_count = 0;
    if (diag_module->rdc_diag_test_cases_query(supported, &supported_count) != RDC_ST_OK) {
      continue;
    }
    // NOTE(review): if testcases_to_module_ is a unique-key map, insert() keeps
    // an existing entry, so the first module to report a test case owns it — confirm.
    for (uint32_t pos = 0; pos < supported_count; ++pos) {
      testcases_to_module_.insert({supported[pos], diag_module});
    }
  }
}
} // namespace rdc
} // namespace amd
+258 -298
查看文件
@@ -19,48 +19,50 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <string.h>
#include "rdc_lib/impl/RdcEmbeddedHandler.h"
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
#include <string.h>
#include "common/rdc_fields_supported.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/impl/RdcCacheManagerImpl.h"
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
#include "rdc_lib/impl/RdcNotificationImpl.h"
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/RdcNotification.h"
#include "common/rdc_fields_supported.h"
#include "rocm_smi/rocm_smi.h"
namespace {
// Pairs rsmi_init() with rsmi_shut_down() for this library: the single
// instance calls rsmi_init(0) when it is constructed and rsmi_shut_down()
// when it is destroyed.
class rsmi_initializer {
  // Private so that getInstance() is the only way to create the object.
  // Throws RdcException(RDC_ST_FAIL_LOAD_MODULE) when rsmi_init() does not
  // report RSMI_STATUS_SUCCESS.
  rsmi_initializer() {
    // Make sure rsmi will not be initialized multiple times
    rsmi_shut_down();
    rsmi_status_t rsmi_ret = rsmi_init(0);
    if (rsmi_ret != RSMI_STATUS_SUCCESS) {
      throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "RSMI initialize fail");
    }
  }
  ~rsmi_initializer() { rsmi_shut_down(); }

 public:
  // The object stands for one init/shutdown pairing, so copying it makes no
  // sense; say so explicitly instead of relying on the private destructor.
  rsmi_initializer(const rsmi_initializer&) = delete;
  rsmi_initializer& operator=(const rsmi_initializer&) = delete;

  // Returns the one instance, constructing it on first use.
  static rsmi_initializer& getInstance() {
    static rsmi_initializer instance;
    return instance;
  }
};

// Referencing the singleton from a namespace-scope variable forces it to be
// constructed during this translation unit's static initialization.
static rsmi_initializer& in = rsmi_initializer::getInstance();
}  // namespace
// Factory for the embedded handler. The object is allocated with `new` and
// handed back as a raw pointer; nothing in this function releases it.
amd::rdc::RdcHandler* make_handler(rdc_operation_mode_t op_mode) {
  auto* embedded = new amd::rdc::RdcEmbeddedHandler(op_mode);
  return embedded;
}
namespace amd {
@@ -69,368 +71,326 @@ namespace rdc {
// TODO(bill_liu): make it configurable
const uint32_t METIC_UPDATE_FREQUENCY = 1000; // 1000 microseconds by default
// Creates every component the embedded handler delegates to. Several
// components are constructed from ones listed before them (for example the
// watch table receives the group settings, cache manager, module manager and
// notification objects). The metrics updater is started only when `mode` is
// RDC_OPERATION_MODE_AUTO.
RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode)
    : group_settings_(new RdcGroupSettingsImpl()),
      cache_mgr_(new RdcCacheManagerImpl()),
      metric_fetcher_(new RdcMetricFetcherImpl()),
      rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)),
      rdc_notif_(new RdcNotificationImpl()),
      watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, rdc_module_mgr_, rdc_notif_)),
      metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)) {
  // Any mode other than AUTO leaves the updater idle.
  if (mode != RDC_OPERATION_MODE_AUTO) {
    return;
  }

  RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO");
  metrics_updater_->start();
}
// Stops the metrics updater before the handler's members are destroyed. The
// call is made unconditionally, whether or not the updater was started.
RdcEmbeddedHandler::~RdcEmbeddedHandler() {
  metrics_updater_->stop();
}
// JOB API
// Starts job statistics for `job_id` on the GPUs of `groupId`.
// Takes a snapshot of the current GPU gauges first and forwards it to the
// watch table together with the job parameters. Returns the gauge-collection
// error if the snapshot fails, otherwise the watch table's status.
rdc_status_t RdcEmbeddedHandler::rdc_job_start_stats(rdc_gpu_group_t groupId, const char job_id[64],
                                                     uint64_t update_freq) {
  rdc_gpu_gauges_t gauges_at_start;
  const rdc_status_t gauge_status = get_gpu_gauges(&gauges_at_start);
  if (gauge_status != RDC_ST_OK) {
    return gauge_status;
  }

  return watch_table_->rdc_job_start_stats(groupId, job_id, update_freq, gauges_at_start);
}
// Collects the gauge values used by the job APIs for every GPU returned by
// rdc_device_get_all(): total memory (required) and the correctable and
// uncorrectable ECC totals (recorded only when they can be fetched).
//
// Returns RDC_ST_BAD_PARAMETER for a null `gpu_gauges`, the discovery or
// total-memory fetch error if one occurs, and RDC_ST_OK otherwise. Entries
// inserted before a failure are left in `gpu_gauges`.
rdc_status_t RdcEmbeddedHandler::get_gpu_gauges(rdc_gpu_gauges_t* gpu_gauges) {
  if (gpu_gauges == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
  uint32_t count = 0;
  rdc_status_t status = rdc_device_get_all(gpu_index_list, &count);
  if (status != RDC_ST_OK) {
    return status;
  }

  // Fetch total memory and current ecc errors
  for (uint32_t i = 0; i < count; i++) {
    const uint32_t gpu = gpu_index_list[i];
    rdc_field_value value;

    // Total memory is mandatory: give up on the first GPU that cannot report it.
    status = metric_fetcher_->fetch_smi_field(gpu, RDC_FI_GPU_MEMORY_TOTAL, &value);
    if (status != RDC_ST_OK) {
      RDC_LOG(RDC_ERROR, "Fail to get total memory of GPU " << gpu);
      return status;
    }
    gpu_gauges->insert({{gpu, RDC_FI_GPU_MEMORY_TOTAL}, value.value.l_int});

    // The ECC totals are optional: a failed fetch just leaves the entry out.
    if (metric_fetcher_->fetch_smi_field(gpu, RDC_FI_ECC_CORRECT_TOTAL, &value) == RDC_ST_OK) {
      gpu_gauges->insert({{gpu, RDC_FI_ECC_CORRECT_TOTAL}, value.value.l_int});
    }
    if (metric_fetcher_->fetch_smi_field(gpu, RDC_FI_ECC_UNCORRECT_TOTAL, &value) == RDC_ST_OK) {
      gpu_gauges->insert({{gpu, RDC_FI_ECC_UNCORRECT_TOTAL}, value.value.l_int});
    }
  }

  return RDC_ST_OK;
}
// Fills `p_job_info` with the statistics of job `job_id`.
// A fresh gauge snapshot is taken and passed to the cache manager along with
// the job id. Returns RDC_ST_BAD_PARAMETER for a null `p_job_info`, the
// gauge-collection error if the snapshot fails, otherwise the cache manager's
// status.
rdc_status_t RdcEmbeddedHandler::rdc_job_get_stats(const char job_id[64],
                                                   rdc_job_info_t* p_job_info) {
  if (p_job_info == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  rdc_gpu_gauges_t current_gauges;
  const rdc_status_t gauge_status = get_gpu_gauges(&current_gauges);
  if (gauge_status != RDC_ST_OK) {
    return gauge_status;
  }

  return cache_mgr_->rdc_job_get_stats(job_id, current_gauges, p_job_info);
}
// Stops statistics collection for job `job_id`.
// The watch table receives a gauge snapshot taken at stop time. Returns the
// gauge-collection error if the snapshot fails, otherwise the watch table's
// status.
rdc_status_t RdcEmbeddedHandler::rdc_job_stop_stats(const char job_id[64]) {
  rdc_gpu_gauges_t gauges_at_stop;
  const rdc_status_t gauge_status = get_gpu_gauges(&gauges_at_stop);
  if (gauge_status != RDC_ST_OK) {
    return gauge_status;
  }

  return watch_table_->rdc_job_stop_stats(job_id, gauges_at_stop);
}
// Removes job `job_id`; the request is forwarded unchanged to the watch table.
rdc_status_t RdcEmbeddedHandler::rdc_job_remove(const char job_id[64]) {
  const rdc_status_t remove_status = watch_table_->rdc_job_remove(job_id);
  return remove_status;
}
// Removes every job; the request is forwarded unchanged to the watch table.
rdc_status_t RdcEmbeddedHandler::rdc_job_remove_all() {
  const rdc_status_t remove_status = watch_table_->rdc_job_remove_all();
  return remove_status;
}
// Discovery API
// Lists the GPU indexes known to the metric fetcher.
//
// The device count is read through the RDC_FI_GPU_COUNT field; the indexes
// written to `gpu_index_list` are simply 0 .. count-1.
//
// Returns:
//   RDC_ST_BAD_PARAMETER  `count` is null.
//   RDC_ST_MAX_LIMIT      the reported count does not fit in an array of
//                         RDC_MAX_NUM_DEVICES entries; neither `*count` nor
//                         `gpu_index_list` is written in that case.
//   any error returned by the RDC_FI_GPU_COUNT fetch.
//   RDC_ST_OK             otherwise.
rdc_status_t RdcEmbeddedHandler::rdc_device_get_all(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES],
                                                    uint32_t* count) {
  if (!count) {
    return RDC_ST_BAD_PARAMETER;
  }
  rdc_field_value device_count;
  rdc_status_t status = metric_fetcher_->fetch_smi_field(0, RDC_FI_GPU_COUNT, &device_count);
  if (status != RDC_ST_OK) {
    return status;
  }

  // Callers hand in arrays of exactly RDC_MAX_NUM_DEVICES entries, so a larger
  // count would write past the end of the array. Comparing as uint64_t also
  // rejects a negative value should the field be signed, because it converts
  // to a very large number.
  const uint64_t reported = static_cast<uint64_t>(device_count.value.l_int);
  if (reported > static_cast<uint64_t>(RDC_MAX_NUM_DEVICES)) {
    RDC_LOG(RDC_ERROR, "GPU count " << reported << " exceeds the supported maximum");
    return RDC_ST_MAX_LIMIT;
  }

  // Assign the index to the index list
  *count = static_cast<uint32_t>(reported);
  for (uint32_t i = 0; i < *count; i++) {
    gpu_index_list[i] = i;
  }

  return RDC_ST_OK;
}
// Fills `p_rdc_attr->device_name` with the RDC_FI_DEV_NAME field of GPU
// `gpu_index`.
//
// Returns RDC_ST_BAD_PARAMETER for a null `p_rdc_attr`; otherwise the status
// of the field fetch. When the fetch fails the device name is set to an empty
// string.
rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index,
                                                           rdc_device_attributes_t* p_rdc_attr) {
  if (!p_rdc_attr) {
    return RDC_ST_BAD_PARAMETER;
  }
  rdc_field_value device_name;
  const rdc_status_t status =
      metric_fetcher_->fetch_smi_field(gpu_index, RDC_FI_DEV_NAME, &device_name);
  if (status != RDC_ST_OK) {
    // `device_name` is declared without an initializer, so after a failed
    // fetch its string may never have been written. Do not copy from it.
    p_rdc_attr->device_name[0] = '\0';
    return status;
  }

  strncpy_with_null(p_rdc_attr->device_name, device_name.value.str, RDC_MAX_STR_LENGTH);
  return status;
}
// Group API
// Creates a GPU group named `group_name` and stores its id in
// `*p_rdc_group_id`.
//
// For RDC_GROUP_EMPTY the group is left without members. For any other type
// every GPU returned by rdc_device_get_all() is added to the new group.
//
// Returns RDC_ST_BAD_PARAMETER for null pointers, the error of the first step
// that fails (group creation, device discovery, or adding a GPU), otherwise
// RDC_ST_OK. The group is not removed again when a later step fails.
rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_create(rdc_group_type_t type, const char* group_name,
                                                      rdc_gpu_group_t* p_rdc_group_id) {
  if (!group_name || !p_rdc_group_id) {
    return RDC_ST_BAD_PARAMETER;
  }

  rdc_status_t status = group_settings_->rdc_group_gpu_create(group_name, p_rdc_group_id);
  if (status != RDC_ST_OK || type == RDC_GROUP_EMPTY) {
    return status;
  }

  // Add All GPUs to the group
  uint32_t count = 0;
  uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
  status = rdc_device_get_all(gpu_index_list, &count);
  if (status != RDC_ST_OK) {
    return status;
  }
  for (uint32_t i = 0; i < count; i++) {
    status = rdc_group_gpu_add(*p_rdc_group_id, gpu_index_list[i]);
    // Report the first failure; continuing would let a later successful add
    // overwrite the error.
    if (status != RDC_ST_OK) {
      return status;
    }
  }

  return RDC_ST_OK;
}
// Adds GPU `gpu_index` to group `group_id` after checking that the index is
// one of those reported by rdc_device_get_all().
//
// Returns the discovery error if the device list cannot be read,
// RDC_ST_NOT_FOUND when `gpu_index` is not in that list, otherwise the status
// of the group-settings call.
rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_add(rdc_gpu_group_t group_id, uint32_t gpu_index) {
  uint32_t known_gpus[RDC_MAX_NUM_DEVICES];
  uint32_t known_count = 0;
  const rdc_status_t discovery = rdc_device_get_all(known_gpus, &known_count);
  if (discovery != RDC_ST_OK) {
    return discovery;
  }

  // Walk the list until the index is found or the list is exhausted.
  uint32_t pos = 0;
  while (pos < known_count && known_gpus[pos] != gpu_index) {
    ++pos;
  }

  if (pos == known_count) {
    RDC_LOG(RDC_INFO, "Fail to add GPU index " << gpu_index << " to group " << group_id
                                               << " as the GPU index is invalid.");
    return RDC_ST_NOT_FOUND;
  }

  return group_settings_->rdc_group_gpu_add(group_id, gpu_index);
}
// Creates a field group from `num_field_ids` entries of `field_ids`.
//
// Checks, in order:
//   null `field_group_name`, `rdc_field_group_id` or `field_ids`
//                                              -> RDC_ST_BAD_PARAMETER
//   more than RDC_MAX_FIELD_IDS_PER_FIELD_GROUP ids -> RDC_ST_MAX_LIMIT
//   any id rejected by is_field_valid()        -> RDC_ST_NOT_SUPPORTED
// When all checks pass the request is forwarded to the group settings.
rdc_status_t RdcEmbeddedHandler::rdc_group_field_create(uint32_t num_field_ids,
                                                        rdc_field_t* field_ids,
                                                        const char* field_group_name,
                                                        rdc_field_grp_t* rdc_field_group_id) {
  if (!field_group_name || !rdc_field_group_id || !field_ids) {
    return RDC_ST_BAD_PARAMETER;
  }

  if (num_field_ids > RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) {
    return RDC_ST_MAX_LIMIT;
  }

  // Check the field is valid or not
  for (uint32_t i = 0; i < num_field_ids; i++) {
    if (is_field_valid(field_ids[i])) {
      continue;
    }
    RDC_LOG(RDC_INFO, "Fail to create field group with unknown field id " << field_ids[i]);
    return RDC_ST_NOT_SUPPORTED;
  }

  return group_settings_->rdc_group_field_create(num_field_ids, field_ids, field_group_name,
                                                 rdc_field_group_id);
}
// Looks up field group `rdc_field_group_id` and writes its description to
// `field_group_info`. Returns RDC_ST_BAD_PARAMETER for a null output pointer,
// otherwise the status of the group-settings lookup.
rdc_status_t RdcEmbeddedHandler::rdc_group_field_get_info(
    rdc_field_grp_t rdc_field_group_id, rdc_field_group_info_t* field_group_info) {
  if (field_group_info == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const rdc_status_t lookup_status =
      group_settings_->rdc_group_field_get_info(rdc_field_group_id, field_group_info);
  return lookup_status;
}
// Looks up GPU group `p_rdc_group_id` and writes its description to
// `p_rdc_group_info`. Returns RDC_ST_BAD_PARAMETER for a null output pointer,
// otherwise the status of the group-settings lookup.
rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
                                                        rdc_group_info_t* p_rdc_group_info) {
  if (p_rdc_group_info == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const rdc_status_t lookup_status =
      group_settings_->rdc_group_gpu_get_info(p_rdc_group_id, p_rdc_group_info);
  return lookup_status;
}
// Lists the ids of all GPU groups into `group_id_list` and stores how many
// were written in `*count`. Returns RDC_ST_BAD_PARAMETER for a null `count`,
// otherwise the status of the group-settings call.
rdc_status_t RdcEmbeddedHandler::rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[],
                                                       uint32_t* count) {
  if (count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const rdc_status_t list_status = group_settings_->rdc_group_get_all_ids(group_id_list, count);
  return list_status;
}
// Lists the ids of all field groups into `field_group_id_list` and stores how
// many were written in `*count`. Returns RDC_ST_BAD_PARAMETER for a null
// `count`, otherwise the status of the group-settings call.
rdc_status_t RdcEmbeddedHandler::rdc_group_field_get_all_ids(rdc_field_grp_t field_group_id_list[],
                                                             uint32_t* count) {
  if (count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const rdc_status_t list_status =
      group_settings_->rdc_group_field_get_all_ids(field_group_id_list, count);
  return list_status;
}
// Destroys GPU group `p_rdc_group_id`; forwarded unchanged to the group
// settings.
rdc_status_t RdcEmbeddedHandler::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) {
  const rdc_status_t destroy_status = group_settings_->rdc_group_gpu_destroy(p_rdc_group_id);
  return destroy_status;
}
// Destroys field group `rdc_field_group_id`; forwarded unchanged to the group
// settings.
rdc_status_t RdcEmbeddedHandler::rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) {
  const rdc_status_t destroy_status = group_settings_->rdc_group_field_destroy(rdc_field_group_id);
  return destroy_status;
}
// Field API
// Starts watching the fields of `field_group_id` on the GPUs of `group_id`.
// All arguments are forwarded unchanged to the watch table, whose status is
// returned.
rdc_status_t RdcEmbeddedHandler::rdc_field_watch(rdc_gpu_group_t group_id,
                                                 rdc_field_grp_t field_group_id,
                                                 uint64_t update_freq, double max_keep_age,
                                                 uint32_t max_keep_samples) {
  const rdc_status_t watch_status = watch_table_->rdc_field_watch(
      group_id, field_group_id, update_freq, max_keep_age, max_keep_samples);
  return watch_status;
}
// Reads the most recent cached value of `field` for GPU `gpu_index` into
// `value`.
//
// Returns RDC_ST_BAD_PARAMETER for a null `value`, RDC_ST_NOT_SUPPORTED when
// is_field_valid() rejects `field`, otherwise the cache manager's status.
rdc_status_t RdcEmbeddedHandler::rdc_field_get_latest_value(uint32_t gpu_index, rdc_field_t field,
                                                            rdc_field_value* value) {
  if (value == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const bool known_field = is_field_valid(field);
  if (!known_field) {
    RDC_LOG(RDC_INFO, "Fail to get latest value with unknown field id " << field);
    return RDC_ST_NOT_SUPPORTED;
  }

  return cache_mgr_->rdc_field_get_latest_value(gpu_index, field, value);
}
// Reads a cached value of `field` for GPU `gpu_index` relative to
// `since_time_stamp`; the cache manager fills `value` and
// `next_since_time_stamp`.
//
// Returns RDC_ST_BAD_PARAMETER when either output pointer is null,
// RDC_ST_NOT_SUPPORTED when is_field_valid() rejects `field`, otherwise the
// cache manager's status.
rdc_status_t RdcEmbeddedHandler::rdc_field_get_value_since(uint32_t gpu_index, rdc_field_t field,
                                                           uint64_t since_time_stamp,
                                                           uint64_t* next_since_time_stamp,
                                                           rdc_field_value* value) {
  if (next_since_time_stamp == nullptr || value == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  const bool known_field = is_field_valid(field);
  if (!known_field) {
    RDC_LOG(RDC_INFO, "Fail to get value since with unknown field id " << field);
    return RDC_ST_NOT_SUPPORTED;
  }

  return cache_mgr_->rdc_field_get_value_since(gpu_index, field, since_time_stamp,
                                               next_since_time_stamp, value);
}
// Stops watching `field_group_id` on `group_id`; forwarded unchanged to the
// watch table.
rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id,
                                                   rdc_field_grp_t field_group_id) {
  const rdc_status_t unwatch_status = watch_table_->rdc_field_unwatch(group_id, field_group_id);
  return unwatch_status;
}
// Diagnostic API
// Runs diagnostics of the given `level` on the GPUs of `group_id` and stores
// the outcome in `response`.
//
// Returns RDC_ST_BAD_PARAMETER for a null `response`, the group lookup error
// if the group cannot be read, otherwise the diagnostic module's status.
rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id,
                                                    rdc_diag_level_t level,
                                                    rdc_diag_response_t* response) {
  if (response == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Get GPU group information
  rdc_group_info_t group_info;
  const rdc_status_t lookup_status = rdc_group_gpu_get_info(group_id, &group_info);
  if (lookup_status != RDC_ST_OK) {
    return lookup_status;
  }

  return rdc_module_mgr_->get_diagnostic_module()->rdc_diagnostic_run(group_info, level, response);
}
// Runs a single diagnostic `test_case` on the GPUs of `group_id` and stores
// the outcome in `result`.
//
// Returns RDC_ST_BAD_PARAMETER for a null `result`, the group lookup error if
// the group cannot be read, otherwise the diagnostic module's status.
rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id,
                                                   rdc_diag_test_cases_t test_case,
                                                   rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Get GPU group information
  rdc_group_info_t group_info;
  const rdc_status_t lookup_status = rdc_group_gpu_get_info(group_id, &group_info);
  if (lookup_status != RDC_ST_OK) {
    return lookup_status;
  }

  // The module receives the group's member list and its length.
  return rdc_module_mgr_->get_diagnostic_module()->rdc_test_case_run(
      test_case, group_info.entity_ids, group_info.count, result);
}
// Control API
// Triggers an update of all watched fields.
//
// wait_for_update == 1: the update runs on the calling thread and its status
//   is returned.
// any other value: the update is launched with std::async and RDC_ST_OK is
//   returned; the status of that background update is discarded.
//
// NOTE(review): assuming `updater_` is a std::future, assigning a new task to
// it may wait for a previous async update that is still running - confirm
// against the member's declaration.
rdc_status_t RdcEmbeddedHandler::rdc_field_update_all(uint32_t wait_for_update) {
  const bool run_inline = (wait_for_update == 1);
  if (!run_inline) {
    // Async update the field and return immediately.
    updater_ = std::async(std::launch::async, [this]() { watch_table_->rdc_field_update_all(); });
    return RDC_ST_OK;
  }

  return watch_table_->rdc_field_update_all();
}
} // namespace rdc
+148 -159
查看文件
@@ -20,201 +20,190 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcGroupSettingsImpl.h"
#include <ctime>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Registers the built-in "JobStatsFields" field group that the job statistics
// rely on. The status returned by rdc_group_field_create() is not checked.
RdcGroupSettingsImpl::RdcGroupSettingsImpl() {
  // Add the default job stats fields
  rdc_field_t job_fields[] = {RDC_FI_GPU_MEMORY_USAGE, RDC_FI_POWER_USAGE, RDC_FI_GPU_CLOCK,
                              RDC_FI_GPU_UTIL,         RDC_FI_PCIE_TX,     RDC_FI_PCIE_RX,
                              RDC_FI_MEM_CLOCK,        RDC_FI_GPU_TEMP};
  char job_field_group[] = "JobStatsFields";
  rdc_field_grp_t fgid = JOB_FIELD_ID;

  // Divide by the array's own element size so the count stays right whatever
  // the underlying size of rdc_field_t is.
  rdc_group_field_create(sizeof(job_fields) / sizeof(job_fields[0]), job_fields, job_field_group,
                         &fgid);
}
// Registers a new, empty GPU group named `group_name` and stores its id in
// `*p_rdc_group_id`. Ids are handed out from a running counter.
//
// Returns RDC_ST_MAX_LIMIT when RDC_MAX_NUM_GROUPS groups already exist,
// otherwise RDC_ST_OK. Neither pointer is checked for null here.
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_create(const char* group_name,
                                                        rdc_gpu_group_t* p_rdc_group_id) {
  RDC_LOG(RDC_DEBUG, "Create group " << group_name);

  // Prepare the record before taking the lock.
  rdc_group_info_t new_group;
  new_group.count = 0;
  strncpy_with_null(new_group.group_name, group_name, RDC_MAX_STR_LENGTH);

  std::lock_guard<std::mutex> guard(group_mutex_);
  if (gpu_group_.size() >= RDC_MAX_NUM_GROUPS) {
    return RDC_ST_MAX_LIMIT;
  }

  const auto assigned_id = cur_group_id_++;
  gpu_group_.emplace(assigned_id, new_group);
  *p_rdc_group_id = assigned_id;

  return RDC_ST_OK;
}
// Removes GPU group `p_rdc_group_id`. Returns RDC_ST_NOT_FOUND when no such
// group exists, otherwise RDC_ST_OK.
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_destroy(rdc_gpu_group_t p_rdc_group_id) {
  std::lock_guard<std::mutex> guard(group_mutex_);
  const auto removed = gpu_group_.erase(p_rdc_group_id);
  return removed ? RDC_ST_OK : RDC_ST_NOT_FOUND;
}
// Appends `gpu_index` to the member list of group `groupId`.
//
// Returns:
//   RDC_ST_NOT_FOUND      no group with that id exists.
//   RDC_ST_BAD_PARAMETER  the GPU is already a member.
//   RDC_ST_MAX_LIMIT      the group already holds RDC_GROUP_MAX_ENTITIES GPUs.
//   RDC_ST_OK             the GPU was added.
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_add(rdc_gpu_group_t groupId, uint32_t gpu_index) {
  std::lock_guard<std::mutex> guard(group_mutex_);

  auto ite = gpu_group_.find(groupId);
  if (ite == gpu_group_.end()) {
    return RDC_ST_NOT_FOUND;
  }
  auto& group = ite->second;

  // Check whether the index already exists
  for (uint32_t i = 0; i < group.count; i++) {
    if (group.entity_ids[i] != gpu_index) {
      continue;
    }
    RDC_LOG(RDC_INFO, "Fail to add " << gpu_index << " to GPU group " << groupId
                                     << " as it is already exists");
    return RDC_ST_BAD_PARAMETER;
  }

  if (group.count >= RDC_GROUP_MAX_ENTITIES) {
    return RDC_ST_MAX_LIMIT;
  }

  group.entity_ids[group.count] = gpu_index;
  group.count++;
  return RDC_ST_OK;
}
// Copies the name, member count and the first `count` member ids of group
// `p_rdc_group_id` into `p_rdc_group_info`.
//
// Returns RDC_ST_NOT_FOUND when the group does not exist, otherwise
// RDC_ST_OK. `p_rdc_group_info` is not checked for null here.
rdc_status_t RdcGroupSettingsImpl::rdc_group_gpu_get_info(rdc_gpu_group_t p_rdc_group_id,
                                                          rdc_group_info_t* p_rdc_group_info) {
  std::lock_guard<std::mutex> guard(group_mutex_);
  auto ite = gpu_group_.find(p_rdc_group_id);
  if (ite == gpu_group_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  // Read the stored record in place; the lock is held for the whole copy, so
  // there is no need to duplicate the struct first.
  const auto& info = ite->second;
  strncpy_with_null(p_rdc_group_info->group_name, info.group_name, RDC_MAX_STR_LENGTH);
  p_rdc_group_info->count = info.count;
  for (uint32_t i = 0; i < info.count; i++) {
    p_rdc_group_info->entity_ids[i] = info.entity_ids[i];
  }
  return RDC_ST_OK;
}
// Writes the id of every GPU group into `group_id_list` and the number of ids
// written into `*count`.
//
// Returns RDC_ST_BAD_PARAMETER for a null `count`, RDC_ST_MAX_LIMIT if more
// than RDC_MAX_NUM_GROUPS ids would have to be written (the ids written so
// far and `*count` stay in place), otherwise RDC_ST_OK.
rdc_status_t RdcGroupSettingsImpl::rdc_group_get_all_ids(rdc_gpu_group_t group_id_list[],
                                                         uint32_t* count) {
  if (count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  *count = 0;

  std::lock_guard<std::mutex> guard(group_mutex_);
  for (const auto& entry : gpu_group_) {
    if (*count >= RDC_MAX_NUM_GROUPS) {
      return RDC_ST_MAX_LIMIT;
    }
    group_id_list[*count] = entry.first;
    ++(*count);
  }

  return RDC_ST_OK;
}
// Registers a field group named `field_group_name` holding the first
// `num_field_ids` entries of `field_ids`, and stores the new id in
// `*rdc_field_group_id`. Ids are handed out from a running counter.
//
// Returns RDC_ST_MAX_LIMIT when `num_field_ids` exceeds
// RDC_MAX_FIELD_IDS_PER_FIELD_GROUP or when RDC_MAX_NUM_FIELD_GROUPS groups
// already exist, otherwise RDC_ST_OK. The pointers are not checked for null.
rdc_status_t RdcGroupSettingsImpl::rdc_group_field_create(uint32_t num_field_ids,
                                                          rdc_field_t* field_ids,
                                                          const char* field_group_name,
                                                          rdc_field_grp_t* rdc_field_group_id) {
  RDC_LOG(RDC_DEBUG, "Create field group " << field_group_name);

  // Reject oversized requests before building the record.
  if (num_field_ids > RDC_MAX_FIELD_IDS_PER_FIELD_GROUP) {
    return RDC_ST_MAX_LIMIT;
  }

  rdc_field_group_info_t new_group;
  new_group.count = num_field_ids;
  strncpy_with_null(new_group.group_name, field_group_name, RDC_MAX_STR_LENGTH);
  for (uint32_t i = 0; i < num_field_ids; i++) {
    new_group.field_ids[i] = field_ids[i];
  }

  std::lock_guard<std::mutex> guard(field_group_mutex_);
  if (field_group_.size() >= RDC_MAX_NUM_FIELD_GROUPS) {
    return RDC_ST_MAX_LIMIT;
  }

  const auto assigned_id = cur_field_group_id_++;
  field_group_.emplace(assigned_id, new_group);
  *rdc_field_group_id = assigned_id;

  return RDC_ST_OK;
}
// Removes field group `rdc_field_group_id`.
//
// The built-in JOB_FIELD_ID group cannot be removed (RDC_ST_BAD_PARAMETER).
// Returns RDC_ST_NOT_FOUND when no such group exists, otherwise RDC_ST_OK.
rdc_status_t RdcGroupSettingsImpl::rdc_group_field_destroy(rdc_field_grp_t rdc_field_group_id) {
  if (rdc_field_group_id == JOB_FIELD_ID) {
    RDC_LOG(RDC_INFO, "Cannot delete system JOB_FIELD_ID field group");
    return RDC_ST_BAD_PARAMETER;
  }

  std::lock_guard<std::mutex> guard(field_group_mutex_);
  const auto removed = field_group_.erase(rdc_field_group_id);
  return removed ? RDC_ST_OK : RDC_ST_NOT_FOUND;
}
// Copies the name, field count and the first `count` field ids of field group
// `rdc_field_group_id` into `field_group_info`.
//
// Returns RDC_ST_NOT_FOUND when the group does not exist, otherwise
// RDC_ST_OK. `field_group_info` is not checked for null here.
rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_info(
    rdc_field_grp_t rdc_field_group_id, rdc_field_group_info_t* field_group_info) {
  std::lock_guard<std::mutex> guard(field_group_mutex_);
  auto ite = field_group_.find(rdc_field_group_id);
  if (ite == field_group_.end()) {
    return RDC_ST_NOT_FOUND;
  }

  // Read the stored record in place; the lock is held for the whole copy, so
  // there is no need to duplicate the struct first.
  const auto& info = ite->second;
  strncpy_with_null(field_group_info->group_name, info.group_name, RDC_MAX_STR_LENGTH);
  field_group_info->count = info.count;
  for (uint32_t i = 0; i < info.count; i++) {
    field_group_info->field_ids[i] = info.field_ids[i];
  }
  return RDC_ST_OK;
}
// Writes the id of every user-created field group into `field_group_id_list`
// and the number of ids written into `*count`. The built-in JOB_FIELD_ID
// group is left out of the list.
//
// Returns RDC_ST_BAD_PARAMETER for a null `count`, RDC_ST_MAX_LIMIT if the
// limit of RDC_MAX_NUM_FIELD_GROUPS ids is reached while groups remain (the
// ids written so far and `*count` stay in place), otherwise RDC_ST_OK.
rdc_status_t RdcGroupSettingsImpl::rdc_group_field_get_all_ids(
    rdc_field_grp_t field_group_id_list[], uint32_t* count) {
  if (count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  *count = 0;

  std::lock_guard<std::mutex> guard(field_group_mutex_);
  for (const auto& entry : field_group_) {
    // The limit is tested before the system group is skipped.
    if (*count >= RDC_MAX_NUM_FIELD_GROUPS) {
      return RDC_ST_MAX_LIMIT;
    }

    // Skip system defined JOB_FIELD_ID
    if (entry.first == JOB_FIELD_ID) {
      continue;
    }

    field_group_id_list[*count] = entry.first;
    ++(*count);
  }

  return RDC_ST_OK;
}
} // namespace rdc
} // namespace amd
+378 -407
查看文件
@@ -20,34 +20,35 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include <sys/time.h>
#include <string.h>
#include <assert.h>
#include <chrono> //NOLINT
#include <assert.h>
#include <string.h>
#include <sys/time.h>
#include <algorithm>
#include <vector>
#include <chrono> //NOLINT
#include <set>
#include "rdc_lib/rdc_common.h"
#include <vector>
#include "common/rdc_capabilities.h"
#include "common/rdc_fields_supported.h"
#include "rdc_lib/RdcLogger.h"
#include "rocm_smi/rocm_smi.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "common/rdc_capabilities.h"
#include "rdc_lib/rdc_common.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
static const std::unordered_map<rdc_field_t, rsmi_event_type_t>
rdc_evnt_2_rsmi_field = {
{RDC_EVNT_XGMI_0_NOP_TX, RSMI_EVNT_XGMI_0_NOP_TX},
{RDC_EVNT_XGMI_0_REQ_TX, RSMI_EVNT_XGMI_0_REQUEST_TX},
static const std::unordered_map<rdc_field_t, rsmi_event_type_t> rdc_evnt_2_rsmi_field = {
{RDC_EVNT_XGMI_0_NOP_TX, RSMI_EVNT_XGMI_0_NOP_TX},
{RDC_EVNT_XGMI_0_REQ_TX, RSMI_EVNT_XGMI_0_REQUEST_TX},
{RDC_EVNT_XGMI_0_RESP_TX, RSMI_EVNT_XGMI_0_RESPONSE_TX},
{RDC_EVNT_XGMI_0_BEATS_TX, RSMI_EVNT_XGMI_0_BEATS_TX},
{RDC_EVNT_XGMI_1_NOP_TX, RSMI_EVNT_XGMI_1_NOP_TX},
{RDC_EVNT_XGMI_1_REQ_TX, RSMI_EVNT_XGMI_1_REQUEST_TX},
{RDC_EVNT_XGMI_0_BEATS_TX, RSMI_EVNT_XGMI_0_BEATS_TX},
{RDC_EVNT_XGMI_1_NOP_TX, RSMI_EVNT_XGMI_1_NOP_TX},
{RDC_EVNT_XGMI_1_REQ_TX, RSMI_EVNT_XGMI_1_REQUEST_TX},
{RDC_EVNT_XGMI_1_RESP_TX, RSMI_EVNT_XGMI_1_RESPONSE_TX},
{RDC_EVNT_XGMI_1_BEATS_TX, RSMI_EVNT_XGMI_1_BEATS_TX},
{RDC_EVNT_XGMI_1_BEATS_TX, RSMI_EVNT_XGMI_1_BEATS_TX},
{RDC_EVNT_XGMI_0_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_0},
{RDC_EVNT_XGMI_1_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_1},
@@ -58,447 +59,420 @@ static const std::unordered_map<rdc_field_t, rsmi_event_type_t>
};
// Starts the background worker that drains updated_tasks_. The worker sleeps
// on cv_ until a task is queued or task_started_ is cleared, and runs each
// task with task_mutex_ released.
RdcMetricFetcherImpl::RdcMetricFetcherImpl() {
  task_started_ = true;

  // kick off another thread for async fetch
  updater_ = std::async(std::launch::async, [this]() {
    while (task_started_) {
      std::unique_lock<std::mutex> lk(task_mutex_);
      // Wait for tasks or stop signal
      cv_.wait(lk, [this] { return !updated_tasks_.empty() || !task_started_; });
      if (updated_tasks_.empty()) {
        // Woken by the stop signal; the loop condition decides whether to exit.
        continue;
      }

      // Get the tasks
      auto item = updated_tasks_.front();
      updated_tasks_.pop();
      // The task may take long time, release lock
      lk.unlock();

      // run task
      item.task(*this, item.field);
    }  // end while (task_started_)
  });
}
// Asks the background worker to stop.
RdcMetricFetcherImpl::~RdcMetricFetcherImpl() {
  // Notify the async task to stop. The flag is changed while holding
  // task_mutex_: the worker evaluates its wait predicate under that mutex, so
  // without the lock the store and notify could land between the predicate
  // check and the actual block, and the wakeup would be lost.
  {
    std::lock_guard<std::mutex> guard(task_mutex_);
    task_started_ = false;
  }
  cv_.notify_all();
}
// Returns the current wall-clock time from gettimeofday() in milliseconds.
uint64_t RdcMetricFetcherImpl::now() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
}
void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index,
rdc_field_t field_id, rdc_field_value* value) {
rsmi_status_t err = RSMI_STATUS_SUCCESS;
uint64_t correctable_err = 0;
uint64_t uncorrectable_err = 0;
rsmi_ras_err_state_t err_state;
void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) {
rsmi_status_t err = RSMI_STATUS_SUCCESS;
uint64_t correctable_err = 0;
uint64_t uncorrectable_err = 0;
rsmi_ras_err_state_t err_state;
if (!value) {
return;
}
for (uint32_t b = RSMI_GPU_BLOCK_FIRST;
b <= RSMI_GPU_BLOCK_LAST; b = b*2) {
err = rsmi_dev_ecc_status_get(gpu_index, static_cast<rsmi_gpu_block_t>(b),
&err_state);
if (err != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Get the ecc Status error " << b
<< ":" << err);
continue;
}
rsmi_error_count_t ec;
err = rsmi_dev_ecc_count_get(gpu_index,
static_cast<rsmi_gpu_block_t>(b), &ec);
if (err == RSMI_STATUS_SUCCESS) {
correctable_err += ec.correctable_err;
uncorrectable_err += ec.uncorrectable_err;
}
if (!value) {
return;
}
for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; b = b * 2) {
err = rsmi_dev_ecc_status_get(gpu_index, static_cast<rsmi_gpu_block_t>(b), &err_state);
if (err != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Get the ecc Status error " << b << ":" << err);
continue;
}
value->status = RSMI_STATUS_SUCCESS;
value->type = INTEGER;
if (field_id == RDC_FI_ECC_CORRECT_TOTAL) {
value->value.l_int = correctable_err;
}
if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) {
value->value.l_int = uncorrectable_err;
rsmi_error_count_t ec;
err = rsmi_dev_ecc_count_get(gpu_index, static_cast<rsmi_gpu_block_t>(b), &ec);
if (err == RSMI_STATUS_SUCCESS) {
correctable_err += ec.correctable_err;
uncorrectable_err += ec.uncorrectable_err;
}
}
value->status = RSMI_STATUS_SUCCESS;
value->type = INTEGER;
if (field_id == RDC_FI_ECC_CORRECT_TOTAL) {
value->value.l_int = correctable_err;
}
if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) {
value->value.l_int = uncorrectable_err;
}
}
bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index,
rdc_field_t field_id, rdc_field_value* value) {
if (!value) {
bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) {
if (!value) {
return false;
}
do {
std::lock_guard<std::mutex> guard(task_mutex_);
auto metric = async_metrics_.find({gpu_index, field_id});
if (metric != async_metrics_.end()) {
if (now() < metric->second.last_time + metric->second.cache_ttl) {
RDC_LOG(RDC_DEBUG,
"Fetch " << gpu_index << ":" << field_id_string(field_id) << " from cache");
value->status = metric->second.value.status;
value->type = metric->second.value.type;
value->value = metric->second.value.value;
return false;
}
}
do {
std::lock_guard<std::mutex> guard(task_mutex_);
auto metric = async_metrics_.find({gpu_index, field_id});
if ( metric != async_metrics_.end() ) {
if (now() < metric->second.last_time + metric->second.cache_ttl) {
RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" <<
field_id_string(field_id) << " from cache");
value->status = metric->second.value.status;
value->type = metric->second.value.type;
value->value = metric->second.value.value;
return false;
}
}
// add to the async task queue
MetricTask t;
t.field = {gpu_index, field_id};
t.task = &RdcMetricFetcherImpl::get_pcie_throughput;
updated_tasks_.push(t);
// add to the async task queue
MetricTask t;
t.field = {gpu_index, field_id};
t.task = &RdcMetricFetcherImpl::get_pcie_throughput;
updated_tasks_.push(t);
RDC_LOG(RDC_DEBUG,
"Start async fetch " << gpu_index << ":" << field_id_string(field_id) << " to cache.");
} while (0);
cv_.notify_all();
RDC_LOG(RDC_DEBUG, "Start async fetch " << gpu_index << ":" <<
field_id_string(field_id) << " to cache.");
} while (0);
cv_.notify_all();
return true;
return true;
}
void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
uint32_t gpu_index = key.first;
uint64_t sent, received, max_pkt_sz;
rsmi_status_t ret;
uint32_t gpu_index = key.first;
uint64_t sent, received, max_pkt_sz;
rsmi_status_t ret;
// Return if the cache does not expire yet
do {
std::lock_guard<std::mutex> guard(task_mutex_);
auto metric = async_metrics_.find(key);
if (metric != async_metrics_.end() &&
now() < metric->second.last_time + metric->second.cache_ttl) {
return;
}
} while (0);
// Return if the cache does not expire yet
do {
std::lock_guard<std::mutex> guard(task_mutex_);
auto metric = async_metrics_.find(key);
if (metric != async_metrics_.end() &&
now() < metric->second.last_time + metric->second.cache_ttl) {
return;
}
} while (0);
ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz);
ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz);
uint64_t curTime = now();
MetricValue value;
value.cache_ttl = 30*1000; // cache 30 seconds
value.value.type = INTEGER;
do {
std::lock_guard<std::mutex> guard(task_mutex_);
// Create new cache entry it does not exist
auto tx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_TX});
if (tx_metric == async_metrics_.end()) {
tx_metric = async_metrics_.insert(
{{gpu_index, RDC_FI_PCIE_TX}, value}).first;
tx_metric->second.value.field_id = RDC_FI_PCIE_TX;
}
auto rx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_RX});
if (rx_metric == async_metrics_.end()) {
rx_metric = async_metrics_.insert(
{{gpu_index, RDC_FI_PCIE_RX}, value}).first;
rx_metric->second.value.field_id = RDC_FI_PCIE_RX;
}
uint64_t curTime = now();
MetricValue value;
value.cache_ttl = 30 * 1000; // cache 30 seconds
value.value.type = INTEGER;
do {
std::lock_guard<std::mutex> guard(task_mutex_);
// Create new cache entry it does not exist
auto tx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_TX});
if (tx_metric == async_metrics_.end()) {
tx_metric = async_metrics_.insert({{gpu_index, RDC_FI_PCIE_TX}, value}).first;
tx_metric->second.value.field_id = RDC_FI_PCIE_TX;
}
auto rx_metric = async_metrics_.find({gpu_index, RDC_FI_PCIE_RX});
if (rx_metric == async_metrics_.end()) {
rx_metric = async_metrics_.insert({{gpu_index, RDC_FI_PCIE_RX}, value}).first;
rx_metric->second.value.field_id = RDC_FI_PCIE_RX;
}
// Always update the status and last_time
tx_metric->second.last_time = curTime;
tx_metric->second.value.status = ret;
tx_metric->second.value.ts = curTime;
// Always update the status and last_time
tx_metric->second.last_time = curTime;
tx_metric->second.value.status = ret;
tx_metric->second.value.ts = curTime;
rx_metric->second.last_time = curTime;
rx_metric->second.value.status = ret;
rx_metric->second.value.ts = curTime;
rx_metric->second.last_time = curTime;
rx_metric->second.value.status = ret;
rx_metric->second.value.ts = curTime;
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
RDC_LOG(RDC_ERROR,
"PCIe throughput not supported on GPU " << gpu_index);
return;
}
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
RDC_LOG(RDC_ERROR, "PCIe throughput not supported on GPU " << gpu_index);
return;
}
if (ret == RSMI_STATUS_SUCCESS) {
rx_metric->second.value.value.l_int = received;
tx_metric->second.value.value.l_int = sent;
RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":" <<
"RDC_FI_PCIE_RX and RDC_FI_PCIE_TX to cache.");
}
} while (0);
if (ret == RSMI_STATUS_SUCCESS) {
rx_metric->second.value.value.l_int = received;
tx_metric->second.value.value.l_int = sent;
RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":"
<< "RDC_FI_PCIE_RX and RDC_FI_PCIE_TX to cache.");
}
} while (0);
}
// Fetches every bulk-capable field in fields[0..fields_count) with one
// rsmi_dev_gpu_metrics_info_get() call per GPU and appends the successfully
// converted values to results.
//
// Returns RDC_ST_OK on success. Returns RDC_ST_NOT_SUPPORTED, with results
// cleared, when the metrics call fails now or failed on an earlier
// invocation. A socket power reading of 0 is left out of results so the
// caller can fetch that field individually.
rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
    rdc_gpu_field_t* fields, uint32_t fields_count,
    std::vector<rdc_gpu_field_value_t>& results) {  // NOLINT
  // Built once instead of on every call.
  static const std::set<rdc_field_t> rdc_bulk_fields = {
      RDC_FI_GPU_CLOCK,    // current_gfxclk * 1000000
      RDC_FI_MEMORY_TEMP,  // temperature_mem * 1000
      RDC_FI_GPU_TEMP,     // temperature_edge * 1000
      RDC_FI_POWER_USAGE,  // average_socket_power * 1000000
      RDC_FI_GPU_UTIL      // average_gfx_activity
  };

  // To prevent always call the bulk API even if it is not supported,
  // the static is used to cache last try.
  static rsmi_status_t rs = RSMI_STATUS_SUCCESS;
  if (rs != RSMI_STATUS_SUCCESS) {
    results.clear();
    return RDC_ST_NOT_SUPPORTED;
  }

  // Organize the fields per GPU
  std::map<uint32_t, std::vector<rdc_field_t>> bulk_fields;
  for (uint32_t i = 0; i < fields_count; i++) {
    if (rdc_bulk_fields.find(fields[i].field_id) != rdc_bulk_fields.end()) {
      bulk_fields[fields[i].gpu_index].push_back(fields[i].field_id);
    }
  }

  // Call the rocm_smi_lib API to bulk fetch the data
  const auto cur_time = now();
  for (const auto& gpu_entry : bulk_fields) {
    rsmi_gpu_metrics_t gpu_metrics;
    rs = rsmi_dev_gpu_metrics_info_get(gpu_entry.first, &gpu_metrics);
    if (rs != RSMI_STATUS_SUCCESS) {
      results.clear();
      return RDC_ST_NOT_SUPPORTED;
    }

    for (const rdc_field_t field_id : gpu_entry.second) {
      rdc_gpu_field_value_t value;
      value.gpu_index = gpu_entry.first;
      value.field_value.field_id = field_id;
      value.field_value.type = INTEGER;
      value.field_value.status = RSMI_STATUS_SUCCESS;
      value.field_value.ts = cur_time;

      // Each raw metric is widened to int64_t BEFORE scaling. The metric
      // fields are presumably narrow unsigned integers (uint16_t in the ROCm
      // SMI header - confirm), so scaling first would be evaluated in int and
      // can overflow for clocks above roughly 2147 MHz.
      switch (field_id) {
        case RDC_FI_GPU_CLOCK:  // current_gfxclk * 1000000
          value.field_value.value.l_int =
              static_cast<int64_t>(gpu_metrics.current_gfxclk) * 1000000;
          break;
        case RDC_FI_MEMORY_TEMP:  // temperature_mem * 1000
          value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_mem) * 1000;
          break;
        case RDC_FI_GPU_TEMP:  // temperature_edge * 1000
          value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_edge) * 1000;
          break;
        case RDC_FI_POWER_USAGE:  // average_socket_power * 1000000
          value.field_value.value.l_int =
              static_cast<int64_t>(gpu_metrics.average_socket_power) * 1000000;
          // Ignore if the power is 0, which will fallback to non-bulk fetch.
          if (value.field_value.value.l_int == 0) {
            RDC_LOG(RDC_DEBUG, "Bulk fetch " << value.gpu_index << ":"
                                             << "RDC_FI_POWER_USAGE fallback to regular way.");
            continue;  // next field of this GPU
          }
          break;
        case RDC_FI_GPU_UTIL:  // average_gfx_activity
          value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.average_gfx_activity);
          break;
        default:
          value.field_value.status = RSMI_STATUS_NOT_SUPPORTED;
          break;
      }
      if (value.field_value.status == RSMI_STATUS_SUCCESS) {
        results.push_back(value);
      }
    }
  }

  return RDC_ST_OK;
}
static const uint64_t kGig = 1000000000;
// Fetches one field for one GPU through the rocm_smi API and stores the
// result, timestamp and rsmi status in value.
//
// Returns RDC_ST_BAD_PARAMETER when value is null, RDC_ST_NOT_SUPPORTED when
// is_field_valid() rejects field_id, RDC_ST_OK when value->status ends up as
// RSMI_STATUS_SUCCESS, and RDC_ST_MSI_ERROR otherwise. The last case includes
// a PCIe field whose refresh was only queued asynchronously.
rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
                                                   rdc_field_value* value) {
  if (!value) {
    return RDC_ST_BAD_PARAMETER;
  }
  uint64_t i64 = 0;
  bool async_fetching = false;
  RdcFieldKey f_key(gpu_index, field_id);
  std::shared_ptr<FieldRSMIData> rsmi_data;

  if (!is_field_valid(field_id)) {
    RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " which is not supported");
    return RDC_ST_NOT_SUPPORTED;
  }

  value->ts = now();
  value->field_id = field_id;
  value->status = RSMI_STATUS_NOT_SUPPORTED;

  // Reads the event counter registered for f_key into value. The status stays
  // RSMI_STATUS_NOT_SUPPORTED when no counter data is registered.
  auto read_rsmi_counter = [&](void) {
    rsmi_data = get_rsmi_data(f_key);
    if (rsmi_data == nullptr) {
      value->status = RSMI_STATUS_NOT_SUPPORTED;
      return;
    }

    value->status = rsmi_counter_read(rsmi_data->evt_handle, &rsmi_data->counter_val);
    value->value.l_int = rsmi_data->counter_val.value;
    value->type = INTEGER;
  };

  switch (field_id) {
    case RDC_FI_GPU_MEMORY_USAGE:
      value->status = rsmi_dev_memory_usage_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64);
      value->type = INTEGER;
      if (value->status == RSMI_STATUS_SUCCESS) {
        value->value.l_int = static_cast<int64_t>(i64);
      }
      break;
    case RDC_FI_GPU_MEMORY_TOTAL:
      value->status = rsmi_dev_memory_total_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64);
      value->type = INTEGER;
      if (value->status == RSMI_STATUS_SUCCESS) {
        value->value.l_int = static_cast<int64_t>(i64);
      }
      break;
    case RDC_FI_GPU_COUNT: {
      uint32_t num_gpu;
      value->status = rsmi_num_monitor_devices(&num_gpu);
      value->type = INTEGER;
      if (value->status == RSMI_STATUS_SUCCESS) {
        value->value.l_int = static_cast<int64_t>(num_gpu);
      }
      break;
    }
    case RDC_FI_POWER_USAGE:
      // NOTE(review): the second argument looks like a sensor index, yet a
      // temperature-metric enumerator is passed - confirm against the rocm_smi
      // API. Left unchanged to preserve behaviour.
      value->status = rsmi_dev_power_ave_get(gpu_index, RSMI_TEMP_CURRENT, &i64);
      value->type = INTEGER;
      if (value->status == RSMI_STATUS_SUCCESS) {
        value->value.l_int = static_cast<int64_t>(i64);
      }
      break;
    case RDC_FI_GPU_CLOCK:
    case RDC_FI_MEM_CLOCK: {
      rsmi_frequencies_t f;
      rsmi_clk_type_t clk_type = RSMI_CLK_TYPE_SYS;
      if (field_id == RDC_FI_MEM_CLOCK) {
        clk_type = RSMI_CLK_TYPE_MEM;
      }
      value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, clk_type, &f);
      value->type = INTEGER;
      if (value->status == RSMI_STATUS_SUCCESS) {
        value->value.l_int = f.frequency[f.current];
      }
      break;
    }
    case RDC_FI_GPU_UTIL: {
      uint32_t busy_percent;
      value->status = rsmi_dev_busy_percent_get(gpu_index, &busy_percent);
      value->type = INTEGER;
      if (value->status == RSMI_STATUS_SUCCESS) {
        value->value.l_int = static_cast<int64_t>(busy_percent);
      }
      break;
    }
    case RDC_FI_DEV_NAME:
      value->status = rsmi_dev_name_get(gpu_index, value->value.str, RDC_MAX_STR_LENGTH);
      value->type = STRING;
      break;
    case RDC_FI_GPU_TEMP:
    case RDC_FI_MEMORY_TEMP: {
      int64_t val_i64;
      rsmi_temperature_type_t sensor_type = RSMI_TEMP_TYPE_EDGE;
      if (field_id == RDC_FI_MEMORY_TEMP) {
        sensor_type = RSMI_TEMP_TYPE_MEMORY;
      }
      value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64);
      value->type = INTEGER;
      if (value->status == RSMI_STATUS_SUCCESS) {
        value->value.l_int = val_i64;
      }
      break;
    }
    case RDC_FI_ECC_CORRECT_TOTAL:
    case RDC_FI_ECC_UNCORRECT_TOTAL:
      get_ecc_error(gpu_index, field_id, value);
      break;
    case RDC_FI_PCIE_TX:
    case RDC_FI_PCIE_RX:
      async_fetching = async_get_pcie_throughput(gpu_index, field_id, value);
      break;
    case RDC_EVNT_XGMI_0_NOP_TX:
    case RDC_EVNT_XGMI_0_REQ_TX:
    case RDC_EVNT_XGMI_0_RESP_TX:
    case RDC_EVNT_XGMI_0_BEATS_TX:
    case RDC_EVNT_XGMI_1_NOP_TX:
    case RDC_EVNT_XGMI_1_REQ_TX:
    case RDC_EVNT_XGMI_1_RESP_TX:
    case RDC_EVNT_XGMI_1_BEATS_TX:
      read_rsmi_counter();
      break;
    case RDC_EVNT_XGMI_0_THRPUT:
    case RDC_EVNT_XGMI_1_THRPUT:
    case RDC_EVNT_XGMI_2_THRPUT:
    case RDC_EVNT_XGMI_3_THRPUT:
    case RDC_EVNT_XGMI_4_THRPUT:
    case RDC_EVNT_XGMI_5_THRPUT:
      read_rsmi_counter();
      // value->status carries an rsmi status here, so it is compared with the
      // rsmi success code rather than an rdc one.
      if (value->status == RSMI_STATUS_SUCCESS) {
        if (rsmi_data->counter_val.time_running > 0) {
          // Divide in double precision; narrowing time_running to float would
          // drop low-order digits of the collection time.
          const double coll_time_sec =
              static_cast<double>(rsmi_data->counter_val.time_running) / kGig;
          value->value.l_int = (value->value.l_int * 32) / coll_time_sec;
        } else {
          value->value.l_int = 0;
        }
      }
      break;
    default:
      break;
  }

  int64_t latency = now() - value->ts;
  if (value->status != RSMI_STATUS_SUCCESS) {
    if (async_fetching) {  //!< Async fetching is not an error
      RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id));
    } else {
      RDC_LOG(RDC_ERROR, "Fail to fetch " << gpu_index << ":" << field_id_string(field_id)
                                          << " with rsmi error code " << value->status
                                          << ", latency " << latency);
    }
  } else if (value->type == INTEGER) {
    RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << field_id_string(field_id) << ":"
                                << value->value.l_int << ", latency " << latency);
  } else if (value->type == DOUBLE) {
    RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << field_id_string(field_id) << ":"
                                << value->value.dbl << ", latency " << latency);
  } else if (value->type == STRING) {
    RDC_LOG(RDC_DEBUG, "Fetch " << gpu_index << ":" << field_id_string(field_id) << ":"
                                << value->value.str << ", latency " << latency);
  }

  return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
}
std::shared_ptr<FieldRSMIData>
RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) {
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>>::iterator r_info =
rsmi_data_.find(key);
std::shared_ptr<FieldRSMIData> RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) {
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>>::iterator r_info = rsmi_data_.find(key);
if (r_info != rsmi_data_.end()) {
return r_info->second;
@@ -506,8 +480,8 @@ RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) {
return nullptr;
}
static rdc_status_t init_rsmi_counter(RdcFieldKey fk,
rsmi_event_group_t grp, rsmi_event_handle_t *handle) {
static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp,
rsmi_event_handle_t* handle) {
rsmi_status_t ret;
uint32_t counters_available;
uint32_t dv_ind = fk.first;
@@ -535,8 +509,7 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk,
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to acquire required capabilities. Errno " << sc.error());
RDC_LOG(RDC_ERROR, "Failed to acquire required capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
@@ -551,8 +524,7 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk,
sc.Relinquish();
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to relinquish capabilities. Errno " << sc.error());
RDC_LOG(RDC_ERROR, "Failed to relinquish capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
@@ -589,8 +561,7 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
if (ret != RSMI_STATUS_SUCCESS) {
rsmi_data_.erase(fk);
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " <<
Rsmi2RdcError(ret));
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Rsmi2RdcError(ret));
return Rsmi2RdcError(ret);
}
@@ -662,11 +633,11 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
}
if (ret == RDC_ST_INSUFF_RESOURCES) {
amd::rdc::fld_id2name_map_t &field_id_to_descript =
amd::rdc::get_field_id_description_from_id();
amd::rdc::fld_id2name_map_t& field_id_to_descript =
amd::rdc::get_field_id_description_from_id();
RDC_LOG(RDC_ERROR, "No event counters are available for " <<
field_id_to_descript.at(fk.second).enum_name << " event.");
RDC_LOG(RDC_ERROR, "No event counters are available for "
<< field_id_to_descript.at(fk.second).enum_name << " event.");
} else if (ret != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Error in getting event counter handle: " << ret);
}
+23 -28
查看文件
@@ -20,22 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcMetricsUpdaterImpl.h"
#include <sys/time.h>
#include <chrono> // NOLINT(build/c++11)
#include <ctime>
#include <thread>
#include <chrono> // NOLINT(build/c++11)
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Stores the watch table and the update interval. The updater stays idle
// until start() is called. check_frequency is later used by start() as a
// sleep duration in microseconds.
RdcMetricsUpdaterImpl::RdcMetricsUpdaterImpl(const RdcWatchTablePtr& watch_table,
                                             const uint32_t check_frequency)
    : watch_table_(watch_table), started_(false), _check_frequency(check_frequency) {}
// Make the listen time for notifications a relatively long time.
// There's no point in starting/stopping it constantly.
@@ -43,29 +42,25 @@ static const uint32_t kRdcFieldListenNotifTime_mS = 10000;
static const uint32_t kRdcEventCheck_ms = 1000;
// Launches the two background loops; does nothing if already started.
// One loop listens for notification events, the other refreshes all watched
// fields. Both run until started_ is cleared.
void RdcMetricsUpdaterImpl::start() {
  if (started_) {
    return;
  }
  started_ = true;

  // Notification listener: listen for up to kRdcFieldListenNotifTime_mS, then
  // pause kRdcEventCheck_ms before listening again.
  notif_updater_ = std::async(std::launch::async, [this]() {
    while (started_) {
      watch_table_->rdc_field_listen_notif(kRdcFieldListenNotifTime_mS);
      std::this_thread::sleep_for(std::chrono::milliseconds(kRdcEventCheck_ms));
    }
  });

  // Field updater: refresh everything, then sleep _check_frequency microseconds.
  updater_ = std::async(std::launch::async, [this]() {
    while (started_) {
      watch_table_->rdc_field_update_all();
      std::this_thread::sleep_for(std::chrono::microseconds(_check_frequency));
    }
  });
}
// Clears started_. The background loops launched by start() test this flag at
// the top of each iteration, so they exit after their current pass.
void RdcMetricsUpdaterImpl::stop() {
  started_ = false;
}
} // namespace rdc
} // namespace amd
+17 -20
查看文件
@@ -29,33 +29,30 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
// Keeps the metric fetcher that is handed to each module when it is created.
RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher) : fetcher_(fetcher) {}
// Returns the telemetry module, creating it from fetcher_ on first use.
// No locking is performed here.
RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
  if (!rdc_telemetry_module_) {
    rdc_telemetry_module_.reset(new RdcTelemetryModule(fetcher_));
  }

  return rdc_telemetry_module_;
}
// Returns the diagnostic module, creating it from fetcher_ on first use.
// No locking is performed here.
RdcDiagnosticPtr RdcModuleMgrImpl::get_diagnostic_module() {
  if (!rdc_diagnostic_module_) {
    rdc_diagnostic_module_.reset(new RdcDiagnosticModule(fetcher_));
  }

  return rdc_diagnostic_module_;
}
} // namespace rdc
+48 -63
查看文件
@@ -19,73 +19,66 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcNotificationImpl.h"
#include <assert.h>
#include <sys/time.h>
#include <ctime>
#include <ctime>
#include <mutex> // NOLINT
#include <unordered_map>
#include <vector>
#include <mutex> // NOLINT
#include "common/rdc_capabilities.h"
#include "rdc/rdc.h"
#include "rdc_lib/impl/RdcTelemetryModule.h"
#include "rdc_lib/impl/RdcNotificationImpl.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rdc_lib/impl/RdcTelemetryModule.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "rocm_smi/rocm_smi.h"
#include "common/rdc_capabilities.h"
namespace amd {
namespace rdc {
// Maps each RDC notification field to the matching rocm_smi notification type.
static std::unordered_map<rdc_field_t, rsmi_evt_notification_type_t> rdc_2_rsmi_event_notif_map = {
    {RDC_EVNT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_VMFAULT},
    {RDC_EVNT_NOTIF_FIRST, RSMI_EVT_NOTIF_FIRST},
    {RDC_EVNT_NOTIF_THERMAL_THROTTLE, RSMI_EVT_NOTIF_THERMAL_THROTTLE},
    {RDC_EVNT_NOTIF_PRE_RESET, RSMI_EVT_NOTIF_GPU_PRE_RESET},
    {RDC_EVNT_NOTIF_POST_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET},
};
// Translates an RSMI notification type back into the RDC event-notification
// field reported to clients. Left non-const because code in this file looks
// entries up with operator[].
static std::unordered_map<rsmi_evt_notification_type_t, rdc_field_t> rsmi_event_notif_2_rdc_map = {
    {RSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT},
    {RSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST},
    {RSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE},
    {RSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET},
    {RSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET},
};
// Maximum number of RSMI events fetched by a single listen() call. It sizes
// the stack-allocated rsmi_events buffer and caps the caller's requested count.
const uint32_t kMaxRSMIEvents = 64;
// Nothing to set up at construction time.
RdcNotificationImpl::RdcNotificationImpl() {}
// Nothing to tear down explicitly.
RdcNotificationImpl::~RdcNotificationImpl() {}
// Reports whether `field` is an event-notification field, i.e. whether it has
// an entry in rdc_2_rsmi_event_notif_map.
bool RdcNotificationImpl::is_notification_event(rdc_field_t field) const {
  return rdc_2_rsmi_event_notif_map.find(field) != rdc_2_rsmi_event_notif_map.end();
}
rdc_status_t
RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
rsmi_status_t ret;
std::map<uint32_t, uint64_t> new_masks;
for (uint32_t i = 0; i < fk_arr.size(); ++i) {
if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) ==
rdc_2_rsmi_event_notif_map.end()) {
if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) == rdc_2_rsmi_event_notif_map.end()) {
continue;
}
new_masks[fk_arr[i].first] |=
RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]);
RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]);
}
std::map<uint32_t, uint64_t>::iterator it = new_masks.begin();
@@ -101,17 +94,15 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to acquire required capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
RDC_LOG(RDC_ERROR, "Failed to acquire required capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
ret = rsmi_event_notification_init(it->first);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"rsmi_event_notification_init() returned " << ret << " for device " <<
it->first << ". " << std::endl <<
" Will not listen for events on this device");
RDC_LOG(RDC_ERROR, "rsmi_event_notification_init() returned "
<< ret << " for device " << it->first << ". " << std::endl
<< " Will not listen for events on this device");
continue;
}
@@ -120,18 +111,17 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
sc.Relinquish();
if (sc.error()) {
RDC_LOG(RDC_ERROR,
"Failed to relinquish capabilities. Errno " << sc.error());
RDC_LOG(RDC_ERROR, "Failed to relinquish capabilities. Errno " << sc.error());
return RDC_ST_PERM_ERROR;
}
if (ret == RSMI_STATUS_SUCCESS) {
gpu_evnt_notif_masks_[it->first] = it->second;
RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first <<
"is set to 0x" << std::hex << it->second);
RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first << "is set to 0x"
<< std::hex << it->second);
} else {
RDC_LOG(RDC_INFO, "rsmi_event_notification_mask_set() returned " << ret
<< " for device " << it->first);
RDC_LOG(RDC_INFO,
"rsmi_event_notification_mask_set() returned " << ret << " for device " << it->first);
return Rsmi2RdcError(ret);
}
}
@@ -139,9 +129,8 @@ RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
}
// Blocking
rdc_status_t
RdcNotificationImpl::listen(rdc_evnt_notification_t *events,
uint32_t *num_events, uint32_t timeout_ms) {
rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32_t* num_events,
uint32_t timeout_ms) {
if (events == nullptr || *num_events == 0) {
return RDC_ST_BAD_PARAMETER;
}
@@ -149,40 +138,37 @@ RdcNotificationImpl::listen(rdc_evnt_notification_t *events,
uint32_t f_cnt = std::min(*num_events, kMaxRSMIEvents);
rsmi_evt_notification_data_t rsmi_events[kMaxRSMIEvents];
rsmi_status_t ret =
rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events);
rsmi_status_t ret = rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events);
if (ret != RSMI_STATUS_SUCCESS) {
return Rsmi2RdcError(ret);
}
struct timeval tv;
struct timeval tv;
gettimeofday(&tv, NULL);
uint64_t now = static_cast<uint64_t>(tv.tv_sec)*1000+tv.tv_usec/1000;
uint64_t now = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
*num_events = f_cnt;
for (uint32_t i = 0; i < f_cnt; ++i) {
assert(rsmi_event_notif_2_rdc_map.find(rsmi_events[i].event) !=
rsmi_event_notif_2_rdc_map.end());
rsmi_event_notif_2_rdc_map.end());
events[i].gpu_id = rsmi_events[i].dv_ind;
events[i].field.field_id = rsmi_event_notif_2_rdc_map[rsmi_events[i].event];
events[i].field.status = RDC_ST_OK;
events[i].field.ts = now;
events[i].field.type = STRING;
strncpy_with_null(events[i].field.value.str,
rsmi_events[i].message, RDC_MAX_STR_LENGTH);
strncpy_with_null(events[i].field.value.str, rsmi_events[i].message, RDC_MAX_STR_LENGTH);
}
return RDC_ST_OK;
}
rdc_status_t
RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
rdc_status_t RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
rsmi_status_t ret;
ret = rsmi_event_notification_mask_set(gpu_id, 0);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "rsmi_event_notification_mask_set() returned " << ret
<< " for device " << gpu_id);
RDC_LOG(RDC_ERROR,
"rsmi_event_notification_mask_set() returned " << ret << " for device " << gpu_id);
}
ret = rsmi_event_notification_stop(gpu_id);
@@ -190,12 +176,11 @@ RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
std::lock_guard<std::mutex> guard(notif_mutex_);
gpu_evnt_notif_masks_[gpu_id] = 0;
} else {
RDC_LOG(RDC_ERROR, "rsmi_event_notification_stop() returned " << ret
<< " for device " << gpu_id);
RDC_LOG(RDC_ERROR,
"rsmi_event_notification_stop() returned " << ret << " for device " << gpu_id);
}
return RDC_ST_OK;
}
} // namespace rdc
} // namespace amd
+5 -9
查看文件
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#include "rdc_lib/RdcPerfTimer.h"
#include <x86intrin.h>
namespace amd {
@@ -28,9 +29,7 @@ namespace rdc {
static const uint64_t kNanosecondsPerSecond = 1000000000;
// Caches the result of MeasureTSCFreqHz() in freq_in_100mhz at construction.
RdcPerfTimer::RdcPerfTimer(void) { freq_in_100mhz = MeasureTSCFreqHz(); }
RdcPerfTimer::~RdcPerfTimer() {
while (!_timers.empty()) {
@@ -62,8 +61,7 @@ int RdcPerfTimer::StartTimer(int index) {
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
_timers[index]->_start = (uint64_t) s.tv_sec * kNanosecondsPerSecond
+ (uint64_t) s.tv_nsec;
_timers[index]->_start = (uint64_t)s.tv_sec * kNanosecondsPerSecond + (uint64_t)s.tv_nsec;
#else
// AMD timing method
@@ -88,7 +86,7 @@ int RdcPerfTimer::StopTimer(int index) {
#ifndef _AMD
struct timespec s;
clock_gettime(CLOCK_MONOTONIC, &s);
n = (uint64_t) s.tv_sec * kNanosecondsPerSecond + (uint64_t) s.tv_nsec;
n = (uint64_t)s.tv_sec * kNanosecondsPerSecond + (uint64_t)s.tv_nsec;
#else
// AMD Linux timing
@@ -110,9 +108,7 @@ int RdcPerfTimer::StopTimer(int index) {
return 0;
}
// Writes `str` plus a newline to standard output and flushes the stream.
void RdcPerfTimer::Error(std::string str) { std::cout << str << std::endl; }
double RdcPerfTimer::ReadTimer(int index) {
if (index >= static_cast<int>(_timers.size())) {
+121 -138
查看文件
@@ -19,176 +19,159 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <functional>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include <functional>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Loads librdc_ras.so, initializes it, and resolves the entry points this
// wrapper forwards to. Every function pointer starts as nullptr and stays
// nullptr if its symbol cannot be resolved, so the member functions can detect
// a missing entry point.
RdcRasLib::RdcRasLib()
    : fields_value_get_(nullptr),
      fields_query_(nullptr),
      fields_watch_(nullptr),
      fields_unwatch_(nullptr),
      rdc_module_init_(nullptr),
      rdc_module_destroy_(nullptr) {
  rdc_status_t status = lib_loader_.load("librdc_ras.so");
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "RAS related function will not work.");
    return;
  }

  // The module must be initialized before anything else is resolved; give up
  // if the init symbol is missing or initialization fails.
  status = lib_loader_.load_symbol(&rdc_module_init_, "rdc_module_init");
  if (status != RDC_ST_OK) {
    rdc_module_init_ = nullptr;
    return;
  }

  status = rdc_module_init_(0);
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Fail to init librdc_ras.so:" << rdc_status_string(status)
                                                     << ". RAS related function will not work.");
    return;
  }

  // The remaining symbols are optional: a failed lookup only disables the
  // corresponding call.
  status = lib_loader_.load_symbol(&rdc_module_destroy_, "rdc_module_destroy");
  if (status != RDC_ST_OK) {
    rdc_module_destroy_ = nullptr;
  }

  status = lib_loader_.load_symbol(&fields_value_get_, "rdc_telemetry_fields_value_get");
  if (status != RDC_ST_OK) {
    fields_value_get_ = nullptr;
  }

  status = lib_loader_.load_symbol(&fields_query_, "rdc_telemetry_fields_query");
  if (status != RDC_ST_OK) {
    fields_query_ = nullptr;
  }

  status = lib_loader_.load_symbol(&fields_watch_, "rdc_telemetry_fields_watch");
  if (status != RDC_ST_OK) {
    fields_watch_ = nullptr;
  }

  status = lib_loader_.load_symbol(&fields_unwatch_, "rdc_telemetry_fields_unwatch");
  if (status != RDC_ST_OK) {
    fields_unwatch_ = nullptr;
  }
}
// Calls the module's destroy entry point if it was resolved at construction.
RdcRasLib::~RdcRasLib() {
  if (rdc_module_destroy_) {
    rdc_module_destroy_();
  }
}
// Asks the RAS module which field ids it supports.
//
// field_ids   - output array filled in by the module.
// field_count - output; set to 0 when the module entry point is unavailable.
//
// Returns RDC_ST_BAD_PARAMETER if field_count is null,
// RDC_ST_FAIL_LOAD_MODULE if the entry point was not loaded, otherwise the
// module's own status.
rdc_status_t RdcRasLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
                                                   uint32_t* field_count) {
  if (field_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (!fields_query_) {
    *field_count = 0;
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  auto status = fields_query_(field_ids, field_count);
  RDC_LOG(RDC_DEBUG, "RAS support " << *field_count << " fields");
  return status;
}
// Forwards a fetch of `fields_count` fields to the RAS module; `callback` and
// `user_data` are passed through untouched.
//
// Returns RDC_ST_BAD_PARAMETER if fields is null, RDC_ST_FAIL_LOAD_MODULE if
// the entry point was not loaded, otherwise the module's own status.
rdc_status_t RdcRasLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
                                                       uint32_t fields_count,
                                                       rdc_field_value_f callback,
                                                       void* user_data) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (!fields_value_get_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  rdc_status_t status = fields_value_get_(fields, fields_count, callback, user_data);
  RDC_LOG(RDC_DEBUG,
          "Bulk fetched " << fields_count << " fields from RAS: " << rdc_status_string(status));
  return status;
}
// Forwards a watch request for `fields_count` fields to the RAS module.
//
// Returns RDC_ST_BAD_PARAMETER if fields is null, RDC_ST_FAIL_LOAD_MODULE if
// the entry point was not loaded, otherwise the module's own status.
rdc_status_t RdcRasLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (!fields_watch_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  rdc_status_t status = fields_watch_(fields, fields_count);
  RDC_LOG(RDC_DEBUG, "Watch " << fields_count << " fields from RAS: " << rdc_status_string(status));
  return status;
}
// Forwards an unwatch request for `fields_count` fields to the RAS module.
//
// Returns RDC_ST_BAD_PARAMETER if fields is null, RDC_ST_FAIL_LOAD_MODULE if
// the entry point was not loaded, otherwise the module's own status.
rdc_status_t RdcRasLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
                                                     uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (!fields_unwatch_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  rdc_status_t status = fields_unwatch_(fields, fields_count);
  RDC_LOG(RDC_DEBUG,
          "Unwatch " << fields_count << " fields from RAS: " << rdc_status_string(status));
  return status;
}
// Diagnostic test-case query is not implemented by this wrapper; the
// arguments are ignored and RDC_ST_NOT_SUPPORTED is always returned.
rdc_status_t RdcRasLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                                  uint32_t* test_case_count) {
  (void)test_cases;
  (void)test_case_count;
  return RDC_ST_NOT_SUPPORTED;
}
// Run a specific test case.
// Not implemented by this wrapper: the arguments are ignored and
// RDC_ST_NOT_SUPPORTED is always returned.
rdc_status_t RdcRasLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
                                          uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
                                          uint32_t gpu_count, rdc_diag_test_result_t* result) {
  (void)test_case;
  (void)gpu_index;
  (void)result;
  (void)gpu_count;
  return RDC_ST_NOT_SUPPORTED;
}
// Running a diagnostic is not implemented by this wrapper; the arguments are
// ignored and RDC_ST_NOT_SUPPORTED is always returned.
rdc_status_t RdcRasLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
                                           rdc_diag_response_t* response) {
  (void)gpus;
  (void)level;
  (void)response;
  return RDC_ST_NOT_SUPPORTED;
}
// Diagnostic initialization is not implemented by this wrapper; `flags` is
// ignored and RDC_ST_NOT_SUPPORTED is always returned.
rdc_status_t RdcRasLib::rdc_diag_init(uint64_t flags) {
  (void)flags;
  return RDC_ST_NOT_SUPPORTED;
}

// Diagnostic teardown is not implemented by this wrapper; always returns
// RDC_ST_NOT_SUPPORTED.
rdc_status_t RdcRasLib::rdc_diag_destroy() { return RDC_ST_NOT_SUPPORTED; }
} // namespace rdc
} // namespace amd
+116 -128
查看文件
@@ -40,168 +40,156 @@ RdcRocpLib::RdcRocpLib(const char* lib_name)
telemetry_fields_value_get_(nullptr),
telemetry_fields_watch_(nullptr),
telemetry_fields_unwatch_(nullptr) {
rdc_status_t status = lib_loader_.load(lib_name);
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
return;
}
rdc_status_t status = lib_loader_.load(lib_name);
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
return;
}
status = set_rocmtools_path();
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
return;
}
status = set_rocmtools_path();
if (status != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
return;
}
status = lib_loader_.load_symbol(
&telemetry_fields_query_, "rdc_telemetry_fields_query");
if (status != RDC_ST_OK) {
telemetry_fields_query_ = nullptr;
}
status = lib_loader_.load_symbol(&telemetry_fields_query_, "rdc_telemetry_fields_query");
if (status != RDC_ST_OK) {
telemetry_fields_query_ = nullptr;
}
status = lib_loader_.load_symbol(
&telemetry_fields_value_get_, "rdc_telemetry_fields_value_get");
if (status != RDC_ST_OK) {
telemetry_fields_value_get_ = nullptr;
}
status = lib_loader_.load_symbol(&telemetry_fields_value_get_, "rdc_telemetry_fields_value_get");
if (status != RDC_ST_OK) {
telemetry_fields_value_get_ = nullptr;
}
status = lib_loader_.load_symbol(
&telemetry_fields_watch_, "rdc_telemetry_fields_watch");
if (status != RDC_ST_OK) {
telemetry_fields_watch_ = nullptr;
}
status = lib_loader_.load_symbol(&telemetry_fields_watch_, "rdc_telemetry_fields_watch");
if (status != RDC_ST_OK) {
telemetry_fields_watch_ = nullptr;
}
status = lib_loader_.load_symbol(
&telemetry_fields_unwatch_, "rdc_telemetry_fields_unwatch");
if (status != RDC_ST_OK) {
telemetry_fields_unwatch_ = nullptr;
}
status = lib_loader_.load_symbol(&telemetry_fields_unwatch_, "rdc_telemetry_fields_unwatch");
if (status != RDC_ST_OK) {
telemetry_fields_unwatch_ = nullptr;
}
}
// Defaulted destructor: this class defines no explicit teardown logic.
RdcRocpLib::~RdcRocpLib() = default;
// Get the supported field ids from the loaded library.
//
// Returns RDC_ST_BAD_PARAMETER if field_count is null,
// RDC_ST_FAIL_LOAD_MODULE if the entry point was not loaded, otherwise the
// library's own status.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
                                                    uint32_t* field_count) {
  if (field_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (telemetry_fields_query_ == nullptr) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_query_(field_ids, field_count);
}
// Fetch field values through the loaded library; `callback` and `user_data`
// are passed through untouched.
//
// Returns RDC_ST_BAD_PARAMETER if fields is null, RDC_ST_FAIL_LOAD_MODULE if
// the entry point was not loaded, otherwise the library's own status.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
                                                        uint32_t fields_count,
                                                        rdc_field_value_f callback,
                                                        void* user_data) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (telemetry_fields_value_get_ == nullptr) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_value_get_(fields, fields_count, callback, user_data);
}
// Forwards a watch request for `fields_count` fields to the loaded library.
//
// Returns RDC_ST_BAD_PARAMETER if fields is null, RDC_ST_FAIL_LOAD_MODULE if
// the entry point was not loaded, otherwise the library's own status.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
                                                    uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (telemetry_fields_watch_ == nullptr) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_watch_(fields, fields_count);
}
// Forwards an unwatch request for `fields_count` fields to the loaded library.
//
// Returns RDC_ST_BAD_PARAMETER if fields is null, RDC_ST_FAIL_LOAD_MODULE if
// the entry point was not loaded, otherwise the library's own status.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
                                                      uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (telemetry_fields_unwatch_ == nullptr) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_unwatch_(fields, fields_count);
}
// Works out the ROCm installation prefix.
//
// Preference order:
//   1. the directory of a mapped librocmtools.so (found in /proc/self/maps),
//      with ".." appended;
//   2. the ROCM_PATH environment variable;
//   3. the hard-coded default "/opt/rocm".
std::string RdcRocpLib::get_rocm_path() {
  // set default rocm path in case lookup fails
  std::string rocm_path("/opt/rocm");
  const char* rocm_path_env = getenv("ROCM_PATH");
  if (rocm_path_env != nullptr) {
    rocm_path = rocm_path_env;
  }

  std::ifstream file("/proc/self/maps");
  if (!file.is_open()) {
    return rocm_path;
  }

  std::string line;
  while (getline(file, line)) {
    const size_t index_end = line.find("librocmtools.so");
    if (index_end == std::string::npos) {
      // no library on this line
      continue;
    }

    // walk index backwards until it reaches a space; what lies between that
    // space and the library name is the directory the library was mapped from
    size_t index_start = index_end;
    while ((index_start > 0) && (line[index_start - 1] != ' ')) {
      index_start--;
    }

    // extract library path, drop library name
    rocm_path = line.substr(index_start, index_end - index_start);
    // appending ".." should result in "/opt/rocm/lib/.." or similar
    rocm_path += "..";
    return rocm_path;
  }

  return rocm_path;
}
// Makes sure ROCMTOOLS_METRICS_PATH is set and points at a readable file.
//
// The variable is set to a path derived from get_rocm_path() only if it is not
// already present in the environment (setenv is called with overwrite = 0).
//
// Returns RDC_ST_PERM_ERROR if setenv fails, RDC_ST_NO_DATA if the variable is
// still unset afterwards, RDC_ST_FILE_ERROR if the file cannot be opened, and
// RDC_ST_OK otherwise.
rdc_status_t RdcRocpLib::set_rocmtools_path() {
  // librocmtools requires ROCMTOOLS_METRICS_PATH to be set
  std::string rocmtools_metrics_path =
      get_rocm_path() + "/libexec/rocmtools/counters/derived_counters.xml";

  // set rocm prefix
  int result = setenv("ROCMTOOLS_METRICS_PATH", rocmtools_metrics_path.c_str(), 0);
  if (result != 0) {
    RDC_LOG(RDC_ERROR, "setenv ROCMTOOLS_METRICS_PATH failed! " << result);
    return RDC_ST_PERM_ERROR;
  }

  // check that env exists
  const char* rocmtools_metrics_env = getenv("ROCMTOOLS_METRICS_PATH");
  if (rocmtools_metrics_env == nullptr) {
    RDC_LOG(RDC_ERROR, "ROCMTOOLS_METRICS_PATH is not set!");
    return RDC_ST_NO_DATA;
  }

  // check that file can be accessed
  std::ifstream test_file(rocmtools_metrics_env);
  if (!test_file.good()) {
    RDC_LOG(RDC_ERROR, "failed to open ROCMTOOLS_METRICS_PATH: " << rocmtools_metrics_env);
    return RDC_ST_FILE_ERROR;
  }

  return RDC_ST_OK;
}
} // namespace rdc
+80 -91
查看文件
@@ -19,125 +19,114 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <functional>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include <functional>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Loads librdc_rocr.so, initializes it, and resolves the diagnostic entry
// points this wrapper forwards to. Every function pointer starts as nullptr
// and stays nullptr if its symbol cannot be resolved.
RdcRocrLib::RdcRocrLib()
    : test_case_run_(nullptr),
      diag_test_cases_query_(nullptr),
      diag_init_(nullptr),
      diag_destroy_(nullptr) {
  rdc_status_t status = lib_loader_.load("librdc_rocr.so");
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Rocr related function will not work.");
    return;
  }

  // The library must be initialized before anything else is resolved; give up
  // if the init symbol is missing or initialization fails.
  status = lib_loader_.load_symbol(&diag_init_, "rdc_diag_init");
  if (status != RDC_ST_OK) {
    diag_init_ = nullptr;
    return;
  }

  status = diag_init_(0);
  if (status != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Fail to init librdc_rocr.so:" << rdc_status_string(status)
                                                      << ". Rocr related function will not work.");
    return;
  }

  // The remaining symbols are optional: a failed lookup only disables the
  // corresponding call.
  status = lib_loader_.load_symbol(&diag_destroy_, "rdc_diag_destroy");
  if (status != RDC_ST_OK) {
    diag_destroy_ = nullptr;
  }

  status = lib_loader_.load_symbol(&test_case_run_, "rdc_diag_test_case_run");
  if (status != RDC_ST_OK) {
    test_case_run_ = nullptr;
  }

  status = lib_loader_.load_symbol(&diag_test_cases_query_, "rdc_diag_test_cases_query");
  if (status != RDC_ST_OK) {
    diag_test_cases_query_ = nullptr;
  }
}
// Calls the library's destroy entry point if it was resolved at construction.
RdcRocrLib::~RdcRocrLib() {
  if (diag_destroy_) {
    diag_destroy_();
  }
}
// Asks the Rocr library which diagnostic test cases it provides.
//
// Returns RDC_ST_BAD_PARAMETER if test_case_count is null,
// RDC_ST_FAIL_LOAD_MODULE if the entry point was not loaded, otherwise the
// library's own status.
rdc_status_t RdcRocrLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                                   uint32_t* test_case_count) {
  if (test_case_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (!diag_test_cases_query_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  rdc_status_t status = diag_test_cases_query_(test_cases, test_case_count);
  RDC_LOG(RDC_DEBUG,
          "Query " << *test_case_count << " test cases from Rocr: " << rdc_status_string(status));
  return status;
}
// Run a specific test case through the Rocr library.
//
// Returns RDC_ST_BAD_PARAMETER if result is null, RDC_ST_FAIL_LOAD_MODULE if
// the entry point was not loaded, otherwise the library's own status.
rdc_status_t RdcRocrLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
                                           uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
                                           uint32_t gpu_count, rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }
  if (!test_case_run_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  rdc_status_t status = test_case_run_(test_case, gpu_index, gpu_count, result);
  RDC_LOG(RDC_DEBUG, "Run " << test_case << " test case from Rocr: " << rdc_status_string(status));
  return status;
}
// Running a diagnostic is not implemented by this wrapper; the arguments are
// ignored and RDC_ST_NOT_SUPPORTED is always returned.
rdc_status_t RdcRocrLib::rdc_diagnostic_run(const rdc_group_info_t& gpus, rdc_diag_level_t level,
                                            rdc_diag_response_t* response) {
  (void)gpus;
  (void)level;
  (void)response;
  return RDC_ST_NOT_SUPPORTED;
}
// Forwards `flags` to the library's init entry point. Returns
// RDC_ST_FAIL_LOAD_MODULE if that entry point was not loaded.
rdc_status_t RdcRocrLib::rdc_diag_init(uint64_t flags) {
  if (!diag_init_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return diag_init_(flags);
}
// Forwards to the library's destroy entry point. Returns
// RDC_ST_FAIL_LOAD_MODULE if that entry point was not loaded.
rdc_status_t RdcRocrLib::rdc_diag_destroy() {
  if (!diag_destroy_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return diag_destroy_();
}
} // namespace rdc
} // namespace amd
檔案差異因為檔案過大而無法顯示 載入差異
+147 -168
查看文件
@@ -19,220 +19,199 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <functional>
#include <stdlib.h>
#include <strings.h>
#include "rdc_lib/rdc_common.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include <stdlib.h>
#include <strings.h>
#include <functional>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// Stores the metric fetcher, creates the SMI diagnostic helper, and decides
// whether bulk fetching is enabled. Bulk fetch is on only when the
// RDC_BULK_FETCH_ENABLED environment variable equals "true" (compared
// case-insensitively); otherwise it stays off.
RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf)
    : metric_fetcher_(mf),
      bulk_fetch_enabled_(false),
      smi_diag_(std::make_shared<RdcSmiDiagnosticImpl>()) {
  // The value is only read, never modified, so hold it as pointer-to-const.
  const char* bulk_env = getenv("RDC_BULK_FETCH_ENABLED");
  if (bulk_env != nullptr && strcasecmp(bulk_env, "true") == 0) {
    RDC_LOG(RDC_DEBUG, "Bulk fetch enabled.");
    bulk_fetch_enabled_ = true;
  } else {
    RDC_LOG(RDC_DEBUG, "Bulk fetch disabled.");
  }
}
// Bulk fetch wrapper for the rocm_smi_lib. This will be replaced after
// rocm_smi_lib can support bulk fetch.
//
// Reads the requested fields and hands the values to `callback`.
//
// fields:       array of `fields_count` (gpu, field) requests; must not be null.
// fields_count: number of entries in `fields`.
// callback:     receives batches of values; must not be null. A non-OK return
//               value stops processing and is returned to the caller.
// user_data:    passed through to `callback` untouched.
//
// Returns RDC_ST_BAD_PARAMETER for a null `fields` or `callback`, the first
// non-OK status returned by `callback`, and RDC_ST_OK otherwise.
rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
                                                       uint32_t fields_count,
                                                       rdc_field_value_f callback,
                                                       void* user_data) {
  // The callback is invoked unconditionally below, so reject a null one up front.
  if (fields == nullptr || callback == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocm_smi_lib.");

  // Bulk fetch fields
  std::vector<rdc_gpu_field_value_t> bulk_results;
  if (bulk_fetch_enabled_) {
    const rdc_status_t fetch_status =
        metric_fetcher_->bulk_fetch_smi_fields(fields, fields_count, bulk_results);
    RDC_LOG(RDC_DEBUG, "Bulk fetched " << bulk_results.size()
                                       << " fields from rocm_smi_lib which return "
                                       << fetch_status);
    if (!bulk_results.empty()) {
      const rdc_status_t cb_status = callback(bulk_results.data(), bulk_results.size(), user_data);
      if (cb_status != RDC_ST_OK) {
        return cb_status;
      }
    }
  }

  // Fetch the fields that the bulk path did not deliver one by one, and pass
  // them to the callback in batches of at most BULK_FIELDS_MAX values.
  constexpr uint32_t BULK_FIELDS_MAX = 16;
  rdc_gpu_field_value_t values[BULK_FIELDS_MAX];
  uint32_t bulk_count = 0;
  for (uint32_t i = 0; i < fields_count; i++) {
    bool is_fetched = false;
    for (const auto& fetched : bulk_results) {
      if (fetched.gpu_index == fields[i].gpu_index &&
          fetched.field_value.field_id == fields[i].field_id) {
        is_fetched = true;
        break;
      }
    }
    if (is_fetched) continue;

    // The batch buffer is full: flush it before writing the next slot.
    if (bulk_count >= BULK_FIELDS_MAX) {
      const rdc_status_t cb_status = callback(values, bulk_count, user_data);
      // When the callback returns errors, stop processing and return.
      if (cb_status != RDC_ST_OK) {
        return cb_status;
      }
      bulk_count = 0;
    }

    values[bulk_count].gpu_index = fields[i].gpu_index;
    metric_fetcher_->fetch_smi_field(fields[i].gpu_index,
                                     static_cast<rdc_field_t>(fields[i].field_id),
                                     &(values[bulk_count].field_value));
    bulk_count++;
  }

  // Deliver whatever is left in the last, partially filled batch.
  if (bulk_count != 0) {
    const rdc_status_t cb_status = callback(values, bulk_count, user_data);
    if (cb_status != RDC_ST_OK) {
      return cb_status;
    }
  }

  return RDC_ST_OK;
}
// Asks the metric fetcher to acquire a rocm_smi handle for every requested
// (gpu, field) pair.
//
// A failed acquisition is logged and the loop continues with the next field;
// it does not change the return value.
//
// Returns RDC_ST_BAD_PARAMETER when `fields` is null, RDC_ST_OK otherwise.
rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  for (uint32_t idx = 0; idx < fields_count; ++idx) {
    const rdc_status_t acquire_status =
        metric_fetcher_->acquire_rsmi_handle({fields[idx].gpu_index, fields[idx].field_id});
    if (acquire_status == RDC_ST_OK) {
      continue;
    }
    RDC_LOG(RDC_ERROR, "Failed to acquire rocm_smi handle for field.");
  }

  RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from rocm_smi_lib");
  return RDC_ST_OK;
}
// Asks the metric fetcher to delete the rocm_smi handle of every listed
// (gpu, field) pair.
//
// Returns RDC_ST_BAD_PARAMETER when `fields` is null, RDC_ST_OK otherwise.
rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
                                                     uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  uint32_t idx = 0;
  while (idx < fields_count) {
    metric_fetcher_->delete_rsmi_handle({fields[idx].gpu_index, fields[idx].field_id});
    ++idx;
  }

  RDC_LOG(RDC_DEBUG, "delete " << fields_count << " field handles from rocm_smi_lib");
  return RDC_ST_OK;
}
// Reports the field ids this provider lists as supported by rocm_smi_lib.
//
// field_ids:   output array with room for MAX_NUM_FIELDS ids; must not be null.
// field_count: receives the number of ids written; must not be null.
//
// Returns RDC_ST_BAD_PARAMETER when either pointer is null, RDC_ST_OK otherwise.
rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
                                                   uint32_t* field_count) {
  // Both pointers are written below; a null output array would be dereferenced by std::copy.
  if (field_ids == nullptr || field_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // List of fields supported by rocm_smi_lib. Kept in a static table so that
  // no container is allocated on every query.
  static const uint32_t kSupportedFields[] = {
      RDC_FI_GPU_COUNT,         RDC_FI_DEV_NAME,
      RDC_FI_GPU_CLOCK,         RDC_FI_MEM_CLOCK,
      RDC_FI_MEMORY_TEMP,       RDC_FI_GPU_TEMP,
      RDC_FI_POWER_USAGE,       RDC_FI_PCIE_TX,
      RDC_FI_PCIE_RX,           RDC_FI_GPU_UTIL,
      RDC_FI_GPU_MEMORY_USAGE,  RDC_FI_GPU_MEMORY_TOTAL,
      RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL,
      RDC_EVNT_XGMI_0_NOP_TX,   RDC_EVNT_XGMI_0_REQ_TX,
      RDC_EVNT_XGMI_0_RESP_TX,  RDC_EVNT_XGMI_0_BEATS_TX,
      RDC_EVNT_XGMI_1_NOP_TX,   RDC_EVNT_XGMI_1_REQ_TX,
      RDC_EVNT_XGMI_1_RESP_TX,  RDC_EVNT_XGMI_1_BEATS_TX,
      RDC_EVNT_XGMI_0_THRPUT,   RDC_EVNT_XGMI_1_THRPUT,
      RDC_EVNT_XGMI_2_THRPUT,   RDC_EVNT_XGMI_3_THRPUT,
      RDC_EVNT_XGMI_4_THRPUT,   RDC_EVNT_XGMI_5_THRPUT,
  };
  constexpr uint32_t kSupportedCount = sizeof(kSupportedFields) / sizeof(kSupportedFields[0]);
  static_assert(kSupportedCount <= MAX_NUM_FIELDS,
                "supported field table does not fit in a MAX_NUM_FIELDS output array");

  std::copy(kSupportedFields, kSupportedFields + kSupportedCount, field_ids);
  *field_count = kSupportedCount;

  return RDC_ST_OK;
}
// Reports the diagnostic test cases this provider can run.
//
// test_cases:      output array with room for MAX_TEST_CASES entries; must not be null.
// test_case_count: receives the number of entries written; must not be null.
//
// Returns RDC_ST_BAD_PARAMETER when either pointer is null, RDC_ST_OK otherwise.
rdc_status_t RdcSmiLib::rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                                  uint32_t* test_case_count) {
  // Both pointers are written below; a null output array would be dereferenced by std::copy.
  if (test_cases == nullptr || test_case_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Static table: nothing is allocated per call.
  static const rdc_diag_test_cases_t kTests[] = {RDC_DIAG_COMPUTE_PROCESS, RDC_DIAG_NODE_TOPOLOGY,
                                                 RDC_DIAG_GPU_PARAMETERS};
  constexpr uint32_t kTestCount = sizeof(kTests) / sizeof(kTests[0]);
  static_assert(kTestCount <= MAX_TEST_CASES,
                "test case table does not fit in a MAX_TEST_CASES output array");

  std::copy(kTests, kTests + kTestCount, test_cases);
  *test_case_count = kTestCount;
  return RDC_ST_OK;
}
// Run a specific test case.
//
// Forwards `gpu_index`, `gpu_count` and `result` to the diagnostic helper
// method that corresponds to `test_case` and returns its status.
//
// Returns RDC_ST_BAD_PARAMETER when `result` is null and RDC_ST_NOT_SUPPORTED
// for any test case other than the three handled here.
rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
                                          uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
                                          uint32_t gpu_count, rdc_diag_test_result_t* result) {
  if (result == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  if (test_case == RDC_DIAG_COMPUTE_PROCESS) {
    return smi_diag_->check_rsmi_process_info(gpu_index, gpu_count, result);
  }
  if (test_case == RDC_DIAG_NODE_TOPOLOGY) {
    return smi_diag_->check_rsmi_topo_info(gpu_index, gpu_count, result);
  }
  if (test_case == RDC_DIAG_GPU_PARAMETERS) {
    return smi_diag_->check_rsmi_param_info(gpu_index, gpu_count, result);
  }

  return RDC_ST_NOT_SUPPORTED;
}
// Group-level diagnostic run. This provider does not implement it: all three
// arguments are ignored and RDC_ST_NOT_SUPPORTED is always returned.
rdc_status_t RdcSmiLib::rdc_diagnostic_run(const rdc_group_info_t& /*unused*/,
                                           rdc_diag_level_t /*unused*/,
                                           rdc_diag_response_t* /*unused*/) {
  const rdc_status_t status = RDC_ST_NOT_SUPPORTED;
  return status;
}
// Diagnostic initialization hook. Nothing is done here: the argument is
// ignored and RDC_ST_OK is always returned.
rdc_status_t RdcSmiLib::rdc_diag_init(uint64_t /*unused*/) {
  const rdc_status_t status = RDC_ST_OK;
  return status;
}
// Diagnostic teardown hook. Nothing is done here; RDC_ST_OK is always returned.
rdc_status_t RdcSmiLib::rdc_diag_destroy() {
  const rdc_status_t status = RDC_ST_OK;
  return status;
}
} // namespace rdc
} // namespace amd
+98 -108
查看文件
@@ -20,8 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcTelemetryModule.h"
#include <functional>
#include <memory>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/impl/RdcRasLib.h"
@@ -31,140 +33,128 @@ namespace amd {
namespace rdc {
// Return all supported fields.
//
// Queries every registered telemetry module and concatenates the ids they
// report into `field_ids`. A module whose query does not return RDC_ST_OK
// contributes nothing.
//
// field_ids:   output array with room for MAX_NUM_FIELDS ids; must not be null.
// field_count: receives the number of ids written; must not be null.
//
// Returns RDC_ST_BAD_PARAMETER when either pointer is null, RDC_ST_OK otherwise.
// If the modules together report more than MAX_NUM_FIELDS ids, the surplus is
// dropped and an error is logged.
rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS],
                                                            uint32_t* field_count) {
  if (field_ids == nullptr || field_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  *field_count = 0;
  for (const auto& provider : telemetry_modules_) {
    // Each module is entitled to a full MAX_NUM_FIELDS buffer. Handing it the
    // tail of the caller's array would let it write past the end, so collect
    // into a scratch buffer first.
    uint32_t provider_ids[MAX_NUM_FIELDS];
    uint32_t provider_count = 0;
    if (provider->rdc_telemetry_fields_query(provider_ids, &provider_count) != RDC_ST_OK) {
      continue;
    }

    const uint32_t room = static_cast<uint32_t>(MAX_NUM_FIELDS) - *field_count;
    if (provider_count > room) {
      RDC_LOG(RDC_ERROR, "Supported field list exceeds " << MAX_NUM_FIELDS
                                                         << " entries, dropping the surplus.");
      provider_count = room;
    }

    std::copy(provider_ids, provider_ids + provider_count, field_ids + *field_count);
    *field_count += provider_count;
  }

  return RDC_ST_OK;
}
// Splits the requested fields by owning telemetry module and asks each module
// to watch its share. The status returned by the modules is not inspected.
//
// Returns RDC_ST_BAD_PARAMETER when `fields` is null, RDC_ST_OK otherwise.
rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields,
                                                            uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> per_module;
  std::vector<rdc_gpu_field_value_t> rejected;
  get_fields_for_module(fields, fields_count, per_module, rejected);

  for (auto& entry : per_module) {
    std::vector<rdc_gpu_field_t>& batch = entry.second;
    if (batch.empty()) {
      continue;
    }
    entry.first->rdc_telemetry_fields_watch(&batch[0], batch.size());
  }

  return RDC_ST_OK;
}
// Splits the listed fields by owning telemetry module and asks each module to
// stop watching its share. The status returned by the modules is not inspected.
//
// Returns RDC_ST_BAD_PARAMETER when `fields` is null, RDC_ST_OK otherwise.
rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
                                                              uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> per_module;
  std::vector<rdc_gpu_field_value_t> rejected;
  get_fields_for_module(fields, fields_count, per_module, rejected);

  for (auto& entry : per_module) {
    std::vector<rdc_gpu_field_t>& batch = entry.second;
    if (batch.empty()) {
      continue;
    }
    entry.first->rdc_telemetry_fields_unwatch(&batch[0], batch.size());
  }

  return RDC_ST_OK;
}
// Creates the SMI and RAS telemetry modules, registers them, and records for
// every field id a module reports which module reported it.
//
// A module whose field query does not return RDC_ST_OK contributes no ids.
RdcTelemetryModule::RdcTelemetryModule(RdcMetricFetcherPtr fetcher) {
  const RdcSmiLibPtr smi_module = std::make_shared<RdcSmiLib>(fetcher);
  const RdcRasLibPtr ras_module = std::make_shared<RdcRasLib>();

  telemetry_modules_.push_back(smi_module);
  if (ras_module) {
    telemetry_modules_.push_back(ras_module);
  }

  for (const auto& provider : telemetry_modules_) {
    uint32_t supported_ids[MAX_NUM_FIELDS];
    uint32_t supported_count = 0;
    if (provider->rdc_telemetry_fields_query(supported_ids, &supported_count) != RDC_ST_OK) {
      continue;
    }
    for (uint32_t k = 0; k < supported_count; ++k) {
      fields_id_module_.insert({supported_ids[k], provider});
    }
  }
}
// Sorts the requested fields by the telemetry module registered for their id.
//
// fields / fields_count: the (gpu, field) requests to sort.
// fields_in_module:      receives, per module, the requests it owns (appended).
// unsupport_fields:      receives one value per request that no module owns,
//                        carrying the gpu index, the field id and the status
//                        RDC_ST_NOT_SUPPORTED; all other members are zeroed.
void RdcTelemetryModule::get_fields_for_module(
    rdc_gpu_field_t* fields, uint32_t fields_count,
    std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>>& fields_in_module,
    std::vector<rdc_gpu_field_value_t>& unsupport_fields) {
  for (uint32_t findex = 0; findex < fields_count; findex++) {
    // Look the id up with find(): operator[] would insert an empty entry into
    // fields_id_module_ for every unknown id that is ever requested.
    const auto owner = fields_id_module_.find(fields[findex].field_id);
    if (owner != fields_id_module_.end() && owner->second) {
      fields_in_module[owner->second].push_back(fields[findex]);
      continue;
    }

    RDC_LOG(RDC_DEBUG, "Unsupported field " << field_id_string(fields[findex].field_id));
    // Value-initialize so the members not set below are not left indeterminate.
    rdc_gpu_field_value_t value{};
    value.gpu_index = fields[findex].gpu_index;
    value.field_value.field_id = fields[findex].field_id;
    value.field_value.status = RDC_ST_NOT_SUPPORTED;
    unsupport_fields.push_back(value);
  }
}
// Fetches the requested fields by dispatching them to the telemetry module
// registered for each field id.
//
// fields / fields_count: the (gpu, field) requests; `fields` must not be null.
// callback:              handed to every module and also called once here for
//                        the requests no module owns; must not be null.
// user_data:             passed through to `callback` untouched.
//
// Returns RDC_ST_BAD_PARAMETER for a null `fields` or `callback`, RDC_ST_OK
// otherwise. The status returned by the modules and by `callback` is not
// inspected.
rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
                                                                uint32_t fields_count,
                                                                rdc_field_value_f callback,
                                                                void* user_data) {
  if (fields == nullptr || callback == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Dispatch the fields to the libraries
  std::map<RdcTelemetryPtr, std::vector<rdc_gpu_field_t>> fields_to_fetch;
  std::vector<rdc_gpu_field_value_t> unsupport_fields;
  get_fields_for_module(fields, fields_count, fields_to_fetch, unsupport_fields);

  for (auto& entry : fields_to_fetch) {
    std::vector<rdc_gpu_field_t>& batch = entry.second;
    if (batch.empty()) {
      continue;
    }
    // Pass the vector's own storage. Copying into a fixed MAX_NUM_FIELDS stack
    // array overflows it as soon as one module owns more requests than that.
    entry.first->rdc_telemetry_fields_value_get(batch.data(), batch.size(), callback, user_data);
  }

  // Notify the caller unsupported fields. Skipped when there are none, so the
  // callback never receives a pointer taken from an empty vector.
  if (!unsupport_fields.empty()) {
    callback(unsupport_fields.data(), unsupport_fields.size(), user_data);
  }

  return RDC_ST_OK;
}
} // namespace rdc
+426 -460
查看文件
@@ -20,482 +20,450 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <sys/time.h>
#include <ctime>
#include <sstream>
#include <algorithm>
#include <map>
#include <unordered_map>
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include <sys/time.h>
#include <algorithm>
#include <ctime>
#include <map>
#include <sstream>
#include <unordered_map>
#include "common/rdc_utils.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcMetricFetcherImpl.h"
#include "rdc_lib/rdc_common.h"
#include "common/rdc_utils.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc/rdc.h"
namespace amd {
namespace rdc {
// Stores the collaborators handed in by the caller and starts with a
// last-cleanup time of 0. No other work is done at construction.
RdcWatchTableImpl::RdcWatchTableImpl(const RdcGroupSettingsPtr& group_settings,
                                     const RdcCacheManagerPtr& cache_mgr,
                                     const RdcModuleMgrPtr& module_mgr,
                                     const RdcNotificationPtr& notif)
    : group_settings_(group_settings),    // GPU group / field group lookups
      cache_mgr_(cache_mgr),              // job statistics storage
      rdc_module_mgr_(module_mgr),        // source of the telemetry module
      notifications_(notif),              // event notification setup
      last_cleanup_time_(0) {
}
// Starts recording statistics for a job on a GPU group.
//
// Steps, in order: reject a job id that is already in the job watch table,
// expand the group into its JOB_FIELD_ID fields, register the job in the
// table, start the job in the cache manager, and finally start watching the
// job fields at `update_freq`.
//
// Returns RDC_ST_ALREADY_EXIST for a known job id, RDC_ST_NOT_FOUND when the
// group expands to no fields, otherwise the first non-OK status of the calls
// above or the status of rdc_field_watch.
//
// watch_mutex_ is only held around the individual table accesses;
// rdc_field_watch takes the same mutex itself.
rdc_status_t RdcWatchTableImpl::rdc_job_start_stats(rdc_gpu_group_t group_id, const char job_id[64],
                                                    uint64_t update_freq,
                                                    const rdc_gpu_gauges_t& gpu_gauges) {
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    if (job_watch_table_.find(job_id) != job_watch_table_.end()) {
      return RDC_ST_ALREADY_EXIST;
    }
  }

  std::vector<RdcFieldKey> job_fields;
  rdc_status_t status = get_fields_from_group(group_id, JOB_FIELD_ID, job_fields);
  if (status != RDC_ST_OK) {
    return status;
  }
  if (job_fields.size() == 0) {
    RDC_LOG(RDC_ERROR, "Fail to start job " << job_id << ". The group " << group_id
                                            << " must contain at least one GPU.");
    return RDC_ST_NOT_FOUND;
  }

  JobWatchTableEntry job_entry{group_id, job_fields};
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    job_watch_table_.insert({job_id, job_entry});
  }

  rdc_field_group_info_t field_group_info;
  rdc_group_info_t gpu_group_info;
  status = group_settings_->rdc_group_gpu_get_info(group_id, &gpu_group_info);
  if (status != RDC_ST_OK) {
    return status;
  }
  status = group_settings_->rdc_group_field_get_info(JOB_FIELD_ID, &field_group_info);
  if (status != RDC_ST_OK) {
    return status;
  }

  status = cache_mgr_->rdc_job_start_stats(job_id, gpu_group_info, field_group_info, gpu_gauges);
  if (status != RDC_ST_OK) {
    return status;
  }

  // At last, when every thing sets up, starts to watch the fields.
  return rdc_field_watch(group_id, JOB_FIELD_ID, update_freq, 0, 0);
}
// Stops recording statistics for a job.
//
// Looks the job up in the job watch table, stops watching the JOB_FIELD_ID
// fields of its group, removes the job from the table and then stops the job
// in the cache manager.
//
// Returns RDC_ST_NOT_FOUND for an unknown job id, the status of
// rdc_field_unwatch when that fails (the job then stays in the table),
// otherwise the status of the cache manager call.
rdc_status_t RdcWatchTableImpl::rdc_job_stop_stats(const char job_id[64],
                                                   const rdc_gpu_gauges_t& gpu_gauge) {
  uint32_t job_group_id;
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    const auto found = job_watch_table_.find(job_id);
    if (found == job_watch_table_.end()) {
      return RDC_ST_NOT_FOUND;
    }
    job_group_id = found->second.group_id;
  }

  const rdc_status_t unwatch_status = rdc_field_unwatch(job_group_id, JOB_FIELD_ID);
  if (unwatch_status != RDC_ST_OK) {
    return unwatch_status;
  }

  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    job_watch_table_.erase(job_id);
  }

  return cache_mgr_->rdc_job_stop_stats(job_id, gpu_gauge);
}
// Removes a job: first stops its statistics with default-constructed gauges
// (the result of that call is not inspected), then removes the job from the
// cache manager and returns that status.
rdc_status_t RdcWatchTableImpl::rdc_job_remove(const char job_id[64]) {
  rdc_gpu_gauges_t default_gauges;
  static_cast<void>(rdc_job_stop_stats(job_id, default_gauges));

  const rdc_status_t remove_status = cache_mgr_->rdc_job_remove(job_id);
  return remove_status;
}
// Stops every job currently in the job watch table and then removes all jobs
// from the cache manager.
//
// The job ids are copied out under watch_mutex_ first, because
// rdc_job_stop_stats locks the same mutex itself. The status of the individual
// stop calls is not inspected; the return value is the status of the cache
// manager's rdc_job_remove_all.
rdc_status_t RdcWatchTableImpl::rdc_job_remove_all() {
  // Get all the job ids;
  std::vector<std::string> job_ids;
  {
    std::lock_guard<std::mutex> guard(watch_mutex_);
    for (const auto& entry : job_watch_table_) {
      job_ids.push_back(entry.first);
    }
  }

  // Stop them. The id is passed as-is: rdc_job_stop_stats takes a pointer to
  // const char, so no const_cast is needed.
  for (const auto& job_id : job_ids) {
    rdc_gpu_gauges_t gpu_gauge;
    rdc_job_stop_stats(job_id.c_str(), gpu_gauge);
  }

  return cache_mgr_->rdc_job_remove_all();
}
// Expands a (GPU group, field group) pair into individual field keys.
//
// Appends to `fields` one key per combination of an entity id of the GPU
// group and a field id of the field group, GPU-major.
//
// Returns the status of the failing group lookup, or RDC_ST_OK. `fields` is
// left untouched when a lookup fails.
rdc_status_t RdcWatchTableImpl::get_fields_from_group(rdc_gpu_group_t group_id,
                                                      rdc_field_grp_t field_group_id,
                                                      std::vector<RdcFieldKey>& fields) {
  rdc_field_group_info_t field_group;
  rdc_group_info_t gpu_group;

  rdc_status_t status = group_settings_->rdc_group_gpu_get_info(group_id, &gpu_group);
  if (status == RDC_ST_OK) {
    status = group_settings_->rdc_group_field_get_info(field_group_id, &field_group);
  }
  if (status != RDC_ST_OK) {
    return status;
  }

  for (uint32_t gpu = 0; gpu < gpu_group.count; ++gpu) {
    for (uint32_t fld = 0; fld < field_group.count; ++fld) {
      fields.push_back(RdcFieldKey({gpu_group.entity_ids[gpu], field_group.field_ids[fld]}));
    }
  }

  return RDC_ST_OK;
}
// Starts watching a field group on a GPU group.
//
// group_id / field_group_id: identify the watch; together they form the key
//                            in watch_table_.
// update_freq, max_keep_age, max_keep_samples: settings stored for the watch
//                            and merged into the per-field settings.
//
// Holds watch_mutex_ for the whole call.
//
// Returns RDC_ST_CONFLICT when the same watch is already active, the status of
// get_fields_from_group when expanding the groups fails, RDC_ST_OK otherwise.
// Failures of the notification setup and of the telemetry module calls do not
// change the return value.
rdc_status_t RdcWatchTableImpl::rdc_field_watch(rdc_gpu_group_t group_id,
                                                rdc_field_grp_t field_group_id,
                                                uint64_t update_freq, double max_keep_age,
                                                uint32_t max_keep_samples) {
  std::lock_guard<std::mutex> guard(watch_mutex_);
  RdcFieldGroupKey gkey({group_id, field_group_id});
  auto table_iter = watch_table_.find(gkey);

  // Already in the watch table
  if (table_iter != watch_table_.end()) {
    if (table_iter->second.is_watching) {
      return RDC_ST_CONFLICT;
    }
    // Present but not watching: delete to overwrite
    watch_table_.erase(table_iter);
  }

  // The field settings for this watch
  FieldSettings f;
  f.update_freq = update_freq;
  f.max_keep_age = max_keep_age;
  f.max_keep_samples = max_keep_samples;
  f.last_update_time = 0;
  f.is_watching = true;

  // Get individual fields for the watch
  std::vector<RdcFieldKey> fields_in_watch;
  rdc_status_t result = get_fields_from_group(group_id, field_group_id, fields_in_watch);
  if (result != RDC_ST_OK) {
    return result;
  }

  // See if any of the fields are notification fields, and
  // set them up, if so.
  result = notifications_->set_listen_events(fields_in_watch);
  if (result != RDC_ST_OK) {
    RDC_LOG(RDC_DEBUG, "Error in configuring for event notification. Return " << result);
  }

  // Skip not supported fields
  uint32_t unsupported_fields = 0;
  auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module();
  if (rdc_telemetry) {
    uint32_t field_ids[MAX_NUM_FIELDS];
    uint32_t field_count = 0;
    result = rdc_telemetry->rdc_telemetry_fields_query(field_ids, &field_count);
    if (result == RDC_ST_OK) {
      RDC_LOG(RDC_DEBUG, "The system support " << field_count << " fields");
      const uint32_t* const ids_end = field_ids + field_count;
      for (auto it = fields_in_watch.begin(); it != fields_in_watch.end();) {
        if (std::find(field_ids, ids_end, it->second) != ids_end) {
          ++it;
          continue;
        }
        // Not reported by the telemetry module: drop it from this watch.
        // Notification events are dropped too but not counted as unsupported.
        if (!notifications_->is_notification_event(it->second)) {
          unsupported_fields++;
        }
        it = fields_in_watch.erase(it);
      }
    }
  }
  if (unsupported_fields > 0) {
    RDC_LOG(RDC_DEBUG, "Skip watch " << unsupported_fields << " fields as they are not supported.");
  }

  // Update the fields_to_watch_
  for (const auto& field_key : fields_in_watch) {
    auto ite = fields_to_watch_.find(field_key);
    if (ite == fields_to_watch_.end()) {  // A new field
      fields_to_watch_.insert({field_key, f});
      continue;
    }

    // Merge the settings
    auto& f_in_table = ite->second;
    f_in_table.max_keep_age = std::max(f_in_table.max_keep_age, max_keep_age);
    f_in_table.max_keep_samples = std::max(f_in_table.max_keep_samples, max_keep_samples);
    if (f_in_table.is_watching) {  // Already watching
      f_in_table.update_freq = std::min(f_in_table.update_freq, update_freq);
    } else {  // Not watching before
      f_in_table.is_watching = true;
      f_in_table.update_freq = update_freq;
    }
  }

  // Add to the watch table
  watch_table_.insert({gkey, f});

  // Notify the telemetry_module to watch those fields
  if (rdc_telemetry) {
    std::vector<rdc_gpu_field_t> fields;
    for (const auto& watched : fields_to_watch_) {
      if (watched.second.is_watching) {
        fields.push_back({watched.first.first, watched.first.second});
      }
    }
    // Nothing to hand over when no field is being watched; this also avoids
    // taking the address of element 0 of an empty vector.
    if (!fields.empty()) {
      rdc_telemetry->rdc_telemetry_fields_watch(fields.data(), fields.size());
    }
  }

  return RDC_ST_OK;
}
/**
 * Refresh fields_to_watch_ after the watch_table_ entry `entry` has been unwatched.
 *
 * The fields of `entry` are expanded, the smallest update frequency still requested by any
 * watching entry of watch_table_ is computed per field, and each affected field either gets
 * that frequency or is marked as not watched.  Fields that end up unwatched are handed to the
 * telemetry module.  rdc_field_unwatch() calls this while holding watch_mutex_.
 *
 * @param entry  (group id, field group id) key of the entry being unwatched
 * @return RDC_ST_OK, or the first non-OK status returned by get_fields_from_group()
 */
rdc_status_t RdcWatchTableImpl::update_field_in_table_when_unwatch(const RdcFieldGroupKey& entry) {
  // Get individual fields for this unwatch
  std::vector<RdcFieldKey> fields;
  rdc_status_t result = get_fields_from_group(entry.first, entry.second, fields);
  if (result != RDC_ST_OK) {
    return result;
  }

  // Unwatch only impacts update_freq, not max_keep_age / max_keep_samples.
  // Walk watch_table_ to collect the new (minimum) update frequency per field.
  std::map<RdcFieldKey, uint64_t> update_frequencies;
  for (const auto& watch : watch_table_) {
    // Skip entries that are not in watching status
    if (!watch.second.is_watching) {
      continue;
    }

    // Get all fields in the current entry
    std::vector<RdcFieldKey> watch_fields;
    result = get_fields_from_group(watch.first.first, watch.first.second, watch_fields);
    if (result != RDC_ST_OK) {
      return result;
    }

    // Keep the smallest update_freq seen for each field
    for (const auto& watched_field : watch_fields) {
      auto freq_entry = update_frequencies.find(watched_field);
      if (freq_entry == update_frequencies.end()) {
        update_frequencies.insert({watched_field, watch.second.update_freq});
      } else {
        freq_entry->second = std::min(freq_entry->second, watch.second.update_freq);
      }
    }
  }

  // Update the fields that are impacted by this unwatch
  std::vector<rdc_gpu_field_t> unwatch_fields;
  for (const auto& field : fields) {
    // Turn off any notification fields
    if (notifications_->is_notification_event(field.second)) {
      notifications_->stop_listening(field.first);
      continue;
    }

    auto f_in_table = fields_to_watch_.find(field);
    if (f_in_table == fields_to_watch_.end()) {  // Not in fields_to_watch_
      unwatch_fields.push_back({field.first, field.second});
      continue;
    }

    auto freq_iter = update_frequencies.find(field);
    if (freq_iter == update_frequencies.end()) {
      // No watching entry references this field any more
      f_in_table->second.is_watching = false;
      unwatch_fields.push_back({field.first, field.second});
    } else {
      f_in_table->second.update_freq = freq_iter->second;
    }
  }

  // Notify the telemetry_module to unwatch those fields.
  // data() is used instead of &v[0] because the vector may be empty.
  auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module();
  if (rdc_telemetry) {
    rdc_telemetry->rdc_telemetry_fields_unwatch(unwatch_fields.data(), unwatch_fields.size());
  }

  return RDC_ST_OK;
}
/**
 * Stop watching a (GPU group, field group) pair.
 *
 * The matching watch_table_ entry is flagged as not watching and stamped with the current
 * wall-clock time in milliseconds; fields_to_watch_ is then refreshed.
 *
 * @param group_id        GPU group of the watch to stop
 * @param field_group_id  field group of the watch to stop
 * @return RDC_ST_NOT_FOUND if the pair is not in watch_table_, otherwise the status of
 *         update_field_in_table_when_unwatch()
 */
rdc_status_t RdcWatchTableImpl::rdc_field_unwatch(rdc_gpu_group_t group_id,
                                                  rdc_field_grp_t field_group_id) {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  const uint64_t now = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;

  std::lock_guard<std::mutex> guard(watch_mutex_);

  // Set is_watching = false
  auto ite = watch_table_.find(RdcFieldGroupKey({group_id, field_group_id}));
  if (ite == watch_table_.end()) {
    return RDC_ST_NOT_FOUND;
  }
  ite->second.is_watching = false;
  ite->second.last_update_time = now;

  // Update the fields_to_watch_
  return update_field_in_table_when_unwatch(ite->first);
}
bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index,
rdc_field_t field_id, std::string& job_id) const {
RdcFieldKey key{gpu_index, field_id};
bool RdcWatchTableImpl::is_job_watch_field(uint32_t gpu_index, rdc_field_t field_id,
std::string& job_id) const {
RdcFieldKey key{gpu_index, field_id};
for (auto ite = job_watch_table_.begin();
ite != job_watch_table_.end(); ite++) {
auto& fields = ite->second.fields;
if (std::find(fields.begin(), fields.end(), key) != fields.end()) {
job_id = ite->first;
return true;
}
for (auto ite = job_watch_table_.begin(); ite != job_watch_table_.end(); ite++) {
auto& fields = ite->second.fields;
if (std::find(fields.begin(), fields.end(), key) != fields.end()) {
job_id = ite->first;
return true;
}
}
return false;
return false;
}
/**
 * Callback that stores freshly fetched field values.
 *
 * rdc_field_update_all() passes this function, with `this` as user_data, to the telemetry
 * module's rdc_telemetry_fields_value_get().
 *
 * @param values      array of fetched values
 * @param num_values  number of elements in `values`
 * @param user_data   pointer to the owning RdcWatchTableImpl
 * @return RDC_ST_BAD_PARAMETER when `values` or `user_data` is null, otherwise RDC_ST_OK
 */
rdc_status_t RdcWatchTableImpl::handle_fields(rdc_gpu_field_value_t* values, uint32_t num_values,
                                              void* user_data) {
  if (values == nullptr || user_data == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  RdcWatchTableImpl* watchTable = static_cast<RdcWatchTableImpl*>(user_data);

  for (uint32_t i = 0; i < num_values; i++) {
    const auto gpu_index = values[i].gpu_index;
    const auto field_id = values[i].field_value.field_id;

    // Always update the timestamp, even when the fetch itself failed
    auto ite = watchTable->fields_to_watch_.find({gpu_index, field_id});
    if (ite != watchTable->fields_to_watch_.end()) {
      struct timeval tv;
      gettimeofday(&tv, nullptr);
      const uint64_t now = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
      ite->second.last_update_time = now;
    }

    // Only cache valid results
    if (values[i].field_value.status != RDC_ST_OK) {
      continue;
    }

    // Update the cache
    watchTable->cache_mgr_->rdc_update_cache(gpu_index, values[i].field_value);

    // Update the job stats cache
    std::string job_id;
    if (watchTable->is_job_watch_field(gpu_index, field_id, job_id)) {
      watchTable->cache_mgr_->rdc_update_job_stats(gpu_index, job_id, values[i].field_value);
    }
  }

  return RDC_ST_OK;
}
/**
 * Fetch every watched field that is due for an update and run periodic clean-up.
 *
 * Due fields are collected under watch_mutex_ and fetched in one bulk call to the telemetry
 * module, which reports the results through handle_fields().  clean_up() runs at most once
 * per second.
 *
 * @return always RDC_ST_OK; a missing telemetry module is only logged
 */
rdc_status_t RdcWatchTableImpl::rdc_field_update_all() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  const uint64_t now = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;

  // Collect all fields that need to be updated for the bulk fetch
  std::vector<rdc_gpu_field_t> fields;
  std::lock_guard<std::mutex> guard(watch_mutex_);
  for (const auto& field : fields_to_watch_) {
    // Does this field need to be updated?
    // NOTE(review): update_freq / 1000 presumably converts microseconds to the millisecond
    // unit of `now` -- confirm against the API documentation.
    const uint64_t track_freq = field.second.update_freq / 1000;
    if (!field.second.is_watching || field.second.last_update_time + track_freq > now) {
      continue;
    }
    fields.push_back({field.first.first, field.first.second});
  }

  if (!fields.empty()) {
    auto rdc_telemetry = rdc_module_mgr_->get_telemetry_module();
    if (rdc_telemetry) {
      rdc_telemetry->rdc_telemetry_fields_value_get(fields.data(), fields.size(),
                                                    RdcWatchTableImpl::handle_fields, this);
    } else {
      RDC_LOG(RDC_ERROR, "RdcWatchTableImpl: Fail to get the telemetry module");
    }
  }

  // Clean up is expensive, only do it once per second
  if (now - last_cleanup_time_ > 1000) {
    clean_up();
    last_cleanup_time_ = now;
  }

  return RDC_ST_OK;
}
rdc_status_t
RdcWatchTableImpl::rdc_notif_update_cache(
rdc_evnt_notification_t *events, uint32_t num_events) {
rdc_status_t RdcWatchTableImpl::rdc_notif_update_cache(rdc_evnt_notification_t* events,
uint32_t num_events) {
if (events == nullptr || num_events == 0) {
return RDC_ST_BAD_PARAMETER;
}
@@ -543,83 +511,81 @@ rdc_status_t RdcWatchTableImpl::rdc_field_listen_notif(uint32_t timeout_ms) {
}
/**
 * Evict stale cache samples and drop expired, no-longer-watched table entries.
 *
 * An entry is expired when it is not being watched and
 * last_update_time + max_keep_age * 1000 lies before the current time in milliseconds.
 * NOTE(review): the factor 1000 suggests max_keep_age is in seconds -- confirm.
 * debug_status() is invoked whenever the current second is a multiple of 30.
 */
void RdcWatchTableImpl::clean_up() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  const uint64_t now = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;

  // Clean the cache and the fields_to_watch_ table
  auto fite = fields_to_watch_.begin();
  while (fite != fields_to_watch_.end()) {
    cache_mgr_->evict_cache(fite->first.first, fite->first.second, fite->second.max_keep_samples,
                            fite->second.max_keep_age);
    const bool expired = !fite->second.is_watching &&
                         fite->second.last_update_time + fite->second.max_keep_age * 1000 < now;
    if (expired) {
      fite = fields_to_watch_.erase(fite);  // erase() yields the next valid iterator
    } else {
      ++fite;
    }
  }

  // Clean the watch table
  auto wite = watch_table_.begin();
  while (wite != watch_table_.end()) {
    const bool expired = !wite->second.is_watching &&
                         wite->second.last_update_time + wite->second.max_keep_age * 1000 < now;
    if (expired) {
      wite = watch_table_.erase(wite);
    } else {
      ++wite;
    }
  }

  // Debug log every 30 seconds
  if (now / 1000 % 30 == 0) {
    debug_status();
  }
}
/**
 * Write the sizes and contents of watch_table_, job_watch_table_ and fields_to_watch_,
 * plus the cache statistics, to the log at RDC_DEBUG level.
 */
void RdcWatchTableImpl::debug_status() {
  RDC_LOG(RDC_DEBUG, "fields_to_watch_:" << fields_to_watch_.size()
                                         << " watch_table_:" << watch_table_.size()
                                         << " job_watch_table_:" << job_watch_table_.size()
                                         << " cache stats:" << cache_mgr_->get_cache_stats());

  if (watch_table_.size() > 0) {
    RDC_LOG(RDC_DEBUG, "watch table details:");
  }
  for (const auto& watch : watch_table_) {
    RDC_LOG(RDC_DEBUG, watch.first.first << "," << watch.first.second
                                         << ": age:" << watch.second.max_keep_age
                                         << ", samples:" << watch.second.max_keep_samples
                                         << ", is_watching:" << watch.second.is_watching
                                         << ", last_update_time:" << watch.second.last_update_time
                                         << ", update_freq:" << watch.second.update_freq);
  }

  if (job_watch_table_.size() > 0) {
    RDC_LOG(RDC_DEBUG, "job watch table details: ");
  }
  for (const auto& job : job_watch_table_) {
    // Render the job's field list as "<gpu,field> <gpu,field> ..."
    std::stringstream strstream;
    for (const auto& p : job.second.fields) {
      strstream << "<" << p.first << "," << p.second << "> ";
    }
    RDC_LOG(RDC_DEBUG,
            job.first << ": " << job.second.group_id << " fields : " << strstream.str());
  }

  if (fields_to_watch_.size() > 0) {
    RDC_LOG(RDC_DEBUG, "fields to watch details:");
  }
  for (const auto& field : fields_to_watch_) {
    RDC_LOG(RDC_DEBUG, field.first.first << "," << field.first.second
                                         << ": age:" << field.second.max_keep_age
                                         << ", samples:" << field.second.max_keep_samples
                                         << ", is_watching:" << field.second.is_watching
                                         << ", last_update_time:" << field.second.last_update_time
                                         << ", update_freq:" << field.second.update_freq);
  }
}
} // namespace rdc
+1 -2
查看文件
@@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rocm_smi/rocm_smi.h"
#include "rdc/rdc.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
@@ -70,4 +70,3 @@ rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi) {
} // namespace rdc
} // namespace amd
檔案差異因為檔案過大而無法顯示 載入差異
+105 -117
查看文件
@@ -20,164 +20,152 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
#include <rocmtools.h>
#include <cassert>
#include <chrono>
#include <cstring>
#include <vector>
#include "hsa.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
namespace amd {
namespace rdc {
/**
 * Initialise the HSA runtime and rocmtools.
 *
 * @throws std::runtime_error when hsa_init() does not return HSA_STATUS_SUCCESS
 */
RdcRocpBase::RdcRocpBase() {
  hsa_status_t err = hsa_init();
  if (err != HSA_STATUS_SUCCESS) {
    const char* errstr = nullptr;
    hsa_status_string(err, &errstr);
    // errstr stays null if hsa_status_string() fails; appending a null char* to a
    // std::string is undefined behaviour, so fall back to an empty description.
    throw std::runtime_error("hsa error code: " + std::to_string(err) + " " +
                             (errstr != nullptr ? errstr : ""));
  }

  auto status = rocmtools_initialize();
  RDC_LOG(RDC_INFO, "rocmtools_initialize status: " << status);
}
/**
 * Destroy all profiling sessions, finalise rocmtools and shut the HSA runtime down.
 * Failures are logged (or asserted in debug builds); nothing is thrown.
 */
RdcRocpBase::~RdcRocpBase() {
  // destroy_session() erases from `sessions`, so iterating the container directly would
  // invalidate the loop iterator.  Snapshot the keys first.
  std::vector<pair_gpu_field_t> session_keys;
  session_keys.reserve(sessions.size());
  for (const auto& session : sessions) {
    session_keys.push_back(session.first);
  }
  for (const auto& key : session_keys) {
    const rdc_status_t status = destroy_session(key);
    assert(status == RDC_ST_OK);
    (void)status;  // silence unused-variable warnings when NDEBUG disables assert
  }
  sessions.clear();

  auto status = rocmtools_finalize();
  RDC_LOG(RDC_INFO, "rocmtools_finalize status: " << status);

  hsa_status_t err = hsa_shut_down();
  if (err != HSA_STATUS_SUCCESS) {
    const char* errstr = nullptr;
    hsa_status_string(err, &errstr);
    // cannot throw an error here. print instead
    // (errstr may still be null if hsa_status_string() failed)
    RDC_LOG(RDC_ERROR, "hsa error code: " + std::to_string(err) + " " +
                           (errstr != nullptr ? errstr : ""));
  }
}
/**
 * Poll the profiling session of a (GPU, field) pair and return its current value.
 *
 * For the GFLOPS and memory-bandwidth fields the raw counter is divided by the number of
 * nanoseconds elapsed since the session's start_time; all other fields return the raw
 * counter value.
 *
 * @param gpu_field  (GPU index, field id) key of the session
 * @param value      [out] receives the metric; untouched on failure
 * @return RDC_ST_NOT_FOUND when no session exists for `gpu_field`, RDC_ST_BAD_PARAMETER when
 *         `value` is null, otherwise the translated rocmtools poll status
 */
rdc_status_t RdcRocpBase::rocp_lookup(pair_gpu_field_t gpu_field, double* value) {
  if (sessions.empty()) {
    return RDC_ST_NOT_FOUND;
  }

  if (value == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // find() instead of at(): a missing session is reported as a status, not as an
  // std::out_of_range exception.
  const auto session_it = sessions.find(gpu_field);
  if (session_it == sessions.end()) {
    return RDC_ST_NOT_FOUND;
  }
  const session_info_t& session = session_it->second;

  rocmtools_device_profile_metric_t counter;
  const rocmtools_status_t status = rocmtools_device_profiling_session_poll(session.id, &counter);
  const auto stop_time = std::chrono::high_resolution_clock::now();
  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    return Rocp2RdcError(status);
  }
  const auto elapsed =
      std::chrono::duration_cast<std::chrono::nanoseconds>(stop_time - session.start_time)
          .count();

  // some metrics are derived from others and depend on time passed
  switch (gpu_field.second) {
    case RDC_FI_PROF_GFLOPS_16:
    case RDC_FI_PROF_GFLOPS_32:
    case RDC_FI_PROF_GFLOPS_64:
    case RDC_FI_PROF_MEMR_BW_KBPNS:
    case RDC_FI_PROF_MEMW_BW_KBPNS:
      *value = counter.value.value / elapsed;
      break;
    default:
      *value = counter.value.value;
      break;
  }

  return Rocp2RdcError(status);
}
/**
 * Create and start a rocmtools device profiling session for a (GPU, field) pair.
 *
 * The session is stored in `sessions` together with its start time before it is started.
 * The counter name is looked up with counter_map_k.at(), so `gpu_field.second` must be a
 * key of counter_map_k.
 *
 * @param gpu_field  (GPU index, field id) to profile
 * @return RDC_ST_ALREADY_EXIST when a session for the pair is already registered, otherwise
 *         the translated status of the rocmtools create/start calls
 */
rdc_status_t RdcRocpBase::create_session(pair_gpu_field_t gpu_field) {
  if (sessions.count(gpu_field) != 0) {
    RDC_LOG(RDC_DEBUG, "Session for field (" << gpu_field.second << ") on GPU [" << gpu_field.first
                                             << "] already exists!");
    return RDC_ST_ALREADY_EXIST;
  }

  session_info_t session = {};

  std::vector<const char*> rocmtools_fields = {counter_map_k.at(gpu_field.second)};

  // create session
  rocmtools_status_t status = rocmtools_device_profiling_session_create(
      rocmtools_fields.data(), rocmtools_fields.size(), &session.id, 0, gpu_field.first);
  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    return Rocp2RdcError(status);
  }

  // add start time
  session.start_time = std::chrono::high_resolution_clock::now();
  sessions.emplace(gpu_field, session);

  // start session
  status = rocmtools_device_profiling_session_start(session.id);

  return Rocp2RdcError(status);
}
/**
 * Destroy the profiling session of a (GPU, field) pair, if one exists.
 *
 * The entry is removed from `sessions` only when rocmtools reports success.
 *
 * @param gpu_field  (GPU index, field id) key of the session
 * @return RDC_ST_OK when there is nothing to destroy, otherwise the translated status of
 *         rocmtools_device_profiling_session_destroy()
 */
rdc_status_t RdcRocpBase::destroy_session(pair_gpu_field_t gpu_field) {
  if (sessions.empty()) {
    RDC_LOG(RDC_DEBUG, "Cannot destroy empty session...");
    return RDC_ST_OK;
  }

  // no session with field
  const auto session_it = sessions.find(gpu_field);
  if (session_it == sessions.end()) {
    RDC_LOG(RDC_DEBUG, "Cannot destroy session with field (" << gpu_field.second << ") on GPU ["
                                                             << gpu_field.first
                                                             << "] because it doesn't exist...");
    return RDC_ST_OK;
  }

  const rocmtools_session_id_t session_id = session_it->second.id;
  const rocmtools_status_t status = rocmtools_device_profiling_session_destroy(session_id);
  if (status == ROCMTOOLS_STATUS_SUCCESS) {
    const auto num_of_destroyed_sessions = sessions.erase(gpu_field);
    RDC_LOG(RDC_DEBUG, "destroyed (" << num_of_destroyed_sessions << ") sessions");
  }
  return Rocp2RdcError(status);
}
/**
 * Translate a rocmtools status code into the corresponding RDC status code.
 *
 * @param rocm_status  status returned by a rocmtools call
 * @return RDC_ST_OK for success, RDC_ST_ALREADY_EXIST for an active-session error,
 *         RDC_ST_BAD_PARAMETER for session filter / not-found errors, and
 *         RDC_ST_UNKNOWN_ERROR for every other value
 */
rdc_status_t RdcRocpBase::Rocp2RdcError(rocmtools_status_t rocm_status) {
  switch (rocm_status) {
    case ROCMTOOLS_STATUS_SUCCESS:
      return RDC_ST_OK;
    case ROCMTOOLS_STATUS_ERROR_HAS_ACTIVE_SESSION:
      return RDC_ST_ALREADY_EXIST;
    case ROCMTOOLS_STATUS_ERROR_SESSION_FILTER_DATA_MISMATCH:
    case ROCMTOOLS_STATUS_ERROR_SESSION_MISSING_FILTER:
    case ROCMTOOLS_STATUS_ERROR_SESSION_NOT_FOUND:
      return RDC_ST_BAD_PARAMETER;
    default:
      return RDC_ST_UNKNOWN_ERROR;
  }
}
} // namespace rdc
+75 -85
查看文件
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#include <sys/time.h>
#include <cstring>
#include <map>
#include <memory>
@@ -37,104 +38,93 @@ amd::rdc::RdcRocpBase rocp;
// get supported field ids
// TODO: Query fields with rocprofiler
/**
 * Report the field ids supported by this module, i.e. the keys of amd::rdc::counter_map_k.
 *
 * @param field_ids    [out] array with room for MAX_NUM_FIELDS ids
 * @param field_count  [out] number of ids written to `field_ids`
 * @return RDC_ST_BAD_PARAMETER when either pointer is null, otherwise RDC_ST_OK
 */
rdc_status_t rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FIELDS], uint32_t* field_count) {
  if (field_ids == nullptr || field_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Copy the keys straight into the caller's array, never writing past its capacity.
  uint32_t written = 0;
  for (const auto& counter : amd::rdc::counter_map_k) {
    if (written >= static_cast<uint32_t>(MAX_NUM_FIELDS)) {
      break;
    }
    field_ids[written++] = counter.first;
  }
  *field_count = written;

  return RDC_ST_OK;
}
/**
 * Fetch the current value of each requested field and deliver the results to `callback`
 * in batches of at most 16 values.
 *
 * Every value carries the status of its own lookup, so failed lookups are still delivered.
 *
 * @param fields        fields to fetch
 * @param fields_count  number of elements in `fields`
 * @param callback      receives each batch of values together with `user_data`
 * @param user_data     opaque pointer forwarded to `callback`
 * @return RDC_ST_BAD_PARAMETER for a null `callback` or a null `fields` with a non-zero
 *         count; the callback's status as soon as it reports an error; otherwise the status
 *         of the last field lookup (RDC_ST_UNKNOWN_ERROR when `fields_count` is 0).
 *         NOTE(review): returning the last lookup status rather than RDC_ST_OK is kept from
 *         the original code -- confirm that this is intended.
 */
rdc_status_t rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count,
                                            rdc_field_value_f callback, void* user_data) {
  if (callback == nullptr || (fields == nullptr && fields_count != 0)) {
    return RDC_ST_BAD_PARAMETER;
  }

  struct timeval tv {};
  gettimeofday(&tv, nullptr);
  const uint64_t curTime = static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;

  // Fetch the fields one by one, flushing to the callback whenever the batch is full
  constexpr uint32_t BULK_FIELDS_MAX = 16;
  rdc_gpu_field_value_t values[BULK_FIELDS_MAX];
  uint32_t bulk_count = 0;
  rdc_status_t status = RDC_ST_UNKNOWN_ERROR;
  double value = 0;

  for (uint32_t i = 0; i < fields_count; i++) {
    if (bulk_count >= BULK_FIELDS_MAX) {
      status = callback(values, bulk_count, user_data);
      // When the callback returns errors, stop processing and return.
      if (status != RDC_ST_OK) {
        return status;
      }
      bulk_count = 0;
    }

    status = rocp.rocp_lookup(std::make_pair(fields[i].gpu_index, fields[i].field_id), &value);

    // get value
    values[bulk_count].gpu_index = fields[i].gpu_index;
    values[bulk_count].field_value.type = DOUBLE;
    values[bulk_count].field_value.status = status;
    values[bulk_count].field_value.ts = curTime;
    values[bulk_count].field_value.value.dbl = value;
    values[bulk_count].field_value.field_id = fields[i].field_id;
    bulk_count++;
  }

  // Deliver the final, partially filled batch
  if (bulk_count != 0) {
    const rdc_status_t flush_status = callback(values, bulk_count, user_data);
    if (flush_status != RDC_ST_OK) {
      return flush_status;
    }
  }

  return status;
}
/**
 * Create a profiling session for every listed field.
 *
 * All fields are attempted even if one of them fails.
 *
 * @param fields        fields to start watching
 * @param fields_count  number of elements in `fields`
 * @return RDC_ST_OK when every session was created, otherwise the last non-OK status
 */
rdc_status_t rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint32_t fields_count) {
  rdc_status_t status = RDC_ST_OK;
  for (uint32_t i = 0; i < fields_count; i++) {
    RDC_LOG(RDC_DEBUG, "WATCH: " << fields[i].field_id);
    const rdc_status_t temp_status =
        rocp.create_session(std::make_pair(fields[i].gpu_index, fields[i].field_id));
    // remember the last non-ok status
    if (temp_status != RDC_ST_OK) {
      status = temp_status;
    }
  }
  return status;
}
/**
 * Destroy the profiling session of every listed field.
 *
 * All fields are attempted even if one of them fails.
 *
 * @param fields        fields to stop watching
 * @param fields_count  number of elements in `fields`
 * @return RDC_ST_OK when every session was destroyed, otherwise the last non-OK status
 */
rdc_status_t rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, uint32_t fields_count) {
  rdc_status_t status = RDC_ST_OK;
  for (uint32_t i = 0; i < fields_count; i++) {
    RDC_LOG(RDC_DEBUG, "UNWATCH: " << fields[i].field_id);
    const rdc_status_t temp_status =
        rocp.destroy_session(std::make_pair(fields[i].gpu_index, fields[i].field_id));
    // return last non-ok status
    if (temp_status != RDC_ST_OK) {
      status = temp_status;
    }
  }
  return status;
}
+65 -86
查看文件
@@ -19,30 +19,32 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocr/ComputeQueueTest.h"
#include <assert.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <string>
#include <climits>
#include <algorithm>
#include <climits>
#include <iostream>
#include <vector>
#include <memory>
#include "rdc_modules/rdc_rocr/common.h"
#include "rdc_modules/rdc_rocr/ComputeQueueTest.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include <string>
#include <vector>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include "rdc_modules/rdc_rocr/common.h"
namespace amd {
namespace rdc {
static const uint32_t kNumBufferElements = 256;
ComputeQueueTest::ComputeQueueTest(uint32_t gpu_index): TestBase(gpu_index) {
ComputeQueueTest::ComputeQueueTest(uint32_t gpu_index) : TestBase(gpu_index) {
set_num_iteration(10); // Number of iterations to execute of the main test;
// This is a default value which can be overridden
// on the command line.
@@ -50,8 +52,7 @@ ComputeQueueTest::ComputeQueueTest(uint32_t gpu_index): TestBase(gpu_index) {
set_description("This test will run binary search compute task via AQL.");
}
// Destructor: intentionally empty, there is nothing to release here.
ComputeQueueTest::~ComputeQueueTest(void) {}
// Any 1-time setup involving member variables used in the rest of the test
// should be done here.
@@ -61,7 +62,7 @@ hsa_status_t ComputeQueueTest::SetUp(void) {
TestBase::SetUp();
err = SetDefaultAgents(this);
if ( err != HSA_STATUS_SUCCESS) return err;
if (err != HSA_STATUS_SUCCESS) return err;
err = SetPoolsTypical(this);
return err;
@@ -77,9 +78,7 @@ void ComputeQueueTest::Run(void) {
TestBase::Run();
}
// Forwards to TestBase::DisplayTestInfo(); this test adds nothing of its own.
void ComputeQueueTest::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); }
void ComputeQueueTest::DisplayResults(void) const {
// Compare required profile for this test case with what we're actually
@@ -112,8 +111,8 @@ void ComputeQueueTest::InitializeBinarySearch(BinarySearch* bs) {
// This function shows how to do an asynchronous copy. We have to create a
// signal and use the signal to notify us when the copy has completed.
hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src,
size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src, size_t size,
hsa_agent_t dst_ag, hsa_agent_t src_ag) {
hsa_signal_t s;
hsa_status_t err;
@@ -123,8 +122,8 @@ hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src,
err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
throw_if_error(err);
if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_BLOCKED) != 0) {
err = HSA_STATUS_ERROR;
RDC_LOG(RDC_ERROR, "Async copy signal error");
@@ -141,22 +140,19 @@ hsa_status_t ComputeQueueTest::AgentMemcpy(void* dst, const void* src,
hsa_status_t ComputeQueueTest::FindPools(BinarySearch* bs) {
hsa_status_t err;
err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindStandardPool,
&bs->cpu_pool);
err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindStandardPool, &bs->cpu_pool);
if (err != HSA_STATUS_INFO_BREAK) {
return HSA_STATUS_ERROR;
}
err = hsa_amd_agent_iterate_memory_pools(bs->gpu_dev, FindStandardPool,
&bs->gpu_pool);
err = hsa_amd_agent_iterate_memory_pools(bs->gpu_dev, FindStandardPool, &bs->gpu_pool);
if (err != HSA_STATUS_INFO_BREAK) {
return HSA_STATUS_ERROR;
}
err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev,
FindKernArgPool, &bs->kern_arg_pool);
err = hsa_amd_agent_iterate_memory_pools(bs->cpu_dev, FindKernArgPool, &bs->kern_arg_pool);
if (err != HSA_STATUS_INFO_BREAK) {
return HSA_STATUS_ERROR;
@@ -203,7 +199,7 @@ hsa_status_t ComputeQueueTest::AllocateAndInitBuffers(BinarySearch* bs) {
(void)memset(bs->input, 0, in_length);
err = hsa_amd_memory_pool_allocate(bs->cpu_pool, in_length, 0,
reinterpret_cast<void**>(&bs->input_arr_local));
reinterpret_cast<void**>(&bs->input_arr_local));
throw_if_error(err);
err = hsa_amd_agents_allow_access(2, ag_list, NULL, bs->input_arr_local);
throw_if_error(err);
@@ -218,7 +214,7 @@ hsa_status_t ComputeQueueTest::AllocateAndInitBuffers(BinarySearch* bs) {
for (uint32_t i = 1; i < bs->length; ++i) {
bs->input[i] = bs->input[i - 1] +
static_cast<uint32_t>(max * rand_r(&seed) / static_cast<float>(RAND_MAX));
static_cast<uint32_t>(max * rand_r(&seed) / static_cast<float>(RAND_MAX));
}
return err;
@@ -238,11 +234,10 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) {
err = hsa_agent_get_info(bs->gpu_dev, HSA_AGENT_INFO_NAME, agent_name);
throw_if_error(err);
std::string kernel_file = search_hsaco_full_path(
bs->kernel_file_name.c_str(), agent_name);
std::string kernel_file = search_hsaco_full_path(bs->kernel_file_name.c_str(), agent_name);
if (kernel_file == "") {
RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() <<
" at line " << __LINE__ << ", errno: " << errno);
RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() << " at line " << __LINE__
<< ", errno: " << errno);
std::string msg("fail to open ");
msg += bs->kernel_file_name;
throw_if_skip(msg);
@@ -251,8 +246,8 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) {
hsa_file_t file_handle = open(kernel_file.c_str(), O_RDONLY);
if (file_handle == -1) {
RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() <<
" at line " << __LINE__ << ", errno: " << errno);
RDC_LOG(RDC_ERROR, "failed to open " << bs->kernel_file_name.c_str() << " at line " << __LINE__
<< ", errno: " << errno);
return HSA_STATUS_ERROR;
}
@@ -260,46 +255,40 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) {
throw_if_error(err);
close(file_handle);
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &executable);
err = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
throw_if_error(err);
err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev,
code_obj_rdr, NULL, NULL);
err = hsa_executable_load_agent_code_object(executable, bs->gpu_dev, code_obj_rdr, NULL, NULL);
throw_if_error(err);
err = hsa_executable_freeze(executable, NULL);
throw_if_error(err);
hsa_executable_symbol_t kern_sym;
err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(),
bs->gpu_dev, 0, &kern_sym);
err = hsa_executable_get_symbol(executable, NULL, bs->kernel_name.c_str(), bs->gpu_dev, 0,
&kern_sym);
throw_if_error(err);
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&bs->kernel_object);
err = hsa_executable_symbol_get_info(kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&bs->kernel_object);
throw_if_error(err);
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&bs->private_segment_size);
err = hsa_executable_symbol_get_info(
kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &bs->private_segment_size);
throw_if_error(err);
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&bs->group_segment_size);
err = hsa_executable_symbol_get_info(
kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &bs->group_segment_size);
throw_if_error(err);
// Remaining queries not supported on code object v3.
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&bs->kernarg_size);
err = hsa_executable_symbol_get_info(
kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &bs->kernarg_size);
throw_if_error(err);
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&bs->kernarg_align);
err = hsa_executable_symbol_get_info(
kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, &bs->kernarg_align);
throw_if_error(err);
assert(bs->kernarg_align >= 16 && "Reported kernarg size is too small.");
bs->kernarg_align = (bs->kernarg_align == 0) ? 16 : bs->kernarg_align;
@@ -310,7 +299,7 @@ hsa_status_t ComputeQueueTest::LoadKernelFromObjFile(BinarySearch* bs) {
// This function populates the AQL packet with the information
// we have collected and stored in the BinarySearch structure thus far.
void ComputeQueueTest::PopulateAQLPacket(BinarySearch const* bs,
hsa_kernel_dispatch_packet_t* aql) {
hsa_kernel_dispatch_packet_t* aql) {
aql->header = 0; // Dummy val. for now. Set this right before doorbell ring
aql->setup = 1;
aql->workgroup_size_x = bs->work_group_size;
@@ -326,8 +315,7 @@ void ComputeQueueTest::PopulateAQLPacket(BinarySearch const* bs,
aql->completion_signal = bs->signal;
}
void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql,
hsa_queue_t* q) {
void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aql, hsa_queue_t* q) {
void* queue_base = q->base_address;
const uint32_t queue_mask = q->size - 1;
uint64_t que_idx = hsa_queue_add_write_index_relaxed(q, 1);
@@ -335,8 +323,7 @@ void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aq
hsa_kernel_dispatch_packet_t* queue_aql_packet;
queue_aql_packet =
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))
[que_idx & queue_mask];
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))[que_idx & queue_mask];
queue_aql_packet->workgroup_size_x = in_aql->workgroup_size_x;
queue_aql_packet->workgroup_size_y = in_aql->workgroup_size_y;
@@ -351,11 +338,10 @@ void ComputeQueueTest::WriteAQLToQueue(hsa_kernel_dispatch_packet_t const* in_aq
queue_aql_packet->completion_signal = in_aql->completion_signal;
}
// This function allocates memory from the kern_arg pool we already found, and
// then sets the argument values needed by the kernel code.
hsa_status_t ComputeQueueTest::AllocAndSetKernArgs(BinarySearch* bs, void* args,
size_t arg_size, void** aql_buf_ptr) {
hsa_status_t ComputeQueueTest::AllocAndSetKernArgs(BinarySearch* bs, void* args, size_t arg_size,
void** aql_buf_ptr) {
void* kern_arg_buf = nullptr;
hsa_status_t err;
size_t buf_size;
@@ -448,11 +434,9 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
uint32_t global_lower_bound = 0;
uint32_t global_upper_bound = bs->length - 1;
uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) /
bs->num_sub_divisions;
uint32_t sub_div_size = (global_upper_bound - global_lower_bound + 1) / bs->num_sub_divisions;
if ((bs->input[0] > bs->find_me) ||
(bs->input[bs->length - 1] < bs->find_me)) {
if ((bs->input[0] > bs->find_me) || (bs->input[bs->length - 1] < bs->find_me)) {
bs->output[0] = 0;
bs->output[1] = bs->length - 1;
bs->output[2] = 0;
@@ -472,7 +456,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
typedef uint32_t uint4[4];
struct __attribute__((aligned(16))) local_args_t {
uint4* outputArray;
uint2* sortedArray;
uint2* sortedArray;
uint32_t findMe;
uint32_t pad;
uint64_t global_offset_x;
@@ -494,8 +478,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
local_args.completion_action = 0;
// Copy the kernel args structure into kernel arg memory
err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args),
&bs->kern_arg_address);
err = AllocAndSetKernArgs(bs, &local_args, sizeof(local_args), &bs->kern_arg_address);
throw_if_error(err);
// Populate an AQL packet with the info we've gathered
@@ -505,7 +488,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
uint32_t in_length = bs->num_sub_divisions * 2 * sizeof(uint32_t);
while ((sub_div_size > 1) && (bs->output[3] != 0)) {
for (uint32_t i = 0 ; i < bs->num_sub_divisions; i++) {
for (uint32_t i = 0; i < bs->num_sub_divisions; i++) {
int idx1 = i * sub_div_size;
int idx2 = ((i + 1) * sub_div_size) - 1;
bs->input_arr[2 * i] = bs->input[idx1];
@@ -513,9 +496,9 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
}
// Copy kernel parameter from system memory to local memory
err = AgentMemcpy(reinterpret_cast<uint8_t*>(bs->input_arr_local),
reinterpret_cast<uint8_t*>(bs->input_arr),
in_length, bs->gpu_dev, bs->cpu_dev);
err =
AgentMemcpy(reinterpret_cast<uint8_t*>(bs->input_arr_local),
reinterpret_cast<uint8_t*>(bs->input_arr), in_length, bs->gpu_dev, bs->cpu_dev);
throw_if_error(err);
@@ -535,10 +518,8 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
WriteAQLToQueue(&aql, bs->queue);
uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
aql_header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
aql_header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
// Set the packet's type, acquire and release fences. This should be done
// atomically after all the other fields have been set, using release
@@ -546,9 +527,9 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
// signal is activated.
void* q_base = bs->queue->base_address;
AtomicSetPacketHeader(aql_header, aql.setup,
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
(q_base))[que_idx & mask]);
AtomicSetPacketHeader(
aql_header, aql.setup,
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(q_base))[que_idx & mask]);
// Increment the write index and ring the doorbell to dispatch kernel.
hsa_queue_store_write_index_relaxed(bs->queue, (que_idx + 1));
@@ -563,9 +544,8 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
// the queue is less than 1. When the kernel associated with the queued AQL
// packet has completed execution, the signal value is automatically
// decremented by the packet processor.
hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal,
HSA_SIGNAL_CONDITION_LT, 1,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
hsa_signal_value_t value = hsa_signal_wait_scacquire(bs->signal, HSA_SIGNAL_CONDITION_LT, 1,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
// value should be 0, or we timed-out
if (value) {
@@ -579,8 +559,7 @@ hsa_status_t ComputeQueueTest::Run(BinarySearch* bs) {
// Binary search algorithm stuff...
global_lower_bound = bs->output[0] * sub_div_size;
global_upper_bound = global_lower_bound + sub_div_size - 1;
sub_div_size = (global_upper_bound - global_lower_bound + 1) /
bs->num_sub_divisions;
sub_div_size = (global_upper_bound - global_lower_bound + 1) / bs->num_sub_divisions;
}
uint32_t element_index = UINT_MAX;
@@ -655,8 +634,8 @@ hsa_status_t ComputeQueueTest::RunBinarySearchTest(void) {
err = hsa_signal_create(1, 0, NULL, &bs.signal);
throw_if_error(err, "Fail to create signal.");
err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
UINT32_MAX, UINT32_MAX, &bs.queue);
err = hsa_queue_create(bs.gpu_dev, 128, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX,
&bs.queue);
throw_if_error(err, "Fail to create queue.");
err = FindPools(&bs);
+104 -115
查看文件
@@ -20,34 +20,36 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocr/MemoryAccess.h"
#include <fcntl.h>
#include <algorithm>
#include <iostream>
#include <vector>
#include <memory>
#include <vector>
#include "rdc_modules/rdc_rocr/common.h"
#include "rdc_modules/rdc_rocr/MemoryAccess.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include "rdc_modules/rdc_rocr/common.h"
namespace amd {
namespace rdc {
// Builds the memory-access test for the GPU selected by `gpu_index` (passed
// straight to TestBase) and sets its default iteration count, title and
// description.
MemoryAccessTest::MemoryAccessTest(uint32_t gpu_index) : TestBase(gpu_index) {
  set_num_iteration(10);  // Number of iterations to execute of the main test;
                          // This is a default value which can be overridden
                          // on the command line.
  set_title("RocR Memory Access Tests");
  // Adjacent literals are concatenated verbatim, so every fragment that is
  // followed by another one must end with a space.
  set_description(
      "This series of tests check memory allocation "
      "on GPU and CPU, i.e. GPU access to system memory "
      "and CPU access to GPU memory.");
}
// Destructor: intentionally empty, there is nothing to release here.
MemoryAccessTest::~MemoryAccessTest(void) {}
// Any 1-time setup involving member variables used in the rest of the test
// should be done here.
@@ -74,9 +76,7 @@ void MemoryAccessTest::Run(void) {
TestBase::Run();
}
// Forwards to TestBase::DisplayTestInfo(); this test adds nothing of its own.
void MemoryAccessTest::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); }
void MemoryAccessTest::DisplayResults(void) const {
// Compare required profile for this test case with what we're actually
@@ -92,18 +92,17 @@ void MemoryAccessTest::Close() {
TestBase::Close();
}
// Kernel-argument record: three int pointers, with the whole struct aligned
// to 16 bytes. Also reachable through the `args` alias.
typedef struct __attribute__((aligned(16))) args_t {
  int* a;
  int* b;
  int* c;
} args;
// File-level pointer to the kernel-argument record; starts out null.
args* kernArgs = NULL;
static const char kSubTestSeparator[] = " **************************";
static void PrintMemorySubtestHeader(const char *header) {
static void PrintMemorySubtestHeader(const char* header) {
RDC_LOG(RDC_DEBUG, " *** Memory Subtest: " << header << " ***");
}
@@ -113,80 +112,64 @@ static const int kMemoryAllocSize = 8;
static const int kMemoryAllocSize = 1024;
#endif
// Test to check GPU can read & write to system memory
void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
hsa_agent_t gpuAgent) {
void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent) {
hsa_status_t err;
// Get Global Memory Pool on the gpuAgent to allocate gpu buffers
hsa_amd_memory_pool_t gpu_pool;
err = hsa_amd_agent_iterate_memory_pools(gpuAgent,
GetGlobalMemoryPool,
&gpu_pool);
err = hsa_amd_agent_iterate_memory_pools(gpuAgent, GetGlobalMemoryPool, &gpu_pool);
throw_if_error(err);
hsa_amd_memory_pool_access_t access;
hsa_amd_agent_memory_pool_get_info(cpuAgent, gpu_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&access);
hsa_amd_agent_memory_pool_get_info(cpuAgent, gpu_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&access);
if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
// hsa objects
hsa_queue_t *queue = NULL; // command queue
hsa_queue_t* queue = NULL; // command queue
hsa_signal_t signal = {0}; // completion signal
// get queue size
uint32_t queue_size = 0;
err = hsa_agent_get_info(gpuAgent,
HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
err = hsa_agent_get_info(gpuAgent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
throw_if_error(err);
// create queue
err = hsa_queue_create(gpuAgent,
queue_size, HSA_QUEUE_TYPE_MULTI,
NULL, NULL, 0, 0, &queue);
err = hsa_queue_create(gpuAgent, queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue);
throw_if_error(err);
// Get System Memory Pool on the cpuAgent to allocate host side buffers
hsa_amd_memory_pool_t global_pool;
err = hsa_amd_agent_iterate_memory_pools(cpuAgent,
GetGlobalMemoryPool,
&global_pool);
err = hsa_amd_agent_iterate_memory_pools(cpuAgent, GetGlobalMemoryPool, &global_pool);
throw_if_error(err);
// Find a memory pool that supports kernel arguments.
hsa_amd_memory_pool_t kernarg_pool;
err = hsa_amd_agent_iterate_memory_pools(cpuAgent,
GetKernArgMemoryPool,
&kernarg_pool);
err = hsa_amd_agent_iterate_memory_pools(cpuAgent, GetKernArgMemoryPool, &kernarg_pool);
throw_if_error(err);
// Allocate the host side buffers
// (sys_data,dup_sys_data,cpuResult,kernArg) on system memory
int *sys_data = NULL;
int *dup_sys_data = NULL;
int *cpuResult = NULL;
int *gpuResult = NULL;
int* sys_data = NULL;
int* dup_sys_data = NULL;
int* cpuResult = NULL;
int* gpuResult = NULL;
err = hsa_amd_memory_pool_allocate(global_pool,
kMemoryAllocSize, 0,
reinterpret_cast<void **>(&cpuResult));
err = hsa_amd_memory_pool_allocate(global_pool, kMemoryAllocSize, 0,
reinterpret_cast<void**>(&cpuResult));
throw_if_error(err);
err = hsa_amd_memory_pool_allocate(global_pool,
kMemoryAllocSize, 0,
reinterpret_cast<void **>(&sys_data));
err = hsa_amd_memory_pool_allocate(global_pool, kMemoryAllocSize, 0,
reinterpret_cast<void**>(&sys_data));
throw_if_error(err);
err = hsa_amd_memory_pool_allocate(global_pool,
kMemoryAllocSize, 0,
reinterpret_cast<void **>(&dup_sys_data));
err = hsa_amd_memory_pool_allocate(global_pool, kMemoryAllocSize, 0,
reinterpret_cast<void**>(&dup_sys_data));
throw_if_error(err);
// Allocate the kernel argument buffer from the kernarg_pool.
err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0,
reinterpret_cast<void **>(&kernArgs));
reinterpret_cast<void**>(&kernArgs));
throw_if_error(err);
// initialize the host buffers
@@ -204,10 +187,9 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
// Get local memory of GPU to allocate device side buffers
err = hsa_amd_memory_pool_allocate(gpu_pool, kMemoryAllocSize, 0,
reinterpret_cast<void **>(&gpuResult));
reinterpret_cast<void**>(&gpuResult));
throw_if_error(err);
// Allow cpuAgent access to all allocated GPU memory.
err = hsa_amd_agents_allow_access(1, &cpuAgent, NULL, gpuResult);
throw_if_error(err);
@@ -227,7 +209,6 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
kernArgs->b = cpuResult; // system memory passed to gpu for write
kernArgs->c = gpuResult; // gpu memory to verify that gpu read system data
// Create the executable, get symbol by name and load the code object
set_kernel_file_name("gpuReadWrite_kernels.hsaco");
set_kernel_name("gpuReadWrite");
@@ -268,22 +249,22 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
WriteAQLToQueueLoc(queue, index, &aql);
hsa_kernel_dispatch_packet_t *q_base_addr =
reinterpret_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address);
hsa_kernel_dispatch_packet_t* q_base_addr =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue->base_address);
AtomicSetPacketHeader(
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE),
(1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
reinterpret_cast<hsa_kernel_dispatch_packet_t *>
(&q_base_addr[index & queue_mask]));
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE),
(1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(&q_base_addr[index & queue_mask]));
// ring the doorbell
hsa_signal_store_relaxed(queue->doorbell_signal, index);
// wait for the signal and reset it for future use
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
(uint64_t)-1, HSA_WAIT_STATE_ACTIVE)) { }
while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, (uint64_t)-1,
HSA_WAIT_STATE_ACTIVE)) {
}
hsa_signal_store_relaxed(signal, 1);
// compare device and host side results
@@ -292,8 +273,7 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
}
for (int i = 0; i < kMemoryAllocSize; ++i) {
if (gpuResult[i] != dup_sys_data[i]) {
throw_if_error(HSA_STATUS_ERROR,
"gpuResult does not match dup_sys_data.");
throw_if_error(HSA_STATUS_ERROR, "gpuResult does not match dup_sys_data.");
}
}
@@ -304,7 +284,7 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
for (int i = 0; i < kMemoryAllocSize; ++i) {
if (cpuResult[i] != i) {
throw_if_error(HSA_STATUS_ERROR,
"The CPU memory size does not match the system memory size.");
"The CPU memory size does not match the system memory size.");
}
}
@@ -312,27 +292,39 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
RDC_LOG(RDC_DEBUG, "gpu has written to system memory successfully");
}
if (sys_data) { hsa_memory_free(sys_data); }
if (dup_sys_data) { hsa_memory_free(dup_sys_data); }
if (cpuResult) {hsa_memory_free(cpuResult); }
if (gpuResult) {hsa_memory_free(gpuResult); }
if (kernArgs) { hsa_memory_free(kernArgs); }
if (signal.handle) { hsa_signal_destroy(signal); }
if (queue) { hsa_queue_destroy(queue); }
if (sys_data) {
hsa_memory_free(sys_data);
}
if (dup_sys_data) {
hsa_memory_free(dup_sys_data);
}
if (cpuResult) {
hsa_memory_free(cpuResult);
}
if (gpuResult) {
hsa_memory_free(gpuResult);
}
if (kernArgs) {
hsa_memory_free(kernArgs);
}
if (signal.handle) {
hsa_signal_destroy(signal);
}
if (queue) {
hsa_queue_destroy(queue);
}
} else {
if (verbosity() > 0) {
RDC_LOG(RDC_DEBUG,
"Test not applicable as system is not large bar, skipping");
RDC_LOG(RDC_DEBUG, "Test not applicable as system is not large bar, skipping");
}
return;
}
}
// Test to check cpu can read & write to GPU memory
void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
hsa_agent_t,
hsa_amd_memory_pool_t pool) {
void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t,
hsa_amd_memory_pool_t pool) {
hsa_status_t err;
pool_info_t pool_i;
@@ -340,14 +332,12 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
throw_if_error(err);
if (pool_i.segment == HSA_AMD_SEGMENT_GLOBAL &&
pool_i.global_flag == HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
pool_i.global_flag == HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
hsa_amd_memory_pool_access_t access;
hsa_amd_agent_memory_pool_get_info(cpuAgent, pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&access);
hsa_amd_agent_memory_pool_get_info(cpuAgent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&access);
if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 ||
pool_i.alloc_alignment == 0) {
if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || pool_i.alloc_alignment == 0) {
if (verbosity() > 0) {
RDC_LOG(RDC_DEBUG, "Test not applicable. Skipping.");
}
@@ -356,10 +346,10 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
auto gran_sz = pool_i.alloc_granule;
auto pool_sz = pool_i.size / gran_sz;
auto max_alloc_size = pool_sz/2;
unsigned int max_element = max_alloc_size/sizeof(unsigned int);
unsigned int *gpu_data;
unsigned int *sys_data;
auto max_alloc_size = pool_sz / 2;
unsigned int max_element = max_alloc_size / sizeof(unsigned int);
unsigned int* gpu_data;
unsigned int* sys_data;
sys_data = (unsigned int*)malloc(max_alloc_size);
memset(sys_data, 0, max_alloc_size);
for (unsigned int i = 1; i <= max_element; ++i) {
@@ -368,7 +358,7 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
// err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, sys_data);
// EXPECT_EQ(err, HSA_STATUS_SUCCESS);
err = hsa_amd_memory_pool_allocate(pool, max_alloc_size, 0,
reinterpret_cast<void**>(&gpu_data));
reinterpret_cast<void**>(&gpu_data));
throw_if_error(err);
/*
if (err == HSA_STATUS_ERROR) {
@@ -385,21 +375,22 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
gpu_data[i] = i; // Write to gpu memory directly
}
for (unsigned int i = 1; i <= max_element; ++i) {
if (sys_data[i] != gpu_data[i]) { // Reading GPU memory
fprintf(stdout, "Values not mathing !! sys_data[%d]:%d ,"
"gpu_data[%d]\n", sys_data[i], i, gpu_data[i]);
}
}
RDC_LOG(RDC_DEBUG, "CPU have read & write to GPU memory successfully");
err = hsa_amd_memory_pool_free(gpu_data);
free(sys_data);
} else {
if (verbosity() > 0) {
RDC_LOG(RDC_DEBUG,
"Test not applicable as system is not large bar, Skipping.");
for (unsigned int i = 1; i <= max_element; ++i) {
if (sys_data[i] != gpu_data[i]) { // Reading GPU memory
fprintf(stdout,
"Values not mathing !! sys_data[%d]:%d ,"
"gpu_data[%d]\n",
sys_data[i], i, gpu_data[i]);
}
return;
}
RDC_LOG(RDC_DEBUG, "CPU have read & write to GPU memory successfully");
err = hsa_amd_memory_pool_free(gpu_data);
free(sys_data);
} else {
if (verbosity() > 0) {
RDC_LOG(RDC_DEBUG, "Test not applicable as system is not large bar, Skipping.");
}
return;
}
}
}
@@ -416,12 +407,10 @@ void MemoryAccessTest::CPUAccessToGPUMemoryTest(void) {
std::vector<hsa_agent_t> gpus;
err = hsa_iterate_agents(IterateGPUAgents, &gpus);
throw_if_error(err);
for (unsigned int i = 0 ; i< gpus.size(); ++i) {
for (unsigned int i = 0; i < gpus.size(); ++i) {
hsa_amd_memory_pool_t gpu_pool;
memset(&gpu_pool, 0, sizeof(gpu_pool));
err = hsa_amd_agent_iterate_memory_pools(gpus[i],
GetGlobalMemoryPool,
&gpu_pool);
err = hsa_amd_agent_iterate_memory_pools(gpus[i], GetGlobalMemoryPool, &gpu_pool);
throw_if_error(err);
if (gpu_pool.handle == 0) {
RDC_LOG(RDC_DEBUG, "no global mempool in gpu agent");
@@ -466,4 +455,4 @@ void MemoryAccessTest::GPUAccessToCPUMemoryTest(void) {
}
} // namespace rdc
} // namespace amd
} // namespace amd
+31 -36
查看文件
@@ -20,32 +20,35 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocr/MemoryTest.h"
#include <algorithm>
#include <iostream>
#include <vector>
#include <memory>
#include "rdc_modules/rdc_rocr/common.h"
#include "rdc_modules/rdc_rocr/MemoryTest.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include <vector>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include "rdc_modules/rdc_rocr/common.h"
namespace amd {
namespace rdc {
static const uint32_t kNumBufferElements = 256;
// Builds the memory test for the GPU selected by `gpu_index` (passed straight
// to TestBase) and sets its default iteration count, title and description.
MemoryTest::MemoryTest(uint32_t gpu_index) : TestBase(gpu_index) {
  set_num_iteration(10);  // Number of iterations to execute of the main test;
                          // This is a default value which can be overridden
                          // on the command line.
  set_title("Max Single Allocation Memory Test");
  set_description(
      "This series of tests check memory allocation limits, extent"
      " of GPU access to system memory and other memory related "
      "functionality.");
}
// Destructor: intentionally empty, there is nothing to release here.
MemoryTest::~MemoryTest(void) {}
// Any 1-time setup involving member variables used in the rest of the test
// should be done here.
@@ -55,7 +58,7 @@ hsa_status_t MemoryTest::SetUp(void) {
TestBase::SetUp();
err = SetDefaultAgents(this);
if ( err != HSA_STATUS_SUCCESS) return err;
if (err != HSA_STATUS_SUCCESS) return err;
err = SetPoolsTypical(this);
return err;
@@ -71,9 +74,7 @@ void MemoryTest::Run(void) {
TestBase::Run();
}
// Forwards to TestBase::DisplayTestInfo(); this test adds nothing of its own.
void MemoryTest::DisplayTestInfo(void) { TestBase::DisplayTestInfo(); }
void MemoryTest::DisplayResults(void) const {
// Compare required profile for this test case with what we're actually
@@ -92,7 +93,7 @@ void MemoryTest::Close() {
}
hsa_status_t MemoryTest::TestAllocate(hsa_amd_memory_pool_t pool, size_t sz) {
void *ptr;
void* ptr;
hsa_status_t err;
err = hsa_amd_memory_pool_allocate(pool, sz, 0, &ptr);
@@ -106,13 +107,12 @@ hsa_status_t MemoryTest::TestAllocate(hsa_amd_memory_pool_t pool, size_t sz) {
static const char kSubTestSeparator[] = " **************************";
static void PrintMemorySubtestHeader(const char *header) {
static void PrintMemorySubtestHeader(const char* header) {
RDC_LOG(RDC_DEBUG, " *** Memory Subtest: " << header << " ***");
}
// Test Fixtures
hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag,
hsa_amd_memory_pool_t pool) {
hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag, hsa_amd_memory_pool_t pool) {
hsa_status_t err = HSA_STATUS_SUCCESS;
pool_info_t pool_i;
@@ -142,19 +142,17 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag,
device_type = "DSP";
break;
}
RDC_LOG(RDC_DEBUG, " Agent: " << ag_name << " Node " << node << " ("
<< device_type << ")");
RDC_LOG(RDC_DEBUG, " Agent: " << ag_name << " Node " << node << " (" << device_type << ")");
}
err = AcquirePoolInfo(pool, &pool_i);
if (err != HSA_STATUS_SUCCESS) return err;
if (verbosity() > 0) {
DumpMemoryPoolInfo(&pool_i, 2);
DumpMemoryPoolInfo(&pool_i, 2);
}
if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 ||
pool_i.alloc_alignment == 0) {
if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 || pool_i.alloc_alignment == 0) {
if (verbosity() > 0) {
RDC_LOG(RDC_DEBUG, " Test not applicable. Skipping.");
}
@@ -165,25 +163,24 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag,
auto pool_sz = pool_i.aggregate_alloc_max / gran_sz;
// Neg. test: Try to allocate more than the pool size
err = TestAllocate(pool, pool_sz*gran_sz + gran_sz);
err = TestAllocate(pool, pool_sz * gran_sz + gran_sz);
if (err != HSA_STATUS_ERROR_INVALID_ALLOCATION) return err;
auto max_alloc_size = pool_sz/2;
auto max_alloc_size = pool_sz / 2;
uint64_t upper_bound = pool_sz;
uint64_t lower_bound = 0;
while (true) {
err = TestAllocate(pool, max_alloc_size * gran_sz);
// NOTE(review): this condition is always true. `err` cannot be equal to both
// HSA_STATUS_SUCCESS and HSA_STATUS_ERROR_OUT_OF_RESOURCES at the same time, so
// at least one of the two `!=` comparisons holds and the loop returns right
// after the first allocation attempt; the bisection that follows never runs.
// `&&` was presumably intended -- confirm and fix.
// (The statement appears twice here: pre-format and post-format layout.)
if (err != HSA_STATUS_SUCCESS ||
err != HSA_STATUS_ERROR_OUT_OF_RESOURCES) return err;
if (err != HSA_STATUS_SUCCESS || err != HSA_STATUS_ERROR_OUT_OF_RESOURCES) return err;
if (err == HSA_STATUS_SUCCESS) {
lower_bound = max_alloc_size;
max_alloc_size += (upper_bound - lower_bound)/2;
max_alloc_size += (upper_bound - lower_bound) / 2;
} else if (err == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {
upper_bound = max_alloc_size;
max_alloc_size -= (upper_bound - lower_bound)/2;
max_alloc_size -= (upper_bound - lower_bound) / 2;
}
if ((upper_bound - lower_bound) < 2) {
@@ -197,15 +194,14 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(hsa_agent_t ag,
}
if (verbosity() > 0) {
RDC_LOG(RDC_DEBUG, " Biggest single allocation size for this pool is " <<
(max_alloc_size * gran_sz)/1024 << "KB.");
RDC_LOG(RDC_DEBUG, " This is " <<
static_cast<float>(max_alloc_size)/pool_sz*100 <<
"% of the total.");
RDC_LOG(RDC_DEBUG, " Biggest single allocation size for this pool is "
<< (max_alloc_size * gran_sz) / 1024 << "KB.");
RDC_LOG(RDC_DEBUG, " This is " << static_cast<float>(max_alloc_size) / pool_sz * 100
<< "% of the total.");
}
if (ag_type == HSA_DEVICE_TYPE_GPU) {
if ((float)max_alloc_size/pool_sz < (float)15/16) {
if ((float)max_alloc_size / pool_sz < (float)15 / 16) {
RDC_LOG(RDC_ERROR, "the allocate size is wrong");
throw_if_error(HSA_STATUS_ERROR, "The allocate size is wrong");
}
@@ -233,8 +229,7 @@ hsa_status_t MemoryTest::MaxSingleAllocationTest(void) {
auto pool_idx = 0;
for (auto a : agent_pools) {
if (a->agent.handle != current_gpu.handle)
continue;
if (a->agent.handle != current_gpu.handle) continue;
for (auto p : a->pools) {
pool_idx++;
RDC_LOG(RDC_DEBUG, " Pool " << pool_idx << ":");
+139 -151
查看文件
@@ -19,180 +19,168 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocr/RdcDiagnosticLib.h"
#include <string.h>
#include <memory>
#include <stdexcept>
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocr/common.h"
#include "rdc_modules/rdc_rocr/RdcDiagnosticLib.h"
#include "rdc_modules/rdc_rocr/MemoryTest.h"
#include "rdc_modules/rdc_rocr/MemoryAccess.h"
#include "rdc_modules/rdc_rocr/ComputeQueueTest.h"
#include "rdc_modules/rdc_rocr/MemoryAccess.h"
#include "rdc_modules/rdc_rocr/MemoryTest.h"
#include "rdc_modules/rdc_rocr/common.h"
// Module initialisation entry point. The 64-bit argument is accepted but not
// read, and the call always reports RDC_ST_OK.
rdc_status_t rdc_diag_init(uint64_t) {
  return RDC_ST_OK;
}
// Module teardown entry point. Performs no work and always reports RDC_ST_OK.
rdc_status_t rdc_diag_destroy() {
  return RDC_ST_OK;
}
// Reports which diagnostic test cases this module implements.
//
// test_cases      - caller-provided array; entries 0 and 1 are written with
//                   RDC_DIAG_COMPUTE_QUEUE and RDC_DIAG_SYS_MEM_CHECK.
// test_case_count - receives the number of entries written (2).
//
// Returns RDC_ST_BAD_PARAMETER if either pointer is null, RDC_ST_OK otherwise.
rdc_status_t rdc_diag_test_cases_query(rdc_diag_test_cases_t test_cases[MAX_TEST_CASES],
                                       uint32_t* test_case_count) {
  // An array parameter decays to a pointer, so it can be null just like the
  // count pointer; reject both before writing through either of them.
  if (test_cases == nullptr || test_case_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  test_cases[0] = RDC_DIAG_COMPUTE_QUEUE;
  test_cases[1] = RDC_DIAG_SYS_MEM_CHECK;
  *test_case_count = 2;

  return RDC_ST_OK;
}
// Helper function to run the memory tests on one GPU.
//
// Runs MaxSingleAllocationTest and then the CPU->GPU / GPU->CPU memory access
// tests for `gpu_index`. Text produced by the tests is appended to what is
// already stored in result->info and in
// result->gpu_results[gpu_index].gpu_result.msg, and both buffers are written
// back (bounded by MAX_DIAG_MSG_LENGTH) before returning.
//
// A SkipException sets result->status to RDC_DIAG_RESULT_SKIP; any other
// std::exception sets it to RDC_DIAG_RESULT_FAIL. The status is not touched
// when a test completes normally.
//
// Always returns RDC_ST_OK; the outcome is reported through `result`.
static rdc_status_t run_memory_test(uint32_t gpu_index, rdc_diag_test_result_t* result) {
  std::string info = result->info;
  std::string per_gpu_info = result->gpu_results[gpu_index].gpu_result.msg;

  // Records a test that did not complete: stores `status` and appends
  // "<label><what>" to the per-GPU message and "GPU <n> <label><what>." to the
  // summary.
  // NOTE(review): the status is overwritten unconditionally, so a later skip
  // replaces an earlier failure -- confirm that this precedence is intended.
  const auto record = [&](decltype(result->status) status, const std::string& label,
                          const std::string& what) {
    result->status = status;
    per_gpu_info += label;
    per_gpu_info += what;
    info += "GPU ";
    info += std::to_string(gpu_index);
    info += " ";
    info += label;
    info += what;
    info += ".";
  };

  try {
    amd::rdc::MemoryTest test(gpu_index);
    test.MaxSingleAllocationTest();

    info += test.get_gpu_info();
    per_gpu_info += test.get_per_gpu_info();
  } catch (const amd::rdc::SkipException& e) {
    record(RDC_DIAG_RESULT_SKIP, "MaxSingleAllocationTest is skipped: ", e.what());
  } catch (const std::exception& e) {
    record(RDC_DIAG_RESULT_FAIL, "MaxSingleAllocationTest returns with error ", e.what());
  }

  try {
    amd::rdc::MemoryAccessTest test(gpu_index);
    test.CPUAccessToGPUMemoryTest();
    test.GPUAccessToCPUMemoryTest();

    info += test.get_gpu_info();
    per_gpu_info += test.get_per_gpu_info();
  } catch (const amd::rdc::SkipException& e) {
    record(RDC_DIAG_RESULT_SKIP, "Memory Access is skipped: ", e.what());
  } catch (const std::exception& e) {
    record(RDC_DIAG_RESULT_FAIL, "Memory Access returns with error ", e.what());
  }

  strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH);
  strncpy_with_null(result->gpu_results[gpu_index].gpu_result.msg, per_gpu_info.c_str(),
                    MAX_DIAG_MSG_LENGTH);

  return RDC_ST_OK;
}
// Helper function to run the compute queue test on one GPU.
//
// Runs ComputeQueueTest::RunBinarySearchTest for `gpu_index`. Text produced by
// the test is appended to what is already stored in result->info and in
// result->gpu_results[gpu_index].gpu_result.msg, and both buffers are written
// back (bounded by MAX_DIAG_MSG_LENGTH) before returning.
//
// A SkipException sets result->status to RDC_DIAG_RESULT_SKIP; any other
// std::exception sets it to RDC_DIAG_RESULT_FAIL. The status is not touched
// when the test completes normally.
//
// Always returns RDC_ST_OK; the outcome is reported through `result`.
static rdc_status_t run_compute_queue_test(uint32_t gpu_index, rdc_diag_test_result_t* result) {
  std::string info = result->info;
  std::string per_gpu_info = result->gpu_results[gpu_index].gpu_result.msg;

  // Records a test that did not complete: stores `status` and appends
  // "<label><what>" to the per-GPU message and "GPU <n> <label><what>." to the
  // summary.
  const auto record = [&](decltype(result->status) status, const std::string& label,
                          const std::string& what) {
    result->status = status;
    per_gpu_info += label;
    per_gpu_info += what;
    info += "GPU ";
    info += std::to_string(gpu_index);
    info += " ";
    info += label;
    info += what;
    info += ".";
  };

  try {
    amd::rdc::ComputeQueueTest test(gpu_index);
    test.RunBinarySearchTest();

    info += test.get_gpu_info();
    per_gpu_info += test.get_per_gpu_info();
  } catch (const amd::rdc::SkipException& e) {
    record(RDC_DIAG_RESULT_SKIP, "Compute Queue test is skipped: ", e.what());
  } catch (const std::exception& e) {
    record(RDC_DIAG_RESULT_FAIL, "Compute Queue test returns with error ", e.what());
  }

  strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH);
  strncpy_with_null(result->gpu_results[gpu_index].gpu_result.msg, per_gpu_info.c_str(),
                    MAX_DIAG_MSG_LENGTH);

  return RDC_ST_OK;
}

// Runs one diagnostic test case on each of the listed GPUs.
//
// test_case - RDC_DIAG_COMPUTE_QUEUE or RDC_DIAG_SYS_MEM_CHECK. For any other
//             value the function returns RDC_ST_OK without touching `result`.
// gpu_index - indices of the GPUs to test; the first `gpu_count` entries are read.
// gpu_count - number of valid entries in `gpu_index`.
// result    - reset, then filled with the test case, an initial status of
//             RDC_DIAG_RESULT_PASS and whatever the per-GPU helpers record.
//
// Returns RDC_ST_BAD_PARAMETER when `result` or `gpu_index` is null, or when
// `gpu_count` is 0 or larger than RDC_MAX_NUM_DEVICES (the declared size of
// `gpu_index`); RDC_ST_OK otherwise. Individual test failures are reported
// through result->status, not through the return value.
rdc_status_t rdc_diag_test_case_run(rdc_diag_test_cases_t test_case,
                                    uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
                                    rdc_diag_test_result_t* result) {
  if (result == nullptr || gpu_index == nullptr || gpu_count == 0 ||
      gpu_count > RDC_MAX_NUM_DEVICES) {
    return RDC_ST_BAD_PARAMETER;
  }

  if (test_case != RDC_DIAG_COMPUTE_QUEUE && test_case != RDC_DIAG_SYS_MEM_CHECK) {
    return RDC_ST_OK;
  }

  // Initialise the returned data.
  *result = {};
  result->test_case = test_case;
  result->status = RDC_DIAG_RESULT_PASS;
  result->per_gpu_result_count = 0;

  // Run the test on every GPU; a failure on one GPU does not stop the others.
  for (uint32_t i = 0; i < gpu_count; i++) {
    switch (test_case) {
      case RDC_DIAG_SYS_MEM_CHECK:
        run_memory_test(gpu_index[i], result);
        break;
      case RDC_DIAG_COMPUTE_QUEUE:
        run_compute_queue_test(gpu_index[i], result);
        break;
      default:
        // Not reachable with the early return above; kept as a guard in case
        // that filter and this switch ever drift apart.
        result->status = RDC_DIAG_RESULT_SKIP;
        strncpy_with_null(result->info, "Not support yet", MAX_DIAG_MSG_LENGTH);
        break;
    }
  }

  return RDC_ST_OK;
}
+2 -2
查看文件
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocr/RdcRocrBase.h"
#include <string.h>
namespace amd {
@@ -45,8 +46,7 @@ RdcRocrBase::RdcRocrBase(void) {
orig_hsa_enable_interrupt_ = nullptr;
}
// The destructor has no work of its own to do: its body is empty.
RdcRocrBase::~RdcRocrBase() = default;
} // namespace rdc
} // namespace amd
+20 -28
查看文件
@@ -20,13 +20,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocr/TestBase.h"
#include <assert.h>
#include <unistd.h>
#include <algorithm>
#include "rdc_modules/rdc_rocr/TestBase.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
namespace amd {
namespace rdc {
@@ -40,16 +43,10 @@ static const char kRunLabel[] = "TEST EXECUTION";
static const char kCloseLabel[] = "TEST CLEAN UP";
static const char kResultsLabel[] = "TEST RESULTS";
// Remembers which GPU the test targets, starts with an empty description and
// then runs SetUp().
TestBase::TestBase(uint32_t gpu_index)
    : gpu_index_(gpu_index),
      description_("") {
  SetUp();
}

// Runs Close() when the test object is destroyed.
TestBase::~TestBase() {
  Close();
}
static void MakeHeaderStr(const char *inStr, std::string *outStr) {
static void MakeHeaderStr(const char* inStr, std::string* outStr) {
assert(outStr != nullptr);
assert(inStr != nullptr);
@@ -88,7 +85,6 @@ void TestBase::Close(void) {
throw_if_error(err);
}
void TestBase::DisplayResults(void) const {
std::string label;
MakeHeaderStr(kResultsLabel, &label);
@@ -96,8 +92,9 @@ void TestBase::DisplayResults(void) const {
}
void TestBase::DisplayTestInfo(void) {
printf("#########################################"
"######################################\n");
printf(
"#########################################"
"######################################\n");
std::string label;
MakeHeaderStr(kTitleLabel, &label);
@@ -122,8 +119,7 @@ void TestBase::set_description(std::string d) {
}
}
hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index,
hsa_agent_t* agent) {
hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index, hsa_agent_t* agent) {
hsa_status_t err = HSA_STATUS_SUCCESS;
std::vector<hsa_agent_t> gpus;
err = hsa_iterate_agents(IterateGPUAgents, &gpus);
@@ -135,19 +131,16 @@ hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index,
// sort based on bdf id
std::vector<std::pair<uint16_t, hsa_agent_t>> dv_to_id;
for (uint32_t dv_ind = 0; dv_ind < gpus.size(); ++dv_ind) {
auto dev = gpus[dv_ind];
uint16_t bdf_id = 0;
err = hsa_agent_get_info(dev,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
throw_if_error(err, "fail to get gpu bdfid");
dv_to_id.push_back({bdf_id, dev});
auto dev = gpus[dv_ind];
uint16_t bdf_id = 0;
err = hsa_agent_get_info(dev, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
throw_if_error(err, "fail to get gpu bdfid");
dv_to_id.push_back({bdf_id, dev});
}
// Stable sort to keep the order if bdf is equal.
std::stable_sort(dv_to_id.begin(), dv_to_id.end(), []
(const std::pair<uint16_t, hsa_agent_t>& p1,
const std::pair<uint16_t, hsa_agent_t>& p2) {
return p1.first < p2.first;
});
std::stable_sort(dv_to_id.begin(), dv_to_id.end(),
[](const std::pair<uint16_t, hsa_agent_t>& p1,
const std::pair<uint16_t, hsa_agent_t>& p2) { return p1.first < p2.first; });
*agent = dv_to_id[gpu_index].second;
@@ -156,4 +149,3 @@ hsa_status_t TestBase::get_agent_by_gpu_index(uint32_t gpu_index,
} // namespace rdc
} // namespace amd
+94 -111
查看文件
@@ -20,16 +20,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_modules/rdc_rocr/base_rocr_utils.h"
#include <assert.h>
#include <fcntl.h>
#include <libgen.h>
#include <stdio.h>
#include <stdlib.h>
#include <libgen.h>
#include <unistd.h>
#include <string>
#include <stdexcept>
#include <string>
#include "hsa/hsa.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
@@ -37,7 +39,6 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
// Clean up some of the common handles and memory used by RdcRocrBase code, then
// shut down hsa. Restore HSA_ENABLE_INTERRUPT to original value, if necessary
hsa_status_t CommonCleanUp(RdcRocrBase* test) {
@@ -78,7 +79,10 @@ hsa_status_t CommonCleanUp(RdcRocrBase* test) {
return err;
}
// Printable names for the HSA profiles, looked up by profile value when
// logging. Both the strings and the table entries are constant, so an entry
// cannot be reseated by accident.
static const char* const PROFILE_STR[] = {
    "HSA_PROFILE_BASE",
    "HSA_PROFILE_FULL",
};
/// Verify that the machine running the test has the required profile.
/// This function will verify that the execution machine meets any specific
@@ -89,18 +93,16 @@ static const char* PROFILE_STR[] = {"HSA_PROFILE_BASE", "HSA_PROFILE_FULL", };
/// - false Machine does not meet test requirements
bool CheckProfileAndInform(RdcRocrBase* test) {
if (test->verbosity() > 0) {
RDC_LOG(RDC_DEBUG, "Target HW Profile is "
<< PROFILE_STR[test->profile()]);
RDC_LOG(RDC_DEBUG, "Target HW Profile is " << PROFILE_STR[test->profile()]);
}
if (test->requires_profile() == -1) {
if (test->verbosity() > 0) {
RDC_LOG(RDC_DEBUG, "Test can run on any profile. OK.");
RDC_LOG(RDC_DEBUG, "Test can run on any profile. OK.");
}
return true;
} else {
RDC_LOG(RDC_DEBUG, "Test requires " << PROFILE_STR[test->requires_profile()]
<< ". ");
RDC_LOG(RDC_DEBUG, "Test requires " << PROFILE_STR[test->requires_profile()] << ". ");
if (test->requires_profile() != test->profile()) {
RDC_LOG(RDC_DEBUG, "Not Running.");
return false;
@@ -133,29 +135,29 @@ static hsa_status_t ProcessIterateError(hsa_status_t err) {
hsa_status_t SetPoolsTypical(RdcRocrBase* test) {
hsa_status_t err;
if (test->profile() == HSA_PROFILE_FULL) {
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(),
FindAPUStandardPool, &test->cpu_pool());
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindAPUStandardPool,
&test->cpu_pool());
throw_if_error(ProcessIterateError(err));
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(),
FindAPUStandardPool, &test->device_pool());
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindAPUStandardPool,
&test->device_pool());
throw_if_error(ProcessIterateError(err));
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(),
FindAPUStandardPool, &test->kern_arg_pool());
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindAPUStandardPool,
&test->kern_arg_pool());
throw_if_error(ProcessIterateError(err));
} else {
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(),
FindStandardPool, &test->cpu_pool());
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindStandardPool,
&test->cpu_pool());
throw_if_error(ProcessIterateError(err));
err = hsa_amd_agent_iterate_memory_pools(*test->gpu_device1(),
FindStandardPool, &test->device_pool());
err = hsa_amd_agent_iterate_memory_pools(*test->gpu_device1(), FindStandardPool,
&test->device_pool());
throw_if_error(ProcessIterateError(err));
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(),
FindKernArgPool, &test->kern_arg_pool());
err = hsa_amd_agent_iterate_memory_pools(*test->cpu_device(), FindKernArgPool,
&test->kern_arg_pool());
throw_if_error(ProcessIterateError(err));
}
@@ -261,11 +263,11 @@ hsa_status_t LoadKernelFromObjFile(RdcRocrBase* test, hsa_agent_t* agent) {
}
std::string kern_name = test->kernel_name();
std::string obj_file = search_hsaco_full_path(
test->kernel_file_name().c_str(), test->get_agent_name().c_str());
std::string obj_file =
search_hsaco_full_path(test->kernel_file_name().c_str(), test->get_agent_name().c_str());
if (obj_file == "") {
RDC_LOG(RDC_ERROR, "failed to find " << test->kernel_file_name() <<
" at line " << __LINE__ << ", errno: " << errno);
RDC_LOG(RDC_ERROR, "failed to find " << test->kernel_file_name() << " at line " << __LINE__
<< ", errno: " << errno);
std::string msg("fail to open ");
msg += test->kernel_file_name();
throw_if_skip(msg);
@@ -275,55 +277,53 @@ hsa_status_t LoadKernelFromObjFile(RdcRocrBase* test, hsa_agent_t* agent) {
hsa_file_t file_handle = open(obj_file.c_str(), O_RDONLY);
if (file_handle == -1) {
RDC_LOG(RDC_ERROR, "failed to open " << obj_file.c_str() << " at line "
<< __LINE__ << ", file: " << __FILE__);
return (hsa_status_t) errno;
RDC_LOG(RDC_ERROR, "failed to open " << obj_file.c_str() << " at line " << __LINE__
<< ", file: " << __FILE__);
return (hsa_status_t)errno;
}
// NOTE(review): throw_if_error() throws when `err` is not HSA_STATUS_SUCCESS,
// in which case the close() below is never reached and file_handle leaks.
// Consider closing the descriptor before checking `err`.
err = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
throw_if_error(err);
close(file_handle);
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
NULL, &executable);
err = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
throw_if_error(err);
err = hsa_executable_load_agent_code_object(executable, *agent, code_obj_rdr,
NULL, NULL);
err = hsa_executable_load_agent_code_object(executable, *agent, code_obj_rdr, NULL, NULL);
throw_if_error(err);
err = hsa_executable_freeze(executable, NULL);
throw_if_error(err);
hsa_executable_symbol_t kern_sym;
err = hsa_executable_get_symbol(executable, NULL, (kern_name + ".kd").c_str(), *agent,
0, &kern_sym);
err = hsa_executable_get_symbol(executable, NULL, (kern_name + ".kd").c_str(), *agent, 0,
&kern_sym);
throw_if_error(err);
uint64_t codeHandle;
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &codeHandle);
err = hsa_executable_symbol_get_info(kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&codeHandle);
throw_if_error(err);
test->set_kernel_object(codeHandle);
uint32_t val;
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &val);
err = hsa_executable_symbol_get_info(
kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &val);
throw_if_error(err);
test->set_private_segment_size(val);
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &val);
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &val);
throw_if_error(err);
test->set_group_segment_size(val);
// Remaining queries only supported on code object v3.
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &val);
err = hsa_executable_symbol_get_info(
kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &val);
throw_if_error(err);
test->set_kernarg_size(val);
err = hsa_executable_symbol_get_info(kern_sym,
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, &val);
err = hsa_executable_symbol_get_info(
kern_sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, &val);
throw_if_error(err);
// NOTE(review): `val` holds the kernarg segment *alignment* queried just above,
// although the assert message talks about size. The assert also means the
// zero fallback on the next line can only take effect when asserts are
// compiled out (NDEBUG) -- confirm which of the two behaviours is intended.
assert(val >= 16 && "Reported kernarg size is too small.");
val = (val == 0) ? 16 : val;
@@ -332,26 +332,23 @@ hsa_status_t LoadKernelFromObjFile(RdcRocrBase* test, hsa_agent_t* agent) {
return HSA_STATUS_SUCCESS;
}
// Creates an HSA_QUEUE_TYPE_MULTI queue on `device` and stores it in `*queue`.
//
// device   - agent that will own the queue.
// queue    - receives the handle of the created queue.
// num_pkts - requested queue size in packets; 0 means "use the value the agent
//            reports for HSA_AGENT_INFO_QUEUE_MAX_SIZE".
//
// Any HSA failure is raised through throw_if_error(), so a normal return is
// always HSA_STATUS_SUCCESS.
hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue, uint32_t num_pkts) {
  if (num_pkts == 0) {
    const hsa_status_t query_status =
        hsa_agent_get_info(device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &num_pkts);
    throw_if_error(query_status);
  }

  const hsa_status_t create_status = hsa_queue_create(
      device, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, queue);
  throw_if_error(create_status);

  return HSA_STATUS_SUCCESS;
}
// Initialize the provided aql packet with standard default values, and
// values from provided RdcRocrBase object.
hsa_status_t InitializeAQLPacket(const RdcRocrBase* test,
hsa_kernel_dispatch_packet_t* aql) {
hsa_status_t InitializeAQLPacket(const RdcRocrBase* test, hsa_kernel_dispatch_packet_t* aql) {
hsa_status_t err;
assert(aql != nullptr);
@@ -359,7 +356,7 @@ hsa_status_t InitializeAQLPacket(const RdcRocrBase* test,
if (aql == nullptr) {
return HSA_STATUS_ERROR;
}
// Initialize Packet type as Invalid
// Update packet type to Kernel Dispatch
// right before ringing doorbell
@@ -370,7 +367,7 @@ hsa_status_t InitializeAQLPacket(const RdcRocrBase* test,
aql->workgroup_size_y = 1;
aql->workgroup_size_z = 1;
aql->grid_size_x = (uint64_t) 256; // manual_input*group_input; workg max sz
aql->grid_size_x = (uint64_t)256; // manual_input*group_input; workg max sz
aql->grid_size_y = 1;
aql->grid_size_z = 1;
@@ -392,11 +389,11 @@ hsa_status_t InitializeAQLPacket(const RdcRocrBase* test,
// Copy RdcRocrBase aql object values to the RdcRocrBase object queue in the
// specified queue position (ind)
hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind) {
hsa_kernel_dispatch_packet_t* WriteAQLToQueue(RdcRocrBase* test, uint64_t* ind) {
assert(test);
assert(test->main_queue());
void *queue_base = test->main_queue()->base_address;
void* queue_base = test->main_queue()->base_address;
const uint32_t queue_mask = test->main_queue()->size - 1;
uint64_t que_idx = hsa_queue_add_write_index_relaxed(test->main_queue(), 1);
*ind = que_idx;
@@ -405,8 +402,7 @@ hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind)
hsa_kernel_dispatch_packet_t* queue_aql_packet;
queue_aql_packet =
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))
[que_idx & queue_mask];
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))[que_idx & queue_mask];
queue_aql_packet->workgroup_size_x = staging_aql_packet->workgroup_size_x;
queue_aql_packet->workgroup_size_y = staging_aql_packet->workgroup_size_y;
@@ -414,10 +410,8 @@ hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind)
queue_aql_packet->grid_size_x = staging_aql_packet->grid_size_x;
queue_aql_packet->grid_size_y = staging_aql_packet->grid_size_y;
queue_aql_packet->grid_size_z = staging_aql_packet->grid_size_z;
queue_aql_packet->private_segment_size =
staging_aql_packet->private_segment_size;
queue_aql_packet->group_segment_size =
staging_aql_packet->group_segment_size;
queue_aql_packet->private_segment_size = staging_aql_packet->private_segment_size;
queue_aql_packet->group_segment_size = staging_aql_packet->group_segment_size;
queue_aql_packet->kernel_object = staging_aql_packet->kernel_object;
queue_aql_packet->kernarg_address = staging_aql_packet->kernarg_address;
queue_aql_packet->completion_signal = staging_aql_packet->completion_signal;
@@ -425,19 +419,16 @@ hsa_kernel_dispatch_packet_t * WriteAQLToQueue(RdcRocrBase* test, uint64_t *ind)
return queue_aql_packet;
}
void
WriteAQLToQueueLoc(hsa_queue_t *queue, uint64_t indx,
hsa_kernel_dispatch_packet_t *aql_pkt) {
void WriteAQLToQueueLoc(hsa_queue_t* queue, uint64_t indx, hsa_kernel_dispatch_packet_t* aql_pkt) {
assert(queue);
assert(aql_pkt);
void *queue_base = queue->base_address;
void* queue_base = queue->base_address;
const uint32_t queue_mask = queue->size - 1;
hsa_kernel_dispatch_packet_t* queue_aql_packet;
queue_aql_packet =
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))
[indx & queue_mask];
&(reinterpret_cast<hsa_kernel_dispatch_packet_t*>(queue_base))[indx & queue_mask];
queue_aql_packet->workgroup_size_x = aql_pkt->workgroup_size_x;
queue_aql_packet->workgroup_size_y = aql_pkt->workgroup_size_y;
@@ -445,10 +436,8 @@ WriteAQLToQueueLoc(hsa_queue_t *queue, uint64_t indx,
queue_aql_packet->grid_size_x = aql_pkt->grid_size_x;
queue_aql_packet->grid_size_y = aql_pkt->grid_size_y;
queue_aql_packet->grid_size_z = aql_pkt->grid_size_z;
queue_aql_packet->private_segment_size =
aql_pkt->private_segment_size;
queue_aql_packet->group_segment_size =
aql_pkt->group_segment_size;
queue_aql_packet->private_segment_size = aql_pkt->private_segment_size;
queue_aql_packet->group_segment_size = aql_pkt->group_segment_size;
queue_aql_packet->kernel_object = aql_pkt->kernel_object;
queue_aql_packet->kernarg_address = aql_pkt->kernarg_address;
queue_aql_packet->completion_signal = aql_pkt->completion_signal;
@@ -474,11 +463,10 @@ hsa_status_t AllocAndSetKernArgs(RdcRocrBase* test, void* args, size_t arg_size)
test->set_kernarg_buffer(kern_arg_buf);
void *adj_kern_arg_buf = AlignUp(kern_arg_buf, req_align);
void* adj_kern_arg_buf = AlignUp(kern_arg_buf, req_align);
assert(arg_size >= test->kernarg_size());
assert(((uintptr_t)adj_kern_arg_buf + arg_size) <
((uintptr_t)kern_arg_buf + buf_size));
assert(((uintptr_t)adj_kern_arg_buf + arg_size) < ((uintptr_t)kern_arg_buf + buf_size));
hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
err = hsa_amd_agents_allow_access(2, ag_list, NULL, kern_arg_buf);
@@ -494,28 +482,27 @@ hsa_status_t AllocAndSetKernArgs(RdcRocrBase* test, void* args, size_t arg_size)
std::string get_lib_dir(const char* lib_name) {
std::string result;
char line[1024*8];
char line[1024 * 8];
FILE* file = fopen("/proc/self/maps", "r");
if (file == NULL)
return result;
if (file == NULL) return result;
std::string lib_path = "/";
lib_path += lib_name;
// 7f4eacb46000 r-xp 00000 08:01 17183106 /lib/x86_64-linux-gnu/libc-2.27.so
while (fgets(line, sizeof(line), file)) {
char* end = strstr(line, lib_path.c_str());
if (end != NULL) {
char* start = end;
while (start > line) {
if (isspace(*start)) {
start++;
break;
}
start--;
}
result = std::string(start, end-start);
break;
char* end = strstr(line, lib_path.c_str());
if (end != NULL) {
char* start = end;
while (start > line) {
if (isspace(*start)) {
start++;
break;
}
start--;
}
result = std::string(start, end - start);
break;
}
}
fclose(file);
@@ -523,41 +510,37 @@ std::string get_lib_dir(const char* lib_name) {
}
// Returns the directory part of the path that /proc/self/exe links to, or an
// empty string if the link cannot be read or its target does not comfortably
// fit in the local buffer.
std::string get_app_dir() {
  char buf[1024 * 8];

  // readlink() does not NUL-terminate its output and signals truncation only
  // by filling the buffer. A length that reaches the last usable slot is
  // therefore rejected rather than trusted, which also keeps room for the
  // terminator written below.
  const ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf));
  if (len < 0 || static_cast<size_t>(len) >= sizeof(buf) - 1) {
    return "";
  }

  buf[len] = '\0';
  // dirname() may modify its argument; `buf` is a private copy, so that is fine.
  return dirname(buf);
}
std::string search_hsaco_full_path(const char* hsaco_file_name,
const char* agent_name) {
std::string search_hsaco_full_path(const char* hsaco_file_name, const char* agent_name) {
const std::string lib_dir = get_lib_dir("librdc_rocr.so");
const std::string app_dir = get_app_dir();
std::vector<std::string> path_to_search;
path_to_search.push_back(std::string("./")+hsaco_file_name);
path_to_search.push_back(app_dir+"/"+hsaco_file_name);
path_to_search.push_back(lib_dir+"/"+hsaco_file_name);
path_to_search.push_back(lib_dir+"/rdc/hsaco/"+ agent_name
+ "/" + hsaco_file_name);
path_to_search.push_back(lib_dir+"/hsaco/"+ agent_name
+ "/" + hsaco_file_name);
path_to_search.push_back(std::string("./") + hsaco_file_name);
path_to_search.push_back(app_dir + "/" + hsaco_file_name);
path_to_search.push_back(lib_dir + "/" + hsaco_file_name);
path_to_search.push_back(lib_dir + "/rdc/hsaco/" + agent_name + "/" + hsaco_file_name);
path_to_search.push_back(lib_dir + "/hsaco/" + agent_name + "/" + hsaco_file_name);
// for dev structure
path_to_search.push_back(lib_dir+"/../../rdc_libs/rdc_modules/kernels/hsaco/"
+ agent_name + "/" + hsaco_file_name);
path_to_search.push_back(lib_dir + "/../../rdc_libs/rdc_modules/kernels/hsaco/" + agent_name +
"/" + hsaco_file_name);
for (std::size_t i = 0; i < path_to_search.size(); i++) {
if ( ::access(path_to_search[i].c_str(), F_OK) == 0 ) {
RDC_LOG(RDC_DEBUG, "Use the file " << path_to_search[i]);
return path_to_search[i];
}
RDC_LOG(RDC_DEBUG, "Skip not exists file " << path_to_search[i]);
if (::access(path_to_search[i].c_str(), F_OK) == 0) {
RDC_LOG(RDC_DEBUG, "Use the file " << path_to_search[i]);
return path_to_search[i];
}
RDC_LOG(RDC_DEBUG, "Skip not exists file " << path_to_search[i]);
}
return "";
}
} // namespace rdc
} // namespace amd
+81 -132
查看文件
@@ -23,11 +23,14 @@ THE SOFTWARE.
/// \file
/// Implementation of utility functions used by RocR applications
#include "rdc_modules/rdc_rocr/common.h"
#include <assert.h>
#include <stdlib.h>
#include <memory>
#include <sstream>
#include <string>
#include <memory>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
@@ -38,14 +41,11 @@ void throw_if_error(hsa_status_t err, const std::string& msg) {
if (err != HSA_STATUS_SUCCESS) {
const char* errstr = 0;
hsa_status_string(err, &errstr);
throw std::runtime_error(msg + " hsa error code: "
+ std::to_string(err) + " " + errstr);
throw std::runtime_error(msg + " hsa error code: " + std::to_string(err) + " " + errstr);
}
}
// Raises a SkipException carrying `msg`. Despite the name there is no
// condition: every call throws.
void throw_if_skip(const std::string& msg) {
  throw SkipException(msg.c_str());
}
void SetEnv(const char* env_var_name, const char* env_var_value) {
int err = setenv(env_var_name, env_var_value, 1);
@@ -56,28 +56,21 @@ void SetEnv(const char* env_var_name, const char* env_var_value) {
}
}
intptr_t
AlignDown(intptr_t value, size_t alignment) {
assert(alignment != 0 && "Zero alignment");
return (intptr_t) (value & ~(alignment - 1));
/// Rounds \p value down by clearing the bits selected by (alignment - 1).
///
/// Because the rounding is done with the bit mask ~(alignment - 1), the result
/// is a multiple of \p alignment only when \p alignment is a power of two.
/// Only the non-zero precondition is asserted.
///
/// \param value      Integer value (the pointer overload passes an address).
/// \param alignment  Non-zero alignment; expected to be a power of two.
/// \return \p value with the low (alignment - 1) mask bits cleared.
intptr_t AlignDown(intptr_t value, size_t alignment) {
  assert(alignment != 0 && "Zero alignment");
  // Perform the masking explicitly in the unsigned domain instead of relying
  // on the implicit signed -> unsigned conversion of the mixed expression.
  const size_t mask = ~(alignment - 1);
  return static_cast<intptr_t>(static_cast<size_t>(value) & mask);
}
void *
AlignDown(void* value, size_t alignment) {
return reinterpret_cast<void*>(AlignDown(
reinterpret_cast<uintptr_t>(value), alignment));
/// Pointer overload of AlignDown: converts \p value to an integer address,
/// rounds it down via the integer overload, and converts the result back.
///
/// \param value      Pointer whose address is rounded down.
/// \param alignment  Alignment forwarded unchanged to the integer overload.
/// \return The rounded-down address as a void pointer.
void* AlignDown(void* value, size_t alignment) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(value);
  return reinterpret_cast<void*>(AlignDown(address, alignment));
}
void *
AlignUp(void* value, size_t alignment) {
return reinterpret_cast<void*>(
AlignDown((uintptr_t)(reinterpret_cast<uintptr_t>(value) + alignment - 1),
alignment));
/// Rounds the address held in \p value up by adding (alignment - 1) and then
/// rounding the sum down with AlignDown.
///
/// \param value      Pointer whose address is rounded up.
/// \param alignment  Alignment forwarded to AlignDown after the bump.
/// \return The rounded-up address as a void pointer.
void* AlignUp(void* value, size_t alignment) {
  // The sum is already an unsigned integer of pointer width, so no extra cast
  // is needed before handing it to AlignDown.
  const uintptr_t bumped = reinterpret_cast<uintptr_t>(value) + alignment - 1;
  return reinterpret_cast<void*>(AlignDown(bumped, alignment));
}
static hsa_status_t FindAgent(hsa_agent_t agent, void* data,
hsa_device_type_t dev_type) {
static hsa_status_t FindAgent(hsa_agent_t agent, void* data, hsa_device_type_t dev_type) {
assert(data != nullptr);
if (data == nullptr) {
@@ -85,8 +78,7 @@ static hsa_status_t FindAgent(hsa_agent_t agent, void* data,
}
hsa_device_type_t hsa_device_type;
hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE,
&hsa_device_type);
hsa_status_t hsa_error_code = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type);
throw_if_error(hsa_error_code);
if (hsa_device_type == dev_type) {
@@ -98,7 +90,7 @@ static hsa_status_t FindAgent(hsa_agent_t agent, void* data,
}
// Find CPU Agents
hsa_status_t IterateCPUAgents(hsa_agent_t agent, void *data) {
hsa_status_t IterateCPUAgents(hsa_agent_t agent, void* data) {
hsa_status_t status;
assert(data != nullptr);
if (data == nullptr) {
@@ -115,10 +107,8 @@ hsa_status_t IterateCPUAgents(hsa_agent_t agent, void *data) {
return status;
}
// Find GPU Agents
hsa_status_t IterateGPUAgents(hsa_agent_t agent, void *data) {
hsa_status_t IterateGPUAgents(hsa_agent_t agent, void* data) {
hsa_status_t status;
assert(data != nullptr);
if (data == nullptr) {
@@ -138,27 +128,20 @@ hsa_status_t IterateGPUAgents(hsa_agent_t agent, void *data) {
hsa_status_t GetGlobalMemoryPool(hsa_amd_memory_pool_t pool, void* data) {
hsa_amd_segment_t segment;
hsa_status_t err;
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (HSA_AMD_SEGMENT_GLOBAL != segment)
return err;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
if (HSA_AMD_SEGMENT_GLOBAL != segment) return err;
hsa_amd_memory_pool_global_flag_t flags;
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
&flags);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
throw_if_error(err);
// this is valid for dGPUs. But on APUs, it has to be FINE_GRAINED
if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
hsa_amd_memory_pool_t* ret =
reinterpret_cast<hsa_amd_memory_pool_t*>(data);
hsa_amd_memory_pool_t* ret = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
*ret = pool;
} else { // this is for APUs
if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
hsa_amd_memory_pool_t* ret =
reinterpret_cast<hsa_amd_memory_pool_t*>(data);
hsa_amd_memory_pool_t* ret = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
*ret = pool;
}
}
@@ -172,23 +155,18 @@ hsa_status_t GetKernArgMemoryPool(hsa_amd_memory_pool_t pool, void* data) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
hsa_amd_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
throw_if_error(err);
if (HSA_AMD_SEGMENT_GLOBAL != segment) {
return HSA_STATUS_SUCCESS;
}
hsa_amd_memory_pool_global_flag_t flags;
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
&flags);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
throw_if_error(err);
if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
hsa_amd_memory_pool_t* ret =
reinterpret_cast<hsa_amd_memory_pool_t*>(data);
hsa_amd_memory_pool_t* ret = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
*ret = pool;
}
@@ -211,10 +189,9 @@ typedef enum {
POOL_PROP_DONT_CARE ///< We don't care if the property is present or not.
} pool_prop_t;
static hsa_status_t
FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment,
pool_prop_t accessible_by_all, pool_prop_t kern_arg,
pool_prop_t fine_grain) {
static hsa_status_t FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment,
pool_prop_t accessible_by_all, pool_prop_t kern_arg,
pool_prop_t fine_grain) {
if (nullptr == data) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
@@ -223,8 +200,7 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment,
hsa_amd_segment_t segment;
uint32_t flag;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
throw_if_error(err);
if (in_segment != segment) {
@@ -232,8 +208,7 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment,
}
if (HSA_AMD_SEGMENT_GLOBAL == in_segment) {
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
throw_if_error(err);
if (kern_arg != POOL_PROP_DONT_CARE) {
@@ -254,13 +229,12 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment,
if (accessible_by_all != POOL_PROP_DONT_CARE) {
bool access_read;
err = hsa_amd_memory_pool_get_info(pool,
(hsa_amd_memory_pool_info_t)
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access_read);
err = hsa_amd_memory_pool_get_info(
pool, (hsa_amd_memory_pool_info_t)HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access_read);
throw_if_error(err);
if (((!access_read) && accessible_by_all == POOL_PROP_ON) ||
(access_read && (accessible_by_all == POOL_PROP_OFF))) {
(access_read && (accessible_by_all == POOL_PROP_OFF))) {
return HSA_STATUS_SUCCESS;
}
}
@@ -270,69 +244,64 @@ FindPool(hsa_amd_memory_pool_t pool, void* data, hsa_amd_segment_t in_segment,
}
/// Forwards \p pool and \p data to FindPool() with a fixed filter:
/// segment = HSA_AMD_SEGMENT_GLOBAL, accessible_by_all = POOL_PROP_DONT_CARE,
/// kern_arg = POOL_PROP_OFF, fine_grain = POOL_PROP_DONT_CARE.
///
/// \param pool Memory pool to examine.
/// \param data Opaque pointer handed through to FindPool().
/// \return Whatever FindPool() returns.
hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE,
POOL_PROP_OFF, POOL_PROP_DONT_CARE);
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, POOL_PROP_OFF,
POOL_PROP_DONT_CARE);
}
/// Forwards \p pool and \p data to FindPool() with a fixed filter:
/// segment = HSA_AMD_SEGMENT_GLOBAL, accessible_by_all = POOL_PROP_DONT_CARE,
/// kern_arg = POOL_PROP_ON, fine_grain = POOL_PROP_DONT_CARE.
///
/// \param pool Memory pool to examine.
/// \param data Opaque pointer handed through to FindPool().
/// \return Whatever FindPool() returns.
hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE,
POOL_PROP_ON, POOL_PROP_DONT_CARE);
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, POOL_PROP_ON,
POOL_PROP_DONT_CARE);
}
/// Forwards \p pool and \p data to FindPool() with a fixed filter:
/// segment = HSA_AMD_SEGMENT_GLOBAL, accessible_by_all = POOL_PROP_ON,
/// kern_arg = POOL_PROP_OFF, fine_grain = POOL_PROP_DONT_CARE.
///
/// \param pool Memory pool to examine.
/// \param data Opaque pointer handed through to FindPool().
/// \return Whatever FindPool() returns.
hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data) {
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_ON,
POOL_PROP_OFF, POOL_PROP_DONT_CARE);
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_ON, POOL_PROP_OFF,
POOL_PROP_DONT_CARE);
}
/// Forwards \p pool and \p data to FindPool() constraining only the segment
/// (HSA_AMD_SEGMENT_GLOBAL); accessible_by_all, kern_arg and fine_grain are
/// all passed as POOL_PROP_DONT_CARE.
///
/// \param pool Memory pool to examine.
/// \param data Opaque pointer handed through to FindPool().
/// \return Whatever FindPool() returns.
hsa_status_t FindAPUStandardPool(hsa_amd_memory_pool_t pool, void* data) {
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE,
POOL_PROP_DONT_CARE, POOL_PROP_DONT_CARE);
return FindPool(pool, data, HSA_AMD_SEGMENT_GLOBAL, POOL_PROP_DONT_CARE, POOL_PROP_DONT_CARE,
POOL_PROP_DONT_CARE);
}
// Populate the vector with handles to all agents and pools.
//
// hsa_iterate_agents() is run with the save_agent callback below; for each
// agent a new agent_pools_t is allocated, its `agent` field is set, and
// hsa_amd_agent_iterate_memory_pools() is run with save_pool to append every
// reported pool to its `pools` vector. The entry is appended to *agent_pools
// regardless of the status returned by the pool iteration.
//
// \param agent_pools Output vector; must not be null (asserted).
// \return The status returned by hsa_iterate_agents().
hsa_status_t
GetAgentPools(std::vector<std::shared_ptr<agent_pools_t>> *agent_pools) {
hsa_status_t GetAgentPools(std::vector<std::shared_ptr<agent_pools_t>>* agent_pools) {
hsa_status_t err;
assert(agent_pools != nullptr);
// Per-agent callback; `data` carries the caller's output vector.
auto save_agent = [](hsa_agent_t a, void *data)->hsa_status_t {
std::vector<std::shared_ptr<agent_pools_t>> *ag_vec;
auto save_agent = [](hsa_agent_t a, void* data) -> hsa_status_t {
std::vector<std::shared_ptr<agent_pools_t>>* ag_vec;
hsa_status_t err;
assert(data != nullptr);
ag_vec =
reinterpret_cast<std::vector<std::shared_ptr<agent_pools_t>> *>(data);
ag_vec = reinterpret_cast<std::vector<std::shared_ptr<agent_pools_t>>*>(data);
std::shared_ptr<agent_pools_t> ag(new agent_pools_t);
ag->agent = a;
// Per-pool callback; `data` carries the current agent's pool list.
auto save_pool = [](hsa_amd_memory_pool_t p, void *data)->hsa_status_t {
auto save_pool = [](hsa_amd_memory_pool_t p, void* data) -> hsa_status_t {
assert(data != nullptr);
std::vector<hsa_amd_memory_pool_t> *p_list =
reinterpret_cast<std::vector<hsa_amd_memory_pool_t> *>(data);
std::vector<hsa_amd_memory_pool_t>* p_list =
reinterpret_cast<std::vector<hsa_amd_memory_pool_t>*>(data);
p_list->push_back(p);
return HSA_STATUS_SUCCESS;
};
err = hsa_amd_agent_iterate_memory_pools(a, save_pool,
reinterpret_cast<void *>(&ag->pools));
err = hsa_amd_agent_iterate_memory_pools(a, save_pool, reinterpret_cast<void*>(&ag->pools));
// The agent entry is recorded before the pool-iteration status is returned.
ag_vec->push_back(ag);
return err;
};
err = hsa_iterate_agents(save_agent, reinterpret_cast<void *>(agent_pools));
err = hsa_iterate_agents(save_agent, reinterpret_cast<void*>(agent_pools));
return err;
}
static hsa_status_t MakeGlobalFlagsString(const pool_info_t *pool_i,
std::string* out_str) {
static hsa_status_t MakeGlobalFlagsString(const pool_info_t* pool_i, std::string* out_str) {
uint32_t global_flag = pool_i->global_flag;
assert(out_str != nullptr);
*out_str = "";
std::vector < std::string > flags;
std::vector<std::string> flags;
if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
flags.push_back("KERNARG");
@@ -356,8 +325,7 @@ static hsa_status_t MakeGlobalFlagsString(const pool_info_t *pool_i,
return HSA_STATUS_SUCCESS;
}
static hsa_status_t DumpSegment(const pool_info_t *pool_i,
std::string const *ind_lvl) {
static hsa_status_t DumpSegment(const pool_info_t* pool_i, std::string const* ind_lvl) {
hsa_status_t err;
RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Segment:");
@@ -394,53 +362,44 @@ static hsa_status_t DumpSegment(const pool_info_t *pool_i,
return HSA_STATUS_SUCCESS;
}
/// Fills \p pool_i with attributes of \p pool obtained through
/// hsa_amd_memory_pool_get_info(): global_flag, segment, size, alloc_allowed,
/// alloc_granule, alloc_alignment, accessible_by_all and aggregate_alloc_max.
///
/// Every query status is passed to throw_if_error(), so a failing query is
/// reported through that helper rather than through the return value.
///
/// \param pool   Memory pool to query.
/// \param pool_i Output structure; dereferenced without a null check.
/// \return HSA_STATUS_SUCCESS when the end of the function is reached.
hsa_status_t AcquirePoolInfo(hsa_amd_memory_pool_t pool,
pool_info_t *pool_i) {
hsa_status_t AcquirePoolInfo(hsa_amd_memory_pool_t pool, pool_info_t* pool_i) {
hsa_status_t err;
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &pool_i->global_flag);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
&pool_i->global_flag);
throw_if_error(err);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&pool_i->segment);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &pool_i->segment);
throw_if_error(err);
// Get the size of the POOL
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
&pool_i->size);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &pool_i->size);
throw_if_error(err);
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
&pool_i->alloc_allowed);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
&pool_i->alloc_allowed);
throw_if_error(err);
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
&pool_i->alloc_granule);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
&pool_i->alloc_granule);
throw_if_error(err);
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT,
&pool_i->alloc_alignment);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT,
&pool_i->alloc_alignment);
throw_if_error(err);
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL,
&pool_i->accessible_by_all);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL,
&pool_i->accessible_by_all);
throw_if_error(err);
err = hsa_amd_memory_pool_get_info(pool,
HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE,
&pool_i->aggregate_alloc_max);
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE,
&pool_i->aggregate_alloc_max);
throw_if_error(err);
return HSA_STATUS_SUCCESS;
}
hsa_status_t DumpMemoryPoolInfo(const pool_info_t *pool_i,
uint32_t indent) {
hsa_status_t DumpMemoryPoolInfo(const pool_info_t* pool_i, uint32_t indent) {
std::string ind_lvl(indent, ' ');
DumpSegment(pool_i, &ind_lvl);
@@ -448,31 +407,25 @@ hsa_status_t DumpMemoryPoolInfo(const pool_info_t *pool_i,
std::string sz_str = std::to_string(pool_i->size / 1024) + "KB";
RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Size:" << sz_str);
RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Allocatable:"
<< (pool_i->alloc_allowed ? "TRUE" : "FALSE"));
RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Allocatable:" << (pool_i->alloc_allowed ? "TRUE" : "FALSE"));
std::string gr_str = std::to_string(pool_i->alloc_granule / 1024) + "KB";
RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Alloc Granule:" << gr_str);
std::string al_str =
std::to_string(pool_i->alloc_alignment / 1024) + "KB";
std::string al_str = std::to_string(pool_i->alloc_alignment / 1024) + "KB";
RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Alloc Alignment:" << al_str);
RDC_LOG(RDC_DEBUG, ind_lvl << " Pool Acessible by all:" <<
(pool_i->accessible_by_all ? "TRUE" : "FALSE"));
RDC_LOG(RDC_DEBUG,
ind_lvl << " Pool Acessible by all:" << (pool_i->accessible_by_all ? "TRUE" : "FALSE"));
std::string agg_str =
std::to_string(pool_i->aggregate_alloc_max / 1024) + "KB";
std::string agg_str = std::to_string(pool_i->aggregate_alloc_max / 1024) + "KB";
RDC_LOG(RDC_DEBUG, ind_lvl << "Pool Aggregate Alloc Size:" << agg_str);
return HSA_STATUS_SUCCESS;
}
static const char* Types[] = {"HSA_EXT_POINTER_TYPE_UNKNOWN",
"HSA_EXT_POINTER_TYPE_HSA",
"HSA_EXT_POINTER_TYPE_LOCKED",
"HSA_EXT_POINTER_TYPE_GRAPHICS",
"HSA_EXT_POINTER_TYPE_IPC"
};
/// Printable names for pointer types; DumpPointerInfo() indexes this table
/// with the `type` field of hsa_amd_pointer_info_t.
static const char* Types[] = {
    "HSA_EXT_POINTER_TYPE_UNKNOWN",   // index 0
    "HSA_EXT_POINTER_TYPE_HSA",       // index 1
    "HSA_EXT_POINTER_TYPE_LOCKED",    // index 2
    "HSA_EXT_POINTER_TYPE_GRAPHICS",  // index 3
    "HSA_EXT_POINTER_TYPE_IPC",       // index 4
};
hsa_status_t DumpPointerInfo(void* ptr) {
hsa_amd_pointer_info_t info;
@@ -484,14 +437,11 @@ hsa_status_t DumpPointerInfo(void* ptr) {
throw_if_error(err);
std::cout << "Info for ptr: " << ptr << std::endl;
std::cout << "CPU ptr: " << reinterpret_cast<void*>(info.hostBaseAddress) <<
std::endl;
std::cout << "GPU ptr: " << reinterpret_cast<void*>(info.agentBaseAddress)
<< std::endl;
std::cout << "CPU ptr: " << reinterpret_cast<void*>(info.hostBaseAddress) << std::endl;
std::cout << "GPU ptr: " << reinterpret_cast<void*>(info.agentBaseAddress) << std::endl;
std::cout << "Size: " << info.sizeInBytes << std::endl;
std::cout << "Type: " << Types[info.type] << std::endl;
std::cout << "UsrPtr " << reinterpret_cast<void*>(info.userData) <<
std::endl;
std::cout << "UsrPtr " << reinterpret_cast<void*>(info.userData) << std::endl;
std::cout << "Accessible by: ";
for (uint32_t i = 0; i < count; i++) {
@@ -503,7 +453,6 @@ hsa_status_t DumpPointerInfo(void* ptr) {
return HSA_STATUS_SUCCESS;
}
/*! \brief Writes to the buffer and increments the write pointer to the
* buffer. Also, ensures that the argument is written to an
* aligned memory as specified. Return the new write pointer.
+17 -18
查看文件
@@ -22,40 +22,39 @@ THE SOFTWARE.
#ifndef RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_
#define RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_
#include <signal.h>
#include <map>
#include <vector>
#include <string>
#include <vector>
#include "RdciSubSystem.h"
namespace amd {
namespace rdc {
/// RdciSubSystem specialisation for the "diag" operations (DIAG_HELP and
/// DIAG_RUN in the OPERATIONS enum below).
///
/// Overrides parse_cmd_opts() and process() from RdciSubSystem.
/// NOTE(review): presumably parse_cmd_opts() fills diag_ops_, group_id_ and
/// run_level_ and process() acts on them -- confirm against the .cc file.
class RdciDiagSubSystem: public RdciSubSystem {
class RdciDiagSubSystem : public RdciSubSystem {
public:
RdciDiagSubSystem();
~RdciDiagSubSystem();
void parse_cmd_opts(int argc, char ** argv) override;
void process() override;
RdciDiagSubSystem();
~RdciDiagSubSystem();
void parse_cmd_opts(int argc, char** argv) override;
void process() override;
private:
void show_help() const;
void show_help() const;
// Maps a diagnostic test case enumerator to a string.
std::string get_test_name(
rdc_diag_test_cases_t test_case) const;
std::string get_test_name(rdc_diag_test_cases_t test_case) const;
// Operation selected for this invocation; DIAG_UNKNOWN is the zero value.
enum OPERATIONS {
DIAG_UNKNOWN = 0,
DIAG_HELP,
DIAG_RUN,
} diag_ops_;
enum OPERATIONS {
DIAG_UNKNOWN = 0,
DIAG_HELP,
DIAG_RUN,
} diag_ops_;
rdc_gpu_group_t group_id_;
rdc_diag_level_t run_level_;
rdc_gpu_group_t group_id_;
rdc_diag_level_t run_level_;
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCIDIAGSUBSYSTEM_H_
+8 -9
查看文件
@@ -27,19 +27,18 @@ THE SOFTWARE.
namespace amd {
namespace rdc {
/// RdciSubSystem specialisation for discovery.
///
/// Overrides parse_cmd_opts() and process() from RdciSubSystem and keeps a
/// single flag, show_help_.
/// NOTE(review): presumably show_help_ is set while parsing options and makes
/// process() print the help text -- confirm against the .cc file.
class RdciDiscoverySubSystem: public RdciSubSystem {
class RdciDiscoverySubSystem : public RdciSubSystem {
public:
RdciDiscoverySubSystem();
void parse_cmd_opts(int argc, char ** argv) override;
void process() override;
private:
bool show_help_;
void show_help() const;
};
RdciDiscoverySubSystem();
void parse_cmd_opts(int argc, char** argv) override;
void process() override;
private:
bool show_help_;
void show_help() const;
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCIDISCOVERYSUBSYSTEM_H_
+34 -35
查看文件
@@ -22,58 +22,57 @@ THE SOFTWARE.
#ifndef RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_
#define RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_
#include <signal.h>
#include <map>
#include <vector>
#include "RdciSubSystem.h"
#include "RdciSubSystem.h"
namespace amd {
namespace rdc {
/// RdciSubSystem specialisation for the "dmon" operations (help, listing
/// fields, and monitoring -- see the OPERATIONS enum below).
///
/// Overrides parse_cmd_opts() and process() from RdciSubSystem. Numeric
/// options are stored in options_, keyed by the OPTIONS enum.
/// NOTE(review): is_terminating_ / set_terminating(int) look like a
/// signal-driven stop flag and its handler -- confirm against the .cc file.
class RdciDmonSubSystem: public RdciSubSystem {
class RdciDmonSubSystem : public RdciSubSystem {
public:
RdciDmonSubSystem();
~RdciDmonSubSystem();
void parse_cmd_opts(int argc, char ** argv) override;
void process() override;
RdciDmonSubSystem();
~RdciDmonSubSystem();
void parse_cmd_opts(int argc, char** argv) override;
void process() override;
private:
void show_help() const;
void show_field_usage() const;
void clean_up();
void show_help() const;
void show_field_usage() const;
void clean_up();
void create_temp_group();
void create_temp_field_group();
void create_temp_group();
void create_temp_field_group();
// Operation selected for this invocation; DMON_UNKNOWN is the zero value.
enum OPERATIONS {
DMON_UNKNOWN = 0,
DMON_HELP,
DMON_LIST_FIELDS,
DMON_LIST_ALL_FIELDS,
DMON_MONITOR
} dmon_ops_;
enum OPERATIONS {
DMON_UNKNOWN = 0,
DMON_HELP,
DMON_LIST_FIELDS,
DMON_LIST_ALL_FIELDS,
DMON_MONITOR
} dmon_ops_;
// Keys of the options_ map.
enum OPTIONS {
OPTIONS_UNKNOWN = 0,
OPTIONS_COUNT,
OPTIONS_DELAY,
OPTIONS_FIELD_GROUP_ID,
OPTIONS_GROUP_ID
};
enum OPTIONS {
OPTIONS_UNKNOWN = 0,
OPTIONS_COUNT,
OPTIONS_DELAY,
OPTIONS_FIELD_GROUP_ID,
OPTIONS_GROUP_ID
};
std::map<OPTIONS, uint32_t> options_;
std::vector<rdc_field_t> field_ids_;
std::vector<uint32_t> gpu_indexes_;
bool need_cleanup_;
uint64_t latest_time_stamp_;
bool show_timpstamps_;
static volatile sig_atomic_t is_terminating_;
static void set_terminating(int sig);
std::map<OPTIONS, uint32_t> options_;
std::vector<rdc_field_t> field_ids_;
std::vector<uint32_t> gpu_indexes_;
bool need_cleanup_;
uint64_t latest_time_stamp_;
// NOTE: the member name is spelled "timpstamps" in the source.
bool show_timpstamps_;
static volatile sig_atomic_t is_terminating_;
static void set_terminating(int sig);
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCIDMONSUBSYSTEM_H_
+19 -19
查看文件
@@ -23,37 +23,37 @@ THE SOFTWARE.
#define RDCI_INCLUDE_RDCIFIELDGROUPSUBSYSTEM_H_
#include <string>
#include "RdciSubSystem.h"
namespace amd {
namespace rdc {
/// RdciSubSystem specialisation for field-group operations (create, delete,
/// list, info and help -- see the OPERATIONS enum below).
///
/// Overrides parse_cmd_opts() and process() from RdciSubSystem.
/// NOTE(review): presumably parse_cmd_opts() fills field_group_ops_,
/// group_id_, group_name_ and field_ids_ for process() to act on -- confirm
/// against the .cc file.
class RdciFieldGroupSubSystem: public RdciSubSystem {
class RdciFieldGroupSubSystem : public RdciSubSystem {
public:
RdciFieldGroupSubSystem();
void parse_cmd_opts(int argc, char ** argv) override;
void process() override;
RdciFieldGroupSubSystem();
void parse_cmd_opts(int argc, char** argv) override;
void process() override;
private:
void show_help() const;
void show_help() const;
// Operation selected for this invocation; FIELD_GROUP_UNKNOWN is the zero value.
enum OPERATIONS {
FIELD_GROUP_UNKNOWN = 0,
FIELD_GROUP_HELP,
FIELD_GROUP_CREATE,
FIELD_GROUP_DELETE,
FIELD_GROUP_LIST,
FIELD_GROUP_INFO
} field_group_ops_;
enum OPERATIONS {
FIELD_GROUP_UNKNOWN = 0,
FIELD_GROUP_HELP,
FIELD_GROUP_CREATE,
FIELD_GROUP_DELETE,
FIELD_GROUP_LIST,
FIELD_GROUP_INFO
} field_group_ops_;
bool is_group_set_;
uint32_t group_id_;
std::string group_name_;
std::string field_ids_;
bool is_group_set_;
uint32_t group_id_;
std::string group_name_;
std::string field_ids_;
};
} // namespace rdc
} // namespace amd
#endif // RDCI_INCLUDE_RDCIFIELDGROUPSUBSYSTEM_H_

本差異變更的檔案數量過多導致部分檔案未顯示 顯示更多