345ac64a43
1. Add policy APIs 2. Add policy example for policy API usage Change-Id: I14deb7c809d0b865b7bb083842092fc37868025e Signed-off-by: Chao Fei <Chao.Fei@amd.com>
1269 строки
42 KiB
C++
1269 строки
42 KiB
C++
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef INCLUDE_RDC_RDC_H_
|
|
#define INCLUDE_RDC_RDC_H_
|
|
|
|
// Standard headers are included before the extern "C" block is opened so that
// declarations from the C++ standard library are never given C linkage.
#ifdef __cplusplus

// cstddef include causes issues on older GCC
// use stddef.h instead
#if __GNUC__ < 9
#include <stddef.h>
#else
#include <cstddef>
#endif  // __GNUC__

#include <cstdint>
#else
#include <stddef.h>
#include <stdint.h>
#endif  // __cplusplus

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

/** \file rdc.h
 *  Main header file for the ROCm RDC library.
 *  All required function, structure, enum, etc. definitions should be defined
 *  in this file.
 *
 *  @brief The rocm_rdc library api is new, and therefore subject to change
 *  either at the ABI or API level. Instead of marking every function prototype
 *  as "unstable", we are saying the API is unstable (i.e., changes
 *  are possible) while the major version remains 0. This means that if the
 *  API/ABI changes, we will not increment the major version to 1. Once the
 *  ABI stabilizes, we will increment the major version to 1, and thereafter
 *  increment it on all ABI breaks.
 */

/**
 * @brief Error codes returned by rocm_rdc_lib functions
 */
typedef enum {
  RDC_ST_OK = 0,            //!< Success
  RDC_ST_NOT_SUPPORTED,     //!< Not supported feature
  RDC_ST_MSI_ERROR,         //!< The MSI library error
  RDC_ST_FAIL_LOAD_MODULE,  //!< Fail to load the library
  RDC_ST_INVALID_HANDLER,   //!< Invalid handler
  RDC_ST_BAD_PARAMETER,     //!< A parameter is invalid
  RDC_ST_NOT_FOUND,         //!< Cannot find the value
  RDC_ST_CONFLICT,          //!< Conflict with current state
  RDC_ST_CLIENT_ERROR,      //!< The RDC client error
  RDC_ST_ALREADY_EXIST,     //!< The item already exists
  RDC_ST_MAX_LIMIT,         //!< Max limit recording for the object
  RDC_ST_INSUFF_RESOURCES,  //!< Not enough resources to complete
                            //!< operation
  RDC_ST_FILE_ERROR,        //!< Failed to access a file
  RDC_ST_NO_DATA,           //!< Data was requested,
                            //!< but none was found
  RDC_ST_PERM_ERROR,        //!< Insufficient permission to complete
                            //!< operation
  RDC_ST_DISABLED_MODULE,   //!< Attempted loading disabled module

  RDC_ST_UNKNOWN_ERROR = 0xFFFFFFFF  //!< Unknown error
} rdc_status_t;

/**
 * @brief rdc operation mode
 * rdc can run in auto mode where background threads will collect metrics.
 * When run in manual mode, the user needs to periodically call
 * rdc_field_update_all for data collection.
 */
typedef enum { RDC_OPERATION_MODE_AUTO = 0, RDC_OPERATION_MODE_MANUAL } rdc_operation_mode_t;

/**
 * @brief type of GPU group
 */
typedef enum {
  RDC_GROUP_DEFAULT = 0,  //!< All GPUs on the Node
  RDC_GROUP_EMPTY         //!< Empty group
} rdc_group_type_t;

/**
 * @brief the type stored in the field value
 */
typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;

//! ID used to represent an invalid GPU
#define GPU_ID_INVALID (-1)
//! Used to specify all GPUs
#define RDC_GROUP_ALL_GPUS (-1000)
//! Used to specify all stats fields
#define RDC_JOB_STATS_FIELDS (-1000)

/**
 * @brief The max rdc field string length
 */
#define RDC_MAX_STR_LENGTH 256

/**
 * @brief The max entities in a group
 */
#define RDC_GROUP_MAX_ENTITIES 64

/**
 * @brief Max number of GPUs supported by RDC
 */
#define RDC_MAX_NUM_DEVICES 128

/**
 * @brief The max fields in a field group
 */
#define RDC_MAX_FIELD_IDS_PER_FIELD_GROUP 128

/**
 * @brief The max number of groups
 */
#define RDC_MAX_NUM_GROUPS 64

/**
 * @brief The max number of the field groups
 */
#define RDC_MAX_NUM_FIELD_GROUPS 64

/**
 * @brief The max string length occupied by version information
 */
#define RDC_MAX_VERSION_STR_LENGTH 60

/**
 * These enums are used to specify a particular field to be retrieved.
 */
typedef enum {
  RDC_FI_INVALID = 0,  //!< Invalid field value
  /**
   * @brief Identifier fields
   */
  RDC_FI_GPU_COUNT = 1,  //!< GPU count in the system
  RDC_FI_DEV_NAME,       //!< Name of the device
  RDC_FI_OAM_ID,         //!< OAM ID of the device

  /**
   * @brief Frequency related fields
   */
  RDC_FI_GPU_CLOCK = 100,  //!< The current clock for the GPU
  RDC_FI_MEM_CLOCK,        //!< Clock for the memory

  /**
   * @brief Physical monitor fields
   */
  RDC_FI_MEMORY_TEMP = 200,  //!< Memory temperature for the device
  RDC_FI_GPU_TEMP,           //!< Current temperature for the device
  RDC_FI_POWER_USAGE = 300,  //!< Power usage for the device

  /**
   * @brief PCIe related fields
   */
  RDC_FI_PCIE_TX = 400,  //!< PCIe Tx utilization information
  RDC_FI_PCIE_RX,        //!< PCIe Rx utilization information
  // RDC_FI_PCIE_TX, RDC_FI_PCIE_RX are not supported on new ASIC
  // The RDC_FI_PCIE_BANDWIDTH should be used
  RDC_FI_PCIE_BANDWIDTH,  //!< PCIe bandwidth in GB/sec

  /**
   * @brief GPU usage related fields
   */
  RDC_FI_GPU_UTIL = 500,       //!< GPU Utilization
  RDC_FI_GPU_MEMORY_USAGE,     //!< Memory usage of the GPU instance
  RDC_FI_GPU_MEMORY_TOTAL,     //!< Total memory of the GPU instance
  RDC_FI_GPU_MM_ENC_UTIL,      //!< Multimedia encoder busy percentage
  RDC_FI_GPU_MM_DEC_UTIL,      //!< Multimedia decoder busy percentage
  RDC_FI_GPU_MEMORY_ACTIVITY,  //!< Memory busy percentage

  /**
   * @brief GPU page related fields
   */
  RDC_FI_GPU_PAGE_RETRIED = 550,  //!< Retired pages of the GPU instance

  /**
   * @brief ECC related fields
   */
  RDC_FI_ECC_CORRECT_TOTAL = 600,  //!< Accumulated correctable ECC errors
  RDC_FI_ECC_UNCORRECT_TOTAL,      //!< Accumulated uncorrectable ECC errors

  // Per-block ECC counters. Each block contributes a correctable (_CE) entry
  // immediately followed by its uncorrectable (_UE) entry; keep that pairing
  // when adding new blocks.
  RDC_FI_ECC_FIRST = 602,  //!< FIRST Error Correction and Detection field
  RDC_FI_ECC_SDMA_CE = RDC_FI_ECC_FIRST,
  RDC_FI_ECC_SDMA_UE,
  RDC_FI_ECC_GFX_CE,
  RDC_FI_ECC_GFX_UE,
  RDC_FI_ECC_MMHUB_CE,
  RDC_FI_ECC_MMHUB_UE,
  RDC_FI_ECC_ATHUB_CE,
  RDC_FI_ECC_ATHUB_UE,
  RDC_FI_ECC_PCIE_BIF_CE,
  RDC_FI_ECC_PCIE_BIF_UE,
  RDC_FI_ECC_HDP_CE,
  RDC_FI_ECC_HDP_UE,
  RDC_FI_ECC_XGMI_WAFL_CE,
  RDC_FI_ECC_XGMI_WAFL_UE,
  RDC_FI_ECC_DF_CE,
  RDC_FI_ECC_DF_UE,
  RDC_FI_ECC_SMN_CE,
  RDC_FI_ECC_SMN_UE,
  RDC_FI_ECC_SEM_CE,
  RDC_FI_ECC_SEM_UE,
  RDC_FI_ECC_MP0_CE,
  RDC_FI_ECC_MP0_UE,
  RDC_FI_ECC_MP1_CE,
  RDC_FI_ECC_MP1_UE,
  RDC_FI_ECC_FUSE_CE,
  RDC_FI_ECC_FUSE_UE,
  RDC_FI_ECC_UMC_CE,
  RDC_FI_ECC_UMC_UE,
  RDC_FI_ECC_MCA_CE,
  RDC_FI_ECC_MCA_UE,
  RDC_FI_ECC_VCN_CE,
  RDC_FI_ECC_VCN_UE,
  RDC_FI_ECC_JPEG_CE,
  RDC_FI_ECC_JPEG_UE,
  RDC_FI_ECC_IH_CE,
  RDC_FI_ECC_IH_UE,
  RDC_FI_ECC_MPIO_CE,
  RDC_FI_ECC_MPIO_UE,
  RDC_FI_ECC_LAST = RDC_FI_ECC_MPIO_UE,

  // On new ASICs, such as MI300, the XGMI events are not supported.
  // Use the XGMI related fields below to calculate the bandwidth.
  RDC_FI_XGMI_0_READ_KB = 700,  //!< XGMI_0 accumulated data read size (KB)
  RDC_FI_XGMI_1_READ_KB,        //!< XGMI_1 accumulated data read size (KB)
  RDC_FI_XGMI_2_READ_KB,        //!< XGMI_2 accumulated data read size (KB)
  RDC_FI_XGMI_3_READ_KB,        //!< XGMI_3 accumulated data read size (KB)
  RDC_FI_XGMI_4_READ_KB,        //!< XGMI_4 accumulated data read size (KB)
  RDC_FI_XGMI_5_READ_KB,        //!< XGMI_5 accumulated data read size (KB)
  RDC_FI_XGMI_6_READ_KB,        //!< XGMI_6 accumulated data read size (KB)
  RDC_FI_XGMI_7_READ_KB,        //!< XGMI_7 accumulated data read size (KB)

  RDC_FI_XGMI_0_WRITE_KB,      //!< XGMI_0 accumulated data write size (KB)
  RDC_FI_XGMI_1_WRITE_KB,      //!< XGMI_1 accumulated data write size (KB)
  RDC_FI_XGMI_2_WRITE_KB,      //!< XGMI_2 accumulated data write size (KB)
  RDC_FI_XGMI_3_WRITE_KB,      //!< XGMI_3 accumulated data write size (KB)
  RDC_FI_XGMI_4_WRITE_KB,      //!< XGMI_4 accumulated data write size (KB)
  RDC_FI_XGMI_5_WRITE_KB,      //!< XGMI_5 accumulated data write size (KB)
  RDC_FI_XGMI_6_WRITE_KB,      //!< XGMI_6 accumulated data write size (KB)
  RDC_FI_XGMI_7_WRITE_KB,      //!< XGMI_7 accumulated data write size (KB)
  RDC_FI_XGMI_TOTAL_READ_KB,   //!< XGMI_SUM accumulated data read size (KB)
  RDC_FI_XGMI_TOTAL_WRITE_KB,  //!< XGMI_SUM accumulated data write size (KB)

  /**
   * @brief ROC-profiler related fields
   */
  RDC_FI_PROF_OCCUPANCY_PERCENT = 800,
  RDC_FI_PROF_ACTIVE_CYCLES,
  RDC_FI_PROF_ACTIVE_WAVES,
  RDC_FI_PROF_ELAPSED_CYCLES,
  RDC_FI_PROF_TENSOR_ACTIVE_PERCENT,
  RDC_FI_PROF_GPU_UTIL_PERCENT,
  // metrics below are divided by time passed
  RDC_FI_PROF_EVAL_MEM_R_BW,
  RDC_FI_PROF_EVAL_MEM_W_BW,
  RDC_FI_PROF_EVAL_FLOPS_16,
  RDC_FI_PROF_EVAL_FLOPS_32,
  RDC_FI_PROF_EVAL_FLOPS_64,

  /**
   * @brief Raw XGMI counter events
   */
  RDC_EVNT_XGMI_0_NOP_TX = 1000,  //!< NOPs sent to neighbor 0
  RDC_EVNT_XGMI_0_REQ_TX,         //!< Outgoing requests to
                                  //!< neighbor 0
  RDC_EVNT_XGMI_0_RESP_TX,        //!< Outgoing responses to
                                  //!< neighbor 0
  /**
   * @brief
   *
   * Data beats sent to neighbor 0; Each beat represents 32 bytes.<br><br>
   *
   * XGMI throughput can be calculated by multiplying a BEATs event
   * such as ::RDC_EVNT_XGMI_0_BEATS_TX by 32 and dividing by
   * the time for which event collection occurred,
   * ::rdc_gpu_usage_info_t.start_time (which is in nanoseconds). To get
   * bytes per second, multiply this value by 10<sup>9</sup>.<br>
   * <br>
   * Throughput = BEATS * 32 / time_running * 10<sup>9</sup> (bytes/second)<br>
   */
  // ie, Throughput = BEATS * 32 / time_running * 10^9 bytes/sec
  RDC_EVNT_XGMI_0_BEATS_TX,
  RDC_EVNT_XGMI_1_NOP_TX,    //!< NOPs sent to neighbor 1
  RDC_EVNT_XGMI_1_REQ_TX,    //!< Outgoing requests to
                             //!< neighbor 1
  RDC_EVNT_XGMI_1_RESP_TX,   //!< Outgoing responses to
                             //!< neighbor 1
  RDC_EVNT_XGMI_1_BEATS_TX,  //!< Data beats sent to
                             //!< neighbor 1; Each beat
                             //!< represents 32 bytes

  // "Composite" events. These events have additional processing beyond
  // the value provided by the amd_smi library.
  RDC_EVNT_XGMI_0_THRPUT = 1500,  //!< Transmit throughput to XGMI
                                  //!< neighbor 0 in bytes/sec
  RDC_EVNT_XGMI_1_THRPUT,         //!< Transmit throughput to XGMI
                                  //!< neighbor 1 in bytes/sec
  RDC_EVNT_XGMI_2_THRPUT,         //!< Transmit throughput to XGMI
                                  //!< neighbor 2 in bytes/sec
  RDC_EVNT_XGMI_3_THRPUT,         //!< Transmit throughput to XGMI
                                  //!< neighbor 3 in bytes/sec
  RDC_EVNT_XGMI_4_THRPUT,         //!< Transmit throughput to XGMI
                                  //!< neighbor 4 in bytes/sec
  RDC_EVNT_XGMI_5_THRPUT,         //!< Transmit throughput to XGMI
                                  //!< neighbor 5 in bytes/sec

  RDC_EVNT_NOTIF_VMFAULT = 2000,  //!< VM page fault
  RDC_EVNT_NOTIF_FIRST = RDC_EVNT_NOTIF_VMFAULT,

  RDC_EVNT_NOTIF_THERMAL_THROTTLE,  //!< Clock frequency has decreased
                                    //!< due to temperature rise
  RDC_EVNT_NOTIF_PRE_RESET,         //!< GPU reset is about to occur
  RDC_EVNT_NOTIF_POST_RESET,        //!< GPU reset just occurred
  RDC_EVNT_NOTIF_RING_HANG,         //!< GPU ring hang just occurred

  RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_RING_HANG,
} rdc_field_t;

// even and odd numbers are used for correctable and uncorrectable errors
|
|
static_assert(RDC_FI_ECC_SDMA_CE % 2 == 0, "Correctable Error enum is not even");
|
|
static_assert(RDC_FI_ECC_SDMA_UE % 2 == 1, "Uncorrectable Error enum is not odd");
|
|
static_assert(RDC_FI_ECC_MPIO_CE % 2 == 0, "Correctable Error enum is not even");
|
|
static_assert(RDC_FI_ECC_MPIO_UE % 2 == 1, "Uncorrectable Error enum is not odd");
|
|
|
|
/**
 * @brief Evaluates to non-zero when FIELD lies within
 * [RDC_EVNT_NOTIF_FIRST, RDC_EVNT_NOTIF_LAST].
 *
 * FIELD is evaluated twice, so do not pass an expression with side effects.
 */
#define RDC_EVNT_IS_NOTIF_FIELD(FIELD) \
  ((FIELD) >= RDC_EVNT_NOTIF_FIRST && (FIELD) <= RDC_EVNT_NOTIF_LAST)
/**
 * @brief handlers used in various rdc calls
 */
typedef void* rdc_handle_t;        //!< Handle used for an RDC session
typedef uint32_t rdc_gpu_group_t;  //!< GPU Group ID type
typedef uint32_t rdc_field_grp_t;  //!< Field group ID type

/**
|
|
* @brief Represents attributes corresponding to a device
|
|
*/
|
|
typedef struct {
|
|
char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device.
|
|
} rdc_device_attributes_t;
|
|
|
|
/**
|
|
* @brief Store version information for each component
|
|
*/
|
|
typedef struct {
|
|
char version[RDC_MAX_VERSION_STR_LENGTH];
|
|
} rdc_component_version_t;
|
|
|
|
/**
|
|
* @brief The structure to store the group info
|
|
*/
|
|
typedef struct {
|
|
unsigned int count; //!< count of GPUs in the group
|
|
char group_name[RDC_MAX_STR_LENGTH]; //!< group name
|
|
/**
|
|
* The list of entities in the group
|
|
*/
|
|
uint32_t entity_ids[RDC_GROUP_MAX_ENTITIES];
|
|
} rdc_group_info_t;
|
|
|
|
/**
 * @brief The structure to store summary of data
 */
typedef struct {
  uint64_t max_value;         //!< Maximum value measured
  uint64_t min_value;         //!< Minimum value measured
  uint64_t average;           //!< Average value measured
  double standard_deviation;  //!< The standard deviation
} rdc_stats_summary_t;

/**
|
|
* @brief The structure to hold the GPU usage information
|
|
*/
|
|
typedef struct {
|
|
uint32_t gpu_id; //!< GPU_ID_INVALID for summary information
|
|
uint64_t start_time; //!< The time to start the watching
|
|
uint64_t end_time; //!< The time to stop the watching
|
|
|
|
uint64_t energy_consumed; //!< GPU Energy consumed
|
|
uint64_t ecc_correct; //!< Correctable errors
|
|
uint64_t ecc_uncorrect; //!< Uncorrectable errors
|
|
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
|
|
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
|
|
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
|
|
rdc_stats_summary_t gpu_clock; //!< GPU Clock speed stats
|
|
rdc_stats_summary_t memory_clock; //!< Mem. Clock speed stats
|
|
rdc_stats_summary_t gpu_utilization; //!< GPU Utilization stats
|
|
rdc_stats_summary_t gpu_temperature; //!< GPU temperature stats
|
|
|
|
uint64_t max_gpu_memory_used; //!< Maximum GPU memory used
|
|
rdc_stats_summary_t memory_utilization; //!< Memory Utilization statistics
|
|
} rdc_gpu_usage_info_t; //!< GPU usage statistics
|
|
|
|
/**
|
|
* @brief The structure to hold the job stats
|
|
*/
|
|
typedef struct {
|
|
uint32_t num_gpus; //!< Number of GPUs used by job
|
|
rdc_gpu_usage_info_t summary; //!< Job usage summary statistics
|
|
//!< (overall)
|
|
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU
|
|
} rdc_job_info_t;
|
|
|
|
/**
|
|
* @brief Field value data
|
|
*/
|
|
typedef union {
|
|
int64_t l_int;
|
|
double dbl;
|
|
char str[RDC_MAX_STR_LENGTH];
|
|
} rdc_field_value_data;
|
|
|
|
/**
|
|
* @brief The structure to store the field value
|
|
*/
|
|
typedef struct {
|
|
rdc_field_t field_id; //!< The field id of the value
|
|
int status; //!< RDC_ST_OK or error status
|
|
uint64_t ts; //!< Timestamp in usec since 1970
|
|
rdc_field_type_t type; //!< The field type
|
|
rdc_field_value_data value; //!< Value of the field. Value type
|
|
//!< depends on the field type.
|
|
} rdc_field_value;
|
|
|
|
/**
|
|
* @brief The structure to store the field group info
|
|
*/
|
|
typedef struct {
|
|
uint32_t count; //!< count of fields in the group
|
|
char group_name[RDC_MAX_STR_LENGTH]; //!< field group name
|
|
/**
|
|
* The list of fields in the group
|
|
*/
|
|
rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP];
|
|
} rdc_field_group_info_t;
|
|
|
|
/**
|
|
* @brief The structure to store the job info
|
|
*/
|
|
typedef struct {
|
|
char job_id[RDC_MAX_STR_LENGTH]; //!< job id
|
|
rdc_gpu_group_t group_id; //!< group name
|
|
uint64_t start_time; //!< job start time
|
|
uint64_t stop_time; //!< job stop time
|
|
} rdc_job_group_info_t;
|
|
|
|
/**
 * @brief type of diagnostic level
 */
typedef enum {
  RDC_DIAG_LVL_INVALID = 0,  //!< invalid level
  RDC_DIAG_LVL_SHORT,        //!< take a few seconds to run
  RDC_DIAG_LVL_MED,          //!< take less than 2 minutes to run
  RDC_DIAG_LVL_LONG          //!< take up to 15 minutes to run
} rdc_diag_level_t;

/**
 * @brief type of diagnostic result
 */
typedef enum {
  RDC_DIAG_RESULT_PASS,  //!< The diagnostic test pass
  RDC_DIAG_RESULT_SKIP,  //!< The diagnostic test skipped
  RDC_DIAG_RESULT_WARN,  //!< The diagnostic test has warnings
  RDC_DIAG_RESULT_FAIL   //!< The diagnostic test fail
} rdc_diag_result_t;

/**
 * @brief The test cases to run
 */
typedef enum {
  RDC_DIAG_TEST_FIRST = 0,  //!< Range marker: value of the first test case
  RDC_DIAG_COMPUTE_PROCESS = RDC_DIAG_TEST_FIRST,
  RDC_DIAG_COMPUTE_QUEUE,   //!< The Compute Queue is ready
  RDC_DIAG_SYS_MEM_CHECK,   //!< Check System memory
  RDC_DIAG_NODE_TOPOLOGY,   //!< Report node topology
  RDC_DIAG_RVS_TEST,        //!< TODO: Replace with real RVS tests
  RDC_DIAG_GPU_PARAMETERS,  //!< GPU parameters in range
  RDC_DIAG_TEST_LAST = RDC_DIAG_GPU_PARAMETERS  //!< Range marker: last test case
} rdc_diag_test_cases_t;

/**
 * @brief Type of Components
 */
typedef enum {
  RDC_AMDMSI_COMPONENT
  // If needed later, add them one by one
} rdc_component_t;

/**
 * @brief The maximum test cases to run
 */
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST + 1)

/**
 * @brief The maximum length of the diagnostic messages
 */
#define MAX_DIAG_MSG_LENGTH 4096

/**
|
|
* @brief details of the diagnostic errors
|
|
*/
|
|
typedef struct {
|
|
char msg[MAX_DIAG_MSG_LENGTH]; //!< The test result details
|
|
uint32_t code; //!< The low level error code
|
|
} rdc_diag_detail_t;
|
|
|
|
/**
|
|
* @brief details of the per gpu diagnostic results
|
|
*/
|
|
typedef struct {
|
|
uint32_t gpu_index; //!< The GPU index
|
|
rdc_diag_detail_t gpu_result; //!< The detail results
|
|
} rdc_diag_per_gpu_result_t;
|
|
|
|
/**
|
|
* @brief The diagnostic results for all GPUs
|
|
*/
|
|
typedef struct {
|
|
rdc_diag_result_t status; //!< The diagnostic result
|
|
rdc_diag_detail_t details; //!< The summary details
|
|
rdc_diag_test_cases_t test_case; //!< The test case to run
|
|
|
|
uint32_t per_gpu_result_count; //!< How many gpu_results
|
|
//!< Result details
|
|
rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES];
|
|
|
|
char info[MAX_DIAG_MSG_LENGTH]; //!< Detail information
|
|
} rdc_diag_test_result_t;
|
|
|
|
/**
|
|
* @brief The diagnostic responses for test cases
|
|
*/
|
|
typedef struct {
|
|
uint32_t results_count;
|
|
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
|
|
} rdc_diag_response_t;
|
|
|
|
/**
 * @brief The policy type to support
 */
typedef enum {
  RDC_POLICY_COND_MAX_PAGE_RETRIED,  //!< Max number of pages retired
  RDC_POLICY_COND_THERMAL,           //!< Temperature threshold, millidegree Celsius
  RDC_POLICY_COND_POWER,             //!< Power threshold, unit milliwatt
  RDC_POLICY_COND_MAX                //!< Number of condition types (not a condition itself)
} rdc_policy_condition_type_t;

typedef struct {
|
|
rdc_policy_condition_type_t type;
|
|
int64_t value;
|
|
} rdc_policy_condition_t;
|
|
|
|
/**
 * @brief The action a policy can take
 */
typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_action_t;

/**
|
|
* @brief The structure to define policy to enforce on GPU.
|
|
*/
|
|
typedef struct {
|
|
rdc_policy_condition_t condition; //!< condition to meet
|
|
rdc_policy_action_t action; //!< Action to take
|
|
} rdc_policy_t;
|
|
|
|
/**
|
|
* @brief Initialize ROCm RDC.
|
|
*
|
|
* @details When called, this initializes internal data structures,
|
|
* including those corresponding to sources of information that RDC provides.
|
|
* This must be called before rdc_start_embedded() or rdc_connect()
|
|
*
|
|
* @param[in] init_flags init_flags Bit flags that tell RDC how to initialize.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_init(uint64_t init_flags);
|
|
|
|
/**
|
|
* @brief Shutdown ROCm RDC.
|
|
*
|
|
* @details Do any necessary clean up.
|
|
*/
|
|
rdc_status_t rdc_shutdown();
|
|
|
|
/**
|
|
* @brief Start embedded RDC agent within this process.
|
|
*
|
|
* @details The RDC is loaded as library so that it does not require rdcd
|
|
* daemon. In this mode, the user has to periodically call
|
|
* rdc_field_update_all() when op_mode is RDC_OPERATION_MODE_MANUAL, which
|
|
* tells RDC to collect the stats.
|
|
*
|
|
* @param[in] op_mode Operation modes. When RDC_OPERATION_MODE_AUTO, RDC
|
|
* schedules background task to collect the stats. When
|
|
* RDC_OPERATION_MODE_MANUAL, the user needs to call rdc_field_update_all()
|
|
* periodically.
|
|
*
|
|
* @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon
|
|
* successful call, the value will contain the handler for following API calls.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, rdc_handle_t* p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Stop embedded RDC agent.
|
|
*
|
|
* @details Stop the embedded RDC agent, and p_rdc_handle becomes
|
|
* invalid after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler that come from
|
|
* rdc_start_embedded().
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Connect to rdcd daemon
|
|
*
|
|
* @details This method is used to connect to a remote stand-alone
|
|
* rdcd daemon.
|
|
*
|
|
* @param[in] ipAndPort The IP and port of the remote rdcd. The ipAndPort
|
|
* can be specified in this x.x.x.x:yyyy format, where x.x.x.x is the
|
|
* IP address and yyyy is the port.
|
|
*
|
|
* @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon
|
|
* successful call, the value will contain the handler
|
|
* for following API calls.
|
|
*
|
|
* @param [in] root_ca The root CA stored in the string in pem format. Set it
|
|
* as nullptr if the communication is not encrypted.
|
|
*
|
|
* @param [in] client_cert The client certificate stored in the string in pem
|
|
* format. Set it as nullptr if the communication is not encrypted.
|
|
*
|
|
* @param [in] client_key The client key stored in the string in pem format.
|
|
* Set it as nullptr if the communication is not encrypted.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_connect(const char* ipAndPort, rdc_handle_t* p_rdc_handle, const char* root_ca,
|
|
const char* client_cert, const char* client_key);
|
|
|
|
/**
|
|
* @brief Disconnect from rdcd daemon.
|
|
*
|
|
* @details Disconnect from rdcd daemon, and p_rdc_handle becomes invalid
|
|
* after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler that come from rdc_connect().
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Request the RDC to watch the job stats
|
|
*
|
|
* @details This should be executed as part of job prologue. The summary
|
|
* job stats can be retrieved using rdc_job_get_stats().
|
|
* In RDC_OPERATION_MODE_MANUAL, user must call rdc_field_update_all(1)
|
|
* at least once, before call rdc_job_get_stats()
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The group of GPUs to be watched.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @param[in] update_freq How often to update this field in usec.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
const char job_id[64], uint64_t update_freq);
|
|
|
|
/**
|
|
* @brief Get the stats of the job using the job id.
|
|
*
|
|
* @details The stats can be retrieved at any point when the job is in
|
|
* process.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @param[inout] p_job_info Caller provided pointer to rdc_job_info_t. Upon
|
|
* successful call, the value will contain the stats of the job.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, const char job_id[64],
|
|
rdc_job_info_t* p_job_info);
|
|
|
|
/**
|
|
* @brief Request RDC to stop watching the stats of the job
|
|
*
|
|
* @details This should be execute as part of job epilogue. The job Id
|
|
* remains available to view the stats at any point. You must call
|
|
* rdc_watch_job_fields() before this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, const char job_id[64]);
|
|
|
|
/**
|
|
* @brief Request RDC to stop tracking the job given by job_id
|
|
*
|
|
* @details After this call, you will no longer be able to call
|
|
* rdc_job_get_stats() on this job_id. But you will be able to reuse
|
|
* the job_id after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, const char job_id[64]);
|
|
|
|
/**
|
|
* @brief Request RDC to stop tracking all the jobs
|
|
*
|
|
* @details After this call, you will no longer be able to call
|
|
* rdc_job_get_stats() on any job id. But you will be able to reuse
|
|
* the any previous used job id after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Request RDC to update all fields to be watched.
|
|
*
|
|
* @details In RDC_OPERATION_MODE_MANUAL, the user must call this method
|
|
* periodically.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] wait_for_update Whether or not to wait for the update loop to
|
|
* complete before returning to the caller 1=wait. 0=do not wait.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, uint32_t wait_for_update);
|
|
|
|
/**
|
|
* @brief Get indexes corresponding to all the devices on the system.
|
|
*
|
|
* @details Indexes represents RDC GPU Id corresponding to each GPU on the
|
|
* system and is immutable during the lifespan of the engine. The list
|
|
* should be queried again if the engine is restarted.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[out] gpu_index_list Array reference to fill GPU indexes present on
|
|
* the system.
|
|
*
|
|
* @param[out] count Number of GPUs returned in gpu_index_list.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
|
|
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count);
|
|
|
|
/**
|
|
* @brief Gets device attributes corresponding to the gpu_index.
|
|
*
|
|
* @details Fetch the attributes, such as device name, of a GPU.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] gpu_index GPU index corresponding to which the attributes
|
|
* should be fetched
|
|
*
|
|
* @param[out] p_rdc_attr GPU attribute corresponding to the gpu_index.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
|
|
rdc_device_attributes_t* p_rdc_attr);
|
|
|
|
/**
|
|
* @brief Get version information of components used by rdc.
|
|
*
|
|
* @details Given a component type, return its version information.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] component Type of Components. See rdc_component_t definition for details.
|
|
*
|
|
* @param[out] p_rdc_compv Version information of the corresponding component.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, rdc_component_version_t* p_rdc_compv);
|
|
|
|
/**
|
|
* @brief Create a group contains multiple GPUs
|
|
*
|
|
* @details This method can create a group contains multiple GPUs. Instead of
|
|
* executing an operation separately for each GPU, the RDC group enables
|
|
* the user to execute same operation on all the GPUs present in the group as
|
|
* a single API call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] type The type of the group. RDC_GROUP_DEFAULT includes all the
|
|
* GPUs on the node, and RDC_GROUP_EMPTY creates an empty group.
|
|
*
|
|
* @param[in] group_name The group name specified as NULL terminated C String
|
|
*
|
|
* @param[inout] p_rdc_group_id Caller provided pointer to rdc_gpu_group_t.
|
|
* Upon successful call, the value will contain the group id for following
|
|
* group API calls.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, rdc_group_type_t type,
|
|
const char* group_name, rdc_gpu_group_t* p_rdc_group_id);
|
|
|
|
/**
|
|
* @brief Add a GPU to the group
|
|
*
|
|
* @details This method can add a GPU to the group
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The group id to which the GPU will be added.
|
|
*
|
|
* @param[in] gpu_index The GPU index to be added to the group.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
uint32_t gpu_index);
|
|
|
|
/**
|
|
* @brief Get information about a GPU group
|
|
*
|
|
* @details Get detail information about a GPU group created by
|
|
* rdc_group_gpu_create
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] p_rdc_group_id The GPU group handler created by
|
|
* rdc_group_gpu_create
|
|
*
|
|
* @param[out] p_rdc_group_info The information of the GPU
|
|
* group p_rdc_group_id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id,
|
|
rdc_group_info_t* p_rdc_group_info);
|
|
|
|
/**
|
|
* @brief Used to get information about all GPU groups in the system.
|
|
*
|
|
* @details Get the list of GPU group ids in the system.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[out] group_id_list Array reference to fill GPU group
|
|
* ids in the system.
|
|
*
|
|
* @param[out] count Number of GPU group returned in group_id_list.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id_list[],
|
|
uint32_t* count);
|
|
|
|
/**
|
|
* @brief Destroy GPU group represented by p_rdc_group_id
|
|
*
|
|
* @details Delete the logic group represented by p_rdc_group_id
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] p_rdc_group_id The group id
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id);
|
|
|
|
/**
|
|
* @brief create a group of fields
|
|
*
|
|
* @details The user can create a group of fields and perform an operation
|
|
* on a group of fields at once.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] num_field_ids Number of field IDs that are being provided
|
|
* in field_ids.
|
|
*
|
|
* @param[in] field_ids Field IDs to be added to the newly-created
|
|
* field group.
|
|
*
|
|
* @param[in] field_group_name Unique name for this group of fields.
|
|
*
|
|
* @param[out] rdc_field_group_id Handle to the newly-created field group
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, uint32_t num_field_ids,
|
|
rdc_field_t* field_ids, const char* field_group_name,
|
|
rdc_field_grp_t* rdc_field_group_id);
|
|
|
|
/**
|
|
* @brief Get information about a field group
|
|
*
|
|
* @details Get detail information about a field group created by
|
|
* rdc_group_field_create
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] rdc_field_group_id The field group handler created by
|
|
* rdc_group_field_create
|
|
*
|
|
* @param[out] field_group_info The information of the field group
|
|
* rdc_field_group_id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id,
|
|
rdc_field_group_info_t* field_group_info);
|
|
|
|
/**
|
|
* @brief Used to get information about all field groups in the system.
|
|
*
|
|
* @details Get the list of field group ids in the system.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[out] field_group_id_list Array reference to fill field group
|
|
* ids in the system.
|
|
*
|
|
* @param[out] count Number of field group returned in field_group_id_list.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_get_all_ids(rdc_handle_t p_rdc_handle,
|
|
rdc_field_grp_t field_group_id_list[], uint32_t* count);
|
|
|
|
/**
|
|
* @brief Destroy field group represented by rdc_field_group_id
|
|
*
|
|
* @details Delete the logic group represented by rdc_field_group_id
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] rdc_field_group_id The field group id
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id);
|
|
|
|
/**
|
|
* @brief Request the RDC start recording updates for a given field
|
|
* collection.
|
|
*
|
|
* @details Note that the first update of the field will not occur
|
|
* until the next field update cycle. To force a field update cycle,
|
|
* user must call rdc_field_update_all(1)
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The group of GPUs to be watched.
|
|
*
|
|
* @param[in] field_group_id The collection of fields to record
|
|
*
|
|
* @param[in] update_freq How often to update fields in usec.
|
|
*
|
|
* @param[in] max_keep_age How long to keep data for fields in seconds.
|
|
*
|
|
* @param[in] max_keep_samples Maximum number of samples to keep. 0=no limit.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_field_grp_t field_group_id, uint64_t update_freq,
|
|
double max_keep_age, uint32_t max_keep_samples);
|
|
|
|
/**
|
|
* @brief Request a latest cached field of a GPU
|
|
*
|
|
* @details Note that the field can be cached after called rdc_field_watch
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] gpu_index The GPU index.
|
|
*
|
|
* @param[in] field The field id
|
|
*
|
|
* @param[out] value The field value got from cache.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
|
|
rdc_field_t field, rdc_field_value* value);
|
|
|
|
/**
|
|
* @brief Request a history cached field of a GPU
|
|
*
|
|
* @details Note that the field can be cached after called rdc_field_watch
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] gpu_index The GPU index.
|
|
*
|
|
* @param[in] field The field id
|
|
*
|
|
* @param[in] since_time_stamp Timestamp to request values since in
|
|
* usec since 1970.
|
|
*
|
|
* @param[out] next_since_time_stamp Timestamp to use for sinceTimestamp
|
|
* on next call to this function
|
|
*
|
|
* @param[out] value The field value got from cache.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
|
|
rdc_field_t field, uint64_t since_time_stamp,
|
|
uint64_t* next_since_time_stamp, rdc_field_value* value);
|
|
|
|
/**
|
|
* @brief Stop record updates for a given field collection.
|
|
*
|
|
* @details The cache of those fields will not be updated after this call
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] field_group_id The field group id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_field_grp_t field_group_id);
|
|
|
|
/**
|
|
* @brief Run the diagnostic test cases
|
|
*
|
|
* @details Run the diagnostic test cases at different levels.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] level The level decides how long the test will run.
|
|
* The RDC_DIAG_LVL_SHORT only take a few seconds, and the
|
|
* the RDC_DIAG_LVL_LONG may take up to 15 minutes.
|
|
*
|
|
* @param[in] config Implementation specific configuration.
|
|
*
|
|
* @param[in] config_size Length of the configuration.
|
|
*
|
|
* @param[inout] response The detail results of the tests run.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_diag_level_t level, const char* config, size_t config_size,
|
|
rdc_diag_response_t* response);
|
|
|
|
/**
|
|
* @brief Run one diagnostic test case
|
|
*
|
|
* @details Run a specific diagnostic test case.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] test_case The test case to run.
|
|
*
|
|
* @param[in] config Implementation specific configuration.
|
|
*
|
|
* @param[in] config_size Length of the configuration.
|
|
*
|
|
* @param[inout] result The results of the test.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_diag_test_cases_t test_case, const char* config,
|
|
size_t config_size, rdc_diag_test_result_t* result);
|
|
|
|
/**
|
|
* @brief Get a description of a provided RDC error status
|
|
*
|
|
* @details return the string in human readable format.
|
|
*
|
|
* @param[in] status The RDC status.
|
|
*
|
|
* @retval The string to describe the RDC status.
|
|
*/
|
|
const char* rdc_status_string(rdc_status_t status);
|
|
|
|
/**
|
|
* @brief Get the name of a field
|
|
*
|
|
* @details return the string in human readable format.
|
|
*
|
|
* @param[in] field_id The field id.
|
|
*
|
|
* @retval The string to describe the field.
|
|
*/
|
|
const char* field_id_string(rdc_field_t field_id);
|
|
|
|
/**
|
|
* @brief Get the field id from name
|
|
*
|
|
* @details return the field id from field name.
|
|
*
|
|
* @param[in] name The field name.
|
|
*
|
|
* @retval return RDC_FI_INVALID if the field name is invalid.
|
|
*/
|
|
rdc_field_t get_field_id_from_name(const char* name);
|
|
|
|
/**
|
|
* @brief Get a description of a diagnostic result.
|
|
*
|
|
* @details return the string in human readable format.
|
|
*
|
|
* @param[in] result The RDC diagnostic result.
|
|
*
|
|
* @retval The string to describe the RDC diagnostic result.
|
|
*/
|
|
const char* rdc_diagnostic_result_string(rdc_diag_result_t result);
|
|
|
|
/**
|
|
* @brief Set the RDC policy. Each group has multiple policies, these policies can be set by this
|
|
* API one by one. Multiple calls of this API will override the existing policy.
|
|
*
|
|
* @details Set the RDC policy
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] policy The policy to set
|
|
*
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_policy_t policy);
|
|
|
|
/**
 * Number of elements in the policies array parameter of rdc_policy_get.
 */
#define RDC_MAX_POLICY_SETTINGS 32
|
|
|
|
/**
|
|
* @brief Get the RDC policy
|
|
*
|
|
* @details Get the RDC policy
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[out] count The size of policies array
|
|
*
|
|
* @param[out] policies The policies to get
|
|
*
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count,
|
|
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]);
|
|
|
|
/**
|
|
* @brief delete the RDC policy for this group based on condition type
|
|
*
|
|
* @details clear the RDC policy for this group based on condition type. In a GPU group, only one
|
|
* policy can be set for a specific rdc_policy_condition_type_t
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id
|
|
*
|
|
* @param[in] condition_type The condition type to delete
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
|
|
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_policy_condition_type_t condition_type);
|
|
|
|
/**
|
|
* Define the structure is used in RDC policy callback
|
|
*/
|
|
typedef struct {
|
|
unsigned int version;
|
|
rdc_policy_condition_t condition; //!< the condition that is meet
|
|
rdc_gpu_group_t group_id; //!< The group id trigger this callback
|
|
int64_t value; //!< The current value that meet the condition
|
|
} rdc_policy_callback_response_t;
|
|
|
|
/**
|
|
* The user data is the rdc_policy_callback_response_t
|
|
*/
|
|
typedef int (*rdc_policy_register_callback)(rdc_policy_callback_response_t* userData);
|
|
|
|
/**
|
|
* @brief Register a function to be called when policy condition is meet.
|
|
*
|
|
* @details Register the RDC policy callback
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] callback The callback function to be called when condition meet.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_policy_register_callback callback);
|
|
|
|
/**
|
|
* @brief un-register a policy callback function for a conditioin.
|
|
*
|
|
* @details Un-register the policy callback for a condition.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif // __cplusplus
|
|
|
|
#endif // INCLUDE_RDC_RDC_H_
|