76e9846bb1
Change-Id: Ib68f9909f2a6e0a1e5764298f1012a2bcf7ce1fc Signed-off-by: adapryor <Adam.pryor@amd.com>
1828 строки
61 KiB
C++
1828 строки
61 KiB
C++
/*
|
|
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef INCLUDE_RDC_RDC_H_
|
|
#define INCLUDE_RDC_RDC_H_
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif // __cplusplus
|
|
|
|
#ifdef __cplusplus
|
|
|
|
// cstddef include causes issues on older GCC
|
|
// use stddef.h instead
|
|
#if __GNUC__ < 9
|
|
#include <stddef.h>
|
|
#else
|
|
#include <cstddef>
|
|
#endif // __GNUC__
|
|
|
|
#include <cstdint>
|
|
#else
|
|
#include <assert.h>
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#endif // __cplusplus
|
|
|
|
/** \file rdc.h
|
|
* Main header file for the ROCm RDC library.
|
|
* All required function, structure, enum, etc. definitions should be defined
|
|
* in this file.
|
|
*
|
|
* @brief The rocm_rdc library api is new, and therefore subject to change
|
|
* either at the ABI or API level. Instead of marking every function prototype
|
|
* as "unstable", we are instead saying the API is unstable (i.e., changes
|
|
* are possible) while the major version remains 0. This means that if the
|
|
* API/ABI changes, we will not increment the major version to 1. Once the
|
|
* ABI stabilizes, we will increment the major version to 1, and thereafter
|
|
* increment it on all ABI breaks.
|
|
*/
|
|
|
|
/**
|
|
* @brief Error codes returned by rocm_rdc_lib functions
|
|
*/
|
|
typedef enum {
|
|
RDC_ST_OK = 0, //!< Success
|
|
RDC_ST_NOT_SUPPORTED, //!< Not supported feature
|
|
RDC_ST_SMI_ERROR, //!< The SMI library error
|
|
RDC_ST_FAIL_LOAD_MODULE, //!< Fail to load the library
|
|
RDC_ST_INVALID_HANDLER, //!< Invalid handler
|
|
RDC_ST_BAD_PARAMETER, //!< A parameter is invalid
|
|
RDC_ST_NOT_FOUND, //!< Cannot find the value
|
|
RDC_ST_CONFLICT, //!< Conflict with current state
|
|
RDC_ST_CLIENT_ERROR, //!< The RDC client error
|
|
RDC_ST_ALREADY_EXIST, //!< The item already exists
|
|
RDC_ST_MAX_LIMIT, //!< Max limit recording for the object
|
|
RDC_ST_INSUFF_RESOURCES, //!< Not enough resources to complete
|
|
//!< operation
|
|
RDC_ST_FILE_ERROR, //!< Failed to access a file
|
|
RDC_ST_NO_DATA, //!< Data was requested,
|
|
//!< but none was found
|
|
RDC_ST_PERM_ERROR, //!< Insufficient permission to complete
|
|
//!< operation
|
|
RDC_ST_CORRUPTED_EEPROM, //!< EEPROM is corrupted
|
|
RDC_ST_DISABLED_MODULE, //!< Attempted loading disabled module
|
|
|
|
RDC_ST_UNKNOWN_ERROR = 0xFFFFFFFF //!< Unknown error
|
|
} rdc_status_t;
|
|
|
|
/**
|
|
* @brief rdc operation mode
|
|
* rdc can run in auto mode where background threads will collect metrics.
|
|
* When run in manual mode, the user needs to periodically call
|
|
* rdc_field_update_all for data collection.
|
|
*/
|
|
typedef enum { RDC_OPERATION_MODE_AUTO = 0, RDC_OPERATION_MODE_MANUAL } rdc_operation_mode_t;
|
|
|
|
/**
|
|
* @brief type of GPU group
|
|
*/
|
|
typedef enum {
|
|
RDC_GROUP_DEFAULT = 0, //!< All GPUs on the Node
|
|
RDC_GROUP_EMPTY //!< Empty group
|
|
} rdc_group_type_t;
|
|
|
|
/**
|
|
* @brief the type stored in the filed value
|
|
*/
|
|
typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t;
|
|
|
|
//! ID used to represent an invalid GPU
|
|
#define GPU_ID_INVALID (-1)
|
|
//! Used to specify all GPUs
|
|
#define RDC_GROUP_ALL_GPUS (-1000)
|
|
//! Used to specify all stats fields
|
|
#define RDC_JOB_STATS_FIELDS (-1000)
|
|
|
|
/**
|
|
* @brief The max rdc field string length
|
|
*/
|
|
#define RDC_MAX_STR_LENGTH 256
|
|
|
|
/**
|
|
* @brief The max entities in a group
|
|
*/
|
|
#define RDC_GROUP_MAX_ENTITIES 64
|
|
|
|
/**
|
|
* @brief Max number of GPUs supported by RDC
|
|
*/
|
|
#define RDC_MAX_NUM_DEVICES 128
|
|
|
|
/**
|
|
* @brief Max number of partitions
|
|
*/
|
|
#define RDC_MAX_NUM_PARTITIONS 8
|
|
|
|
/**
|
|
* @brief The max fields in a field group
|
|
*/
|
|
#define RDC_MAX_FIELD_IDS_PER_FIELD_GROUP 128
|
|
|
|
/**
|
|
* @brief The max number of groups
|
|
*/
|
|
#define RDC_MAX_NUM_GROUPS 64
|
|
|
|
/**
|
|
* @brief The max number of the field groups
|
|
*/
|
|
#define RDC_MAX_NUM_FIELD_GROUPS 64
|
|
|
|
/**
|
|
* @brief The max string length occupied by version information
|
|
*/
|
|
#define RDC_MAX_VERSION_STR_LENGTH 60
|
|
|
|
/**
|
|
* @brief Max configuration can be collected using the configuration get
|
|
*/
|
|
#define RDC_MAX_CONFIG_SETTINGS 32
|
|
|
|
/**
|
|
* These enums are used to specify a particular field to be retrieved.
|
|
*/
|
|
typedef enum {
|
|
RDC_FI_INVALID = 0, //!< Invalid field value
|
|
//!< @brief Identifier fields
|
|
RDC_FI_GPU_COUNT = 1, //!< GPU count in the system
|
|
RDC_FI_DEV_NAME, //!< Name of the device
|
|
RDC_FI_OAM_ID, //!< OAM ID of the device
|
|
RDC_FI_DEV_ID, //!< Device ID
|
|
RDC_FI_REV_ID, //!<
|
|
RDC_FI_TARGET_GRAPHICS_VERSION, //!< Target graphics version
|
|
RDC_FI_NUM_OF_COMPUTE_UNITS, //!< Number of compute units
|
|
RDC_FI_UUID, //!< Device UUID
|
|
|
|
/**
|
|
* @brief Frequency related fields
|
|
*/
|
|
RDC_FI_GPU_CLOCK = 100, //!< The current clock for the GPU
|
|
RDC_FI_MEM_CLOCK, //!< Clock for the memory
|
|
|
|
/*
|
|
* @brief Physical monitor fields
|
|
*/
|
|
RDC_FI_MEMORY_TEMP = 200, //!< Memory temperature for the device
|
|
RDC_FI_GPU_TEMP, //!< Current temperature for the device
|
|
RDC_FI_POWER_USAGE = 300, //!< Power usage for the device
|
|
|
|
/**
|
|
* @brief PCIe related fields
|
|
*/
|
|
RDC_FI_PCIE_TX = 400, //!< PCIe Tx utilization information
|
|
RDC_FI_PCIE_RX, //!< PCIe Rx utilization information
|
|
// RDC_FI_PCIE_TX, RDC_FI_PCIE_RX are not supported on new ASIC
|
|
// The RDC_FI_PCIE_BANDWIDTH should be used
|
|
RDC_FI_PCIE_BANDWIDTH, //!< PCIe bandwidth in Mbps
|
|
|
|
/**
|
|
* @brief GPU usage related fields
|
|
*/
|
|
RDC_FI_GPU_UTIL = 500, //!< GPU Utilization
|
|
RDC_FI_GPU_MEMORY_USAGE, //!< Memory usage of the GPU instance
|
|
RDC_FI_GPU_MEMORY_TOTAL, //!< Total memory of the GPU instance
|
|
RDC_FI_GPU_MM_ENC_UTIL, //!< Multimedia encoder busy percentage
|
|
RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage
|
|
RDC_FI_GPU_MEMORY_ACTIVITY, //!< Memory busy percentage
|
|
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, //<! The Memory max bandwidth at current memory clock in
|
|
// Mb/Second
|
|
RDC_FI_GPU_MEMORY_CUR_BANDWIDTH, //<! The Memory current bandwidth in Mb/Second
|
|
RDC_FI_GPU_BUSY_PERCENT, //<! The GPU busy percentage
|
|
|
|
/**
|
|
* @brief GPU page related fields
|
|
*/
|
|
RDC_FI_GPU_PAGE_RETRIED = 550, //!< Retried page of the GPU instance
|
|
/**
|
|
* @brief ECC related fields
|
|
*/
|
|
RDC_FI_ECC_CORRECT_TOTAL = 600, //!< Accumulated correctable ECC errors
|
|
RDC_FI_ECC_UNCORRECT_TOTAL, //!< Accumulated uncorrectable ECC errors
|
|
|
|
RDC_FI_ECC_FIRST = 602, //!< FIRST Error Correction and Detection field
|
|
RDC_FI_ECC_SDMA_CE = RDC_FI_ECC_FIRST,
|
|
RDC_FI_ECC_SDMA_UE,
|
|
RDC_FI_ECC_GFX_CE,
|
|
RDC_FI_ECC_GFX_UE,
|
|
RDC_FI_ECC_MMHUB_CE,
|
|
RDC_FI_ECC_MMHUB_UE,
|
|
RDC_FI_ECC_ATHUB_CE,
|
|
RDC_FI_ECC_ATHUB_UE,
|
|
RDC_FI_ECC_PCIE_BIF_CE,
|
|
RDC_FI_ECC_PCIE_BIF_UE,
|
|
RDC_FI_ECC_HDP_CE,
|
|
RDC_FI_ECC_HDP_UE,
|
|
RDC_FI_ECC_XGMI_WAFL_CE,
|
|
RDC_FI_ECC_XGMI_WAFL_UE,
|
|
RDC_FI_ECC_DF_CE,
|
|
RDC_FI_ECC_DF_UE,
|
|
RDC_FI_ECC_SMN_CE,
|
|
RDC_FI_ECC_SMN_UE,
|
|
RDC_FI_ECC_SEM_CE,
|
|
RDC_FI_ECC_SEM_UE,
|
|
RDC_FI_ECC_MP0_CE,
|
|
RDC_FI_ECC_MP0_UE,
|
|
RDC_FI_ECC_MP1_CE,
|
|
RDC_FI_ECC_MP1_UE,
|
|
RDC_FI_ECC_FUSE_CE,
|
|
RDC_FI_ECC_FUSE_UE,
|
|
RDC_FI_ECC_UMC_CE,
|
|
RDC_FI_ECC_UMC_UE,
|
|
RDC_FI_ECC_MCA_CE,
|
|
RDC_FI_ECC_MCA_UE,
|
|
RDC_FI_ECC_VCN_CE,
|
|
RDC_FI_ECC_VCN_UE,
|
|
RDC_FI_ECC_JPEG_CE,
|
|
RDC_FI_ECC_JPEG_UE,
|
|
RDC_FI_ECC_IH_CE,
|
|
RDC_FI_ECC_IH_UE,
|
|
RDC_FI_ECC_MPIO_CE,
|
|
RDC_FI_ECC_MPIO_UE,
|
|
RDC_FI_ECC_LAST = RDC_FI_ECC_MPIO_UE,
|
|
|
|
// In new ASCI, such as MI300, the XGMI events is not supported
|
|
// Using below XGMI related fields to calculate the bandwidth.
|
|
RDC_FI_XGMI_0_READ_KB = 700, //!< XGMI_0 accumulated data read size (KB)
|
|
RDC_FI_XGMI_1_READ_KB, //!< XGMI_1 accumulated data read size (KB)
|
|
RDC_FI_XGMI_2_READ_KB, //!< XGMI_2 accumulated data read size (KB)
|
|
RDC_FI_XGMI_3_READ_KB, //!< XGMI_3 accumulated data read size (KB)
|
|
RDC_FI_XGMI_4_READ_KB, //!< XGMI_4 accumulated data read size (KB)
|
|
RDC_FI_XGMI_5_READ_KB, //!< XGMI_5 accumulated data read size (KB)
|
|
RDC_FI_XGMI_6_READ_KB, //!< XGMI_6 accumulated data read size (KB)
|
|
RDC_FI_XGMI_7_READ_KB, //!< XGMI_7 accumulated data read size (KB)
|
|
|
|
RDC_FI_XGMI_0_WRITE_KB, //!< XGMI_0 accumulated data write size (KB)
|
|
RDC_FI_XGMI_1_WRITE_KB, //!< XGMI_1 accumulated data write size (KB)
|
|
RDC_FI_XGMI_2_WRITE_KB, //!< XGMI_2 accumulated data write size (KB)
|
|
RDC_FI_XGMI_3_WRITE_KB, //!< XGMI_3 accumulated data write size (KB)
|
|
RDC_FI_XGMI_4_WRITE_KB, //!< XGMI_4 accumulated data write size (KB)
|
|
RDC_FI_XGMI_5_WRITE_KB, //!< XGMI_5 accumulated data write size (KB)
|
|
RDC_FI_XGMI_6_WRITE_KB, //!< XGMI_6 accumulated data write size (KB)
|
|
RDC_FI_XGMI_7_WRITE_KB, //!< XGMI_7 accumulated data write size (KB)
|
|
RDC_FI_XGMI_TOTAL_READ_KB, //!< XGMI_SUM accumulated data read size (KB)
|
|
RDC_FI_XGMI_TOTAL_WRITE_KB, //!< XGMI_SUM accumulated data write size (KB)
|
|
|
|
/**
|
|
* @brief ROC-profiler related fields
|
|
*/
|
|
RDC_FI_PROF_OCCUPANCY_PERCENT = 800,
|
|
RDC_FI_PROF_ACTIVE_CYCLES,
|
|
RDC_FI_PROF_ACTIVE_WAVES,
|
|
RDC_FI_PROF_ELAPSED_CYCLES,
|
|
RDC_FI_PROF_TENSOR_ACTIVE_PERCENT,
|
|
RDC_FI_PROF_GPU_UTIL_PERCENT,
|
|
// metrics with EVAL are divided by time passed
|
|
RDC_FI_PROF_EVAL_MEM_R_BW,
|
|
RDC_FI_PROF_EVAL_MEM_W_BW,
|
|
RDC_FI_PROF_EVAL_FLOPS_16,
|
|
RDC_FI_PROF_EVAL_FLOPS_32,
|
|
RDC_FI_PROF_EVAL_FLOPS_64,
|
|
RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL,
|
|
RDC_FI_PROF_SM_ACTIVE,
|
|
RDC_FI_PROF_OCC_PER_ACTIVE_CU,
|
|
RDC_FI_PROF_OCC_ELAPSED,
|
|
RDC_FI_PROF_EVAL_FLOPS_16_PERCENT,
|
|
RDC_FI_PROF_EVAL_FLOPS_32_PERCENT,
|
|
RDC_FI_PROF_EVAL_FLOPS_64_PERCENT,
|
|
// CPC
|
|
RDC_FI_PROF_CPC_CPC_STAT_BUSY,
|
|
RDC_FI_PROF_CPC_CPC_STAT_IDLE,
|
|
RDC_FI_PROF_CPC_CPC_STAT_STALL,
|
|
RDC_FI_PROF_CPC_CPC_TCIU_BUSY,
|
|
RDC_FI_PROF_CPC_CPC_TCIU_IDLE,
|
|
RDC_FI_PROF_CPC_CPC_UTCL2IU_BUSY,
|
|
RDC_FI_PROF_CPC_CPC_UTCL2IU_IDLE,
|
|
RDC_FI_PROF_CPC_CPC_UTCL2IU_STALL,
|
|
RDC_FI_PROF_CPC_ME1_BUSY_FOR_PACKET_DECODE,
|
|
RDC_FI_PROF_CPC_ME1_DC0_SPI_BUSY,
|
|
RDC_FI_PROF_CPC_UTCL1_STALL_ON_TRANSLATION,
|
|
RDC_FI_PROF_CPC_ALWAYS_COUNT,
|
|
RDC_FI_PROF_CPC_ADC_VALID_CHUNK_NOT_AVAIL,
|
|
RDC_FI_PROF_CPC_ADC_DISPATCH_ALLOC_DONE,
|
|
RDC_FI_PROF_CPC_ADC_VALID_CHUNK_END,
|
|
RDC_FI_PROF_CPC_SYNC_FIFO_FULL_LEVEL,
|
|
RDC_FI_PROF_CPC_SYNC_FIFO_FULL,
|
|
RDC_FI_PROF_CPC_GD_BUSY,
|
|
RDC_FI_PROF_CPC_TG_SEND,
|
|
RDC_FI_PROF_CPC_WALK_NEXT_CHUNK,
|
|
RDC_FI_PROF_CPC_STALLED_BY_SE0_SPI,
|
|
RDC_FI_PROF_CPC_STALLED_BY_SE1_SPI,
|
|
RDC_FI_PROF_CPC_STALLED_BY_SE2_SPI,
|
|
RDC_FI_PROF_CPC_STALLED_BY_SE3_SPI,
|
|
RDC_FI_PROF_CPC_LTE_ALL,
|
|
RDC_FI_PROF_CPC_SYNC_WRREQ_FIFO_BUSY,
|
|
RDC_FI_PROF_CPC_CANE_BUSY,
|
|
RDC_FI_PROF_CPC_CANE_STALL,
|
|
// CPF
|
|
RDC_FI_PROF_CPF_CMP_UTCL1_STALL_ON_TRANSLATION,
|
|
RDC_FI_PROF_CPF_CPF_STAT_BUSY,
|
|
RDC_FI_PROF_CPF_CPF_STAT_IDLE,
|
|
RDC_FI_PROF_CPF_CPF_STAT_STALL,
|
|
RDC_FI_PROF_CPF_CPF_TCIU_BUSY,
|
|
RDC_FI_PROF_CPF_CPF_TCIU_IDLE,
|
|
RDC_FI_PROF_CPF_CPF_TCIU_STALL,
|
|
RDC_FI_PROF_SIMD_UTILIZATION,
|
|
RDC_FI_PROF_UUID,
|
|
|
|
/**
|
|
* @brief Raw XGMI counter events
|
|
*/
|
|
RDC_EVNT_XGMI_0_NOP_TX = 1000, //!< NOPs sent to neighbor 0
|
|
RDC_EVNT_XGMI_0_REQ_TX, //!< Outgoing requests to
|
|
//!< neighbor 0
|
|
RDC_EVNT_XGMI_0_RESP_TX, //!< Outgoing responses to
|
|
//!< neighbor 0
|
|
/**
|
|
* @brief
|
|
*
|
|
* Data beats sent to neighbor 0; Each beat represents 32 bytes.<br><br>
|
|
*
|
|
* XGMI throughput can be calculated by multiplying a BEATs event
|
|
* such as ::RDC_EVNT_XGMI_0_BEATS_TX by 32 and dividing by
|
|
* the time for which event collection occurred,
|
|
* ::rdc_gpu_usage_info_t.start_time (which is in nanoseconds). To get
|
|
* bytes per second, multiply this value by 10<sup>9</sup>.<br>
|
|
* <br>
|
|
* Throughput = BEATS/time_running * 10<sup>9</sup> (bytes/second)<br>
|
|
*/
|
|
// ie, Throughput = BEATS/time_running 10^9 bytes/sec
|
|
RDC_EVNT_XGMI_0_BEATS_TX,
|
|
RDC_EVNT_XGMI_1_NOP_TX, //!< NOPs sent to neighbor 1
|
|
RDC_EVNT_XGMI_1_REQ_TX, //!< Outgoing requests to
|
|
//!< neighbor 1
|
|
RDC_EVNT_XGMI_1_RESP_TX, //!< Outgoing responses to
|
|
//!< neighbor 1
|
|
RDC_EVNT_XGMI_1_BEATS_TX, //!< Data beats sent to
|
|
//!< neighbor 1; Each beat
|
|
//!< represents 32 bytes
|
|
|
|
// "Composite" events. These events have additional processing beyond
|
|
// the value provided by the amd_smi library.
|
|
RDC_EVNT_XGMI_0_THRPUT = 1500, //!< Transmit throughput to XGMI
|
|
//!< neighbor 0 in byes/sec
|
|
RDC_EVNT_XGMI_1_THRPUT, //!< Transmit throughput to XGMI
|
|
//!< neighbor 1 in byes/sec
|
|
RDC_EVNT_XGMI_2_THRPUT, //!< Transmit throughput to XGMI
|
|
//!< neighbor 2 in byes/sec
|
|
RDC_EVNT_XGMI_3_THRPUT, //!< Transmit throughput to XGMI
|
|
//!< neighbor 3 in byes/sec
|
|
RDC_EVNT_XGMI_4_THRPUT, //!< Transmit throughput to XGMI
|
|
//!< neighbor 4 in byes/sec
|
|
RDC_EVNT_XGMI_5_THRPUT, //!< Transmit throughput to XGMI
|
|
//!< neighbor 5 in byes/sec
|
|
|
|
RDC_EVNT_NOTIF_VMFAULT = 2000, //!< VM page fault
|
|
RDC_EVNT_NOTIF_FIRST = RDC_EVNT_NOTIF_VMFAULT,
|
|
|
|
RDC_EVNT_NOTIF_THERMAL_THROTTLE, //!< Clock frequency has decreased
|
|
//!< due to temperature rise
|
|
RDC_EVNT_NOTIF_PRE_RESET, //!< GPU reset is about to occur
|
|
RDC_EVNT_NOTIF_POST_RESET, //!< GPU reset just occurred
|
|
RDC_EVNT_NOTIF_MIGRATE_START,
|
|
RDC_EVNT_NOTIF_MIGRATE_END,
|
|
RDC_EVNT_NOTIF_PAGE_FAULT_START,
|
|
RDC_EVNT_NOTIF_PAGE_FAULT_END,
|
|
RDC_EVNT_NOTIF_QUEUE_EVICTION,
|
|
RDC_EVNT_NOTIF_QUEUE_RESTORE,
|
|
RDC_EVNT_NOTIF_UNMAP_FROM_GPU,
|
|
RDC_EVNT_NOTIF_PROCESS_START,
|
|
RDC_EVNT_NOTIF_PROCESS_END,
|
|
|
|
RDC_EVNT_NOTIF_LAST = RDC_EVNT_NOTIF_PROCESS_END,
|
|
|
|
/**
|
|
* @brief RDC health related fields
|
|
*/
|
|
RDC_HEALTH_XGMI_ERROR = 3000, //!< XGMI one or more errors detected
|
|
RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count
|
|
RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number
|
|
RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number
|
|
RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< the threshold of retired page number
|
|
RDC_HEALTH_EEPROM_CONFIG_VALID, //!< Reads the EEPROM and verifies the checksums
|
|
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
|
|
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
|
|
} rdc_field_t;
|
|
|
|
// even and odd numbers are used for correctable and uncorrectable errors
|
|
static_assert(RDC_FI_ECC_SDMA_CE % 2 == 0, "Correctable Error enum is not even");
|
|
static_assert(RDC_FI_ECC_SDMA_UE % 2 == 1, "Uncorrectable Error enum is not odd");
|
|
static_assert(RDC_FI_ECC_MPIO_CE % 2 == 0, "Correctable Error enum is not even");
|
|
static_assert(RDC_FI_ECC_MPIO_UE % 2 == 1, "Uncorrectable Error enum is not odd");
|
|
|
|
#define RDC_EVNT_IS_NOTIF_FIELD(FIELD) \
|
|
((FIELD) >= RDC_EVNT_NOTIF_FIRST && (FIELD) <= RDC_EVNT_NOTIF_LAST)
|
|
/**
|
|
* @brief handlers used in various rdc calls
|
|
*/
|
|
typedef void* rdc_handle_t; //!< Handle used for an RDC session
|
|
typedef uint32_t rdc_gpu_group_t; //!< GPU Group ID type
|
|
typedef uint32_t rdc_field_grp_t; //!< Field group ID type
|
|
|
|
/**
|
|
* @brief Represents attributes corresponding to a device
|
|
*/
|
|
typedef struct {
|
|
char device_name[RDC_MAX_STR_LENGTH]; //!< Name of the device.
|
|
uint64_t device_id; //!< The device id of a GPU
|
|
uint32_t num_of_compute_units;
|
|
uint64_t target_graphics_version;
|
|
} rdc_device_attributes_t;
|
|
|
|
/**
|
|
* @brief Store version information for each component
|
|
*/
|
|
typedef struct {
|
|
char version[RDC_MAX_VERSION_STR_LENGTH];
|
|
} rdc_component_version_t;
|
|
|
|
/**
|
|
* @brief The structure to store the group info
|
|
*/
|
|
typedef struct {
|
|
unsigned int count; //!< count of GPUs in the group
|
|
char group_name[RDC_MAX_STR_LENGTH]; //!< group name
|
|
/**
|
|
* The list of entities in the group
|
|
*/
|
|
uint32_t entity_ids[RDC_GROUP_MAX_ENTITIES];
|
|
} rdc_group_info_t;
|
|
|
|
/**
|
|
* @brief The structure to store summary of data
|
|
*/
|
|
typedef struct {
|
|
uint64_t max_value; //!< Maximum value measured
|
|
uint64_t min_value; //!< Minimum value measured
|
|
uint64_t average; //!< Average value measured
|
|
double standard_deviation; //!< The standard deviation
|
|
} rdc_stats_summary_t;
|
|
|
|
/**
|
|
* @brief The structure to hold the GPU usage information
|
|
*/
|
|
typedef struct {
|
|
uint32_t gpu_id; //!< GPU_ID_INVALID for summary information
|
|
uint64_t start_time; //!< The time to start the watching
|
|
uint64_t end_time; //!< The time to stop the watching
|
|
|
|
uint64_t energy_consumed; //!< GPU Energy consumed
|
|
uint64_t ecc_correct; //!< Correctable errors
|
|
uint64_t ecc_uncorrect; //!< Uncorrectable errors
|
|
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
|
|
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
|
|
rdc_stats_summary_t pcie_total; //!< Total PCIe bandwidth stats
|
|
//!< pcie_tx/pcie_rx are not available on mi300, max integer
|
|
//!< returned, so use pcie_total
|
|
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
|
|
rdc_stats_summary_t gpu_clock; //!< GPU Clock speed stats
|
|
rdc_stats_summary_t memory_clock; //!< Mem. Clock speed stats
|
|
rdc_stats_summary_t gpu_utilization; //!< GPU Utilization stats
|
|
rdc_stats_summary_t gpu_temperature; //!< GPU temperature stats
|
|
|
|
uint64_t max_gpu_memory_used; //!< Maximum GPU memory used
|
|
rdc_stats_summary_t memory_utilization; //!< Memory Utilization statistics
|
|
} rdc_gpu_usage_info_t; //!< GPU usage statistics
|
|
|
|
#define MAX_PROCESS_NAME 256
|
|
/**
|
|
* @brief The structure to track process start/stop times during a job running
|
|
*/
|
|
typedef struct {
|
|
uint32_t pid; //!< Process ID
|
|
char process_name[MAX_PROCESS_NAME];
|
|
uint64_t start_time; //!< Process start time in microseconds since 1970
|
|
uint64_t stop_time; //!< Process stop time in microseconds since 1970
|
|
} rdc_process_status_info_t;
|
|
#define RDC_MAX_NUM_PROCESSES_STATUS 64
|
|
|
|
/**
|
|
* @brief The structure to hold the job stats
|
|
*/
|
|
typedef struct {
|
|
uint32_t num_gpus; //!< Number of GPUs used by job
|
|
rdc_gpu_usage_info_t summary; //!< Job usage summary statistics
|
|
//!< (overall)
|
|
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU
|
|
uint32_t num_processes; //!< Number of processes tracked
|
|
rdc_process_status_info_t
|
|
processes[RDC_MAX_NUM_PROCESSES_STATUS]; //!< Array to track process start/stop times
|
|
} rdc_job_info_t;
|
|
|
|
/**
|
|
* @brief Field value data
|
|
*/
|
|
typedef union {
|
|
int64_t l_int;
|
|
double dbl;
|
|
char str[RDC_MAX_STR_LENGTH];
|
|
} rdc_field_value_data;
|
|
|
|
/**
|
|
* @brief The structure to store the field value
|
|
*/
|
|
typedef struct {
|
|
rdc_field_t field_id; //!< The field id of the value
|
|
int status; //!< RDC_ST_OK or error status
|
|
uint64_t ts; //!< Timestamp in usec since 1970
|
|
rdc_field_type_t type; //!< The field type
|
|
rdc_field_value_data value; //!< Value of the field. Value type
|
|
//!< depends on the field type.
|
|
} rdc_field_value;
|
|
|
|
/**
|
|
* @brief The structure to store the field group info
|
|
*/
|
|
typedef struct {
|
|
uint32_t count; //!< count of fields in the group
|
|
char group_name[RDC_MAX_STR_LENGTH]; //!< field group name
|
|
/**
|
|
* The list of fields in the group
|
|
*/
|
|
rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP];
|
|
} rdc_field_group_info_t;
|
|
|
|
/**
|
|
* @brief The structure to store the job info
|
|
*/
|
|
typedef struct {
|
|
char job_id[RDC_MAX_STR_LENGTH]; //!< job id
|
|
rdc_gpu_group_t group_id; //!< group name
|
|
uint64_t start_time; //!< job start time
|
|
uint64_t stop_time; //!< job stop time
|
|
} rdc_job_group_info_t;
|
|
|
|
/**
|
|
* @brief type of diagnostic level
|
|
*/
|
|
typedef enum {
|
|
RDC_DIAG_LVL_INVALID = 0, //!< invalid level
|
|
RDC_DIAG_LVL_SHORT, //!< take a few seconds to run
|
|
RDC_DIAG_LVL_MED, //!< take less than 2 minutes to run
|
|
RDC_DIAG_LVL_LONG //!< take up to 15 minutes to run
|
|
} rdc_diag_level_t;
|
|
|
|
/**
|
|
* @brief type of diagnostic result
|
|
*/
|
|
typedef enum {
|
|
RDC_DIAG_RESULT_PASS, //!< The diagnostic test pass
|
|
RDC_DIAG_RESULT_SKIP, //!< The diagnostic test skipped
|
|
RDC_DIAG_RESULT_WARN, //!< The diagnostic test has warnings
|
|
RDC_DIAG_RESULT_FAIL //!< The diagnostic test fail
|
|
} rdc_diag_result_t;
|
|
|
|
/**
|
|
* @brief The test cases to run
|
|
*/
|
|
typedef enum {
|
|
RDC_DIAG_TEST_FIRST = 0,
|
|
//!< The diagnostic test pass
|
|
RDC_DIAG_COMPUTE_PROCESS = RDC_DIAG_TEST_FIRST,
|
|
RDC_DIAG_COMPUTE_QUEUE, //!< The Compute Queue is ready
|
|
RDC_DIAG_SYS_MEM_CHECK, //!< Check System memory
|
|
RDC_DIAG_NODE_TOPOLOGY, //!< Report node topology
|
|
RDC_DIAG_GPU_PARAMETERS, //!< GPU parameters in range
|
|
RDC_DIAG_RVS_GST_TEST, //!< RVS GST test
|
|
RDC_DIAG_RVS_MEMBW_TEST, //!< RVS bandwidth test
|
|
RDC_DIAG_RVS_H2DD2H_TEST, //!< RVS Host<->Device transfer speed test
|
|
RDC_DIAG_RVS_IET_TEST, //!< RVS IET test
|
|
RDC_DIAG_RVS_GST_LONG_TEST, //!< RVS GST test
|
|
RDC_DIAG_RVS_MEMBW_LONG_TEST, //!< RVS bandwidth test
|
|
RDC_DIAG_RVS_H2DD2H_LONG_TEST, //!< RVS Host<->Device transfer speed test
|
|
RDC_DIAG_RVS_IET_LONG_TEST, //!< RVS IET test
|
|
RDC_DIAG_RVS_CUSTOM, //!< RVS custom test
|
|
RDC_DIAG_TEST_LAST,
|
|
} rdc_diag_test_cases_t;
|
|
|
|
/**
|
|
* @brief Type of Components
|
|
*/
|
|
typedef enum {
|
|
RDC_AMDSMI_COMPONENT
|
|
// If needed later, add them one by one
|
|
} rdc_component_t;
|
|
|
|
/**
|
|
* @brief The maximum test cases to run
|
|
*/
|
|
#define MAX_TEST_CASES (RDC_DIAG_TEST_LAST - RDC_DIAG_TEST_FIRST)
|
|
|
|
/**
|
|
* @brief The maximum length of the diagnostic messages
|
|
*/
|
|
#define MAX_DIAG_MSG_LENGTH 4096
|
|
|
|
/**
|
|
* @brief details of the diagnostic errors
|
|
*/
|
|
typedef struct {
|
|
char msg[MAX_DIAG_MSG_LENGTH]; //!< The test result details
|
|
uint32_t code; //!< The low level error code
|
|
} rdc_diag_detail_t;
|
|
|
|
/**
|
|
* @brief details of the per gpu diagnostic results
|
|
*/
|
|
typedef struct {
|
|
uint32_t gpu_index; //!< The GPU index
|
|
rdc_diag_detail_t gpu_result; //!< The detail results
|
|
} rdc_diag_per_gpu_result_t;
|
|
|
|
/**
|
|
* @brief The diagnostic results for all GPUs
|
|
*/
|
|
typedef struct {
|
|
rdc_diag_result_t status; //!< The diagnostic result
|
|
rdc_diag_detail_t details; //!< The summary details
|
|
rdc_diag_test_cases_t test_case; //!< The test case to run
|
|
|
|
uint32_t per_gpu_result_count; //!< How many gpu_results
|
|
//!< Result details
|
|
rdc_diag_per_gpu_result_t gpu_results[RDC_MAX_NUM_DEVICES];
|
|
|
|
char info[MAX_DIAG_MSG_LENGTH]; //!< Detail information
|
|
} rdc_diag_test_result_t;
|
|
|
|
/**
|
|
* @brief The diagnostic responses for test cases
|
|
*/
|
|
typedef struct {
|
|
uint32_t results_count;
|
|
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
|
|
} rdc_diag_response_t;
|
|
|
|
typedef void (*rdc_callback_t)(void*, void*);
|
|
typedef struct {
|
|
rdc_callback_t callback; //!< Callback sends logs for running diagnostics
|
|
void* cookie; //!< Cookie is used to identify different callbacks and supply them with data
|
|
} rdc_diag_callback_t;
|
|
|
|
/**
|
|
* @brief The policy type to support
|
|
*/
|
|
typedef enum {
|
|
RDC_POLICY_COND_FIRST = 0,
|
|
RDC_POLICY_COND_MAX_PAGE_RETRIED = RDC_POLICY_COND_FIRST, //!< Max number of page retired
|
|
RDC_POLICY_COND_THERMAL, //!< Temperature threshold, millidegree Celsius
|
|
RDC_POLICY_COND_POWER, //!< Power threshold, unit microwatt
|
|
RDC_POLICY_COND_LAST = RDC_POLICY_COND_POWER,
|
|
RDC_POLICY_COND_MAX
|
|
} rdc_policy_condition_type_t;
|
|
|
|
typedef struct {
|
|
rdc_policy_condition_type_t type;
|
|
int64_t value;
|
|
} rdc_policy_condition_t;
|
|
|
|
typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_action_t;
|
|
|
|
/**
|
|
* @brief The structure to define policy to enforce on GPU.
|
|
*/
|
|
typedef struct {
|
|
rdc_policy_condition_t condition; //!< condition to meet
|
|
rdc_policy_action_t action; //!< Action to take
|
|
} rdc_policy_t;
|
|
|
|
typedef enum {
|
|
RDC_IOLINK_TYPE_UNDEFINED = 0, //!< unknown type.
|
|
RDC_IOLINK_TYPE_PCIEXPRESS = 1, //!< PCI Express
|
|
RDC_IOLINK_TYPE_XGMI = 2, //!< XGMI
|
|
RDC_IOLINK_TYPE_SIZE = 0xFFFFFFFF //!< Max of IO Link types
|
|
} rdc_topology_link_type_t;
|
|
|
|
/**
|
|
* @brief The link information of the GPU connected to
|
|
*/
|
|
typedef struct {
|
|
uint32_t gpu_index;
|
|
// amdsmi_topo_get_link_weight
|
|
uint64_t weight; // the weight for a connection between 2 GPUs
|
|
// minimal and maximal io link bandwidth between 2 GPUs
|
|
// amdsmi_get_minmax_bandwidth_between_processors
|
|
uint64_t min_bandwidth;
|
|
uint64_t max_bandwidth;
|
|
// amdsmi_topo_get_link_type
|
|
uint64_t hops;
|
|
rdc_topology_link_type_t link_type;
|
|
// amdsmi_is_P2P_accessible
|
|
bool is_p2p_accessible;
|
|
} rdc_topology_link_info_t;
|
|
|
|
/**
|
|
* @brief The data in the data structure will be set to max value if it is N/A or error
|
|
*/
|
|
typedef struct {
|
|
uint32_t num_of_gpus; // The length of link_infos array
|
|
rdc_topology_link_info_t link_infos[RDC_MAX_NUM_DEVICES];
|
|
// amdsmi_topo_get_numa_node_number
|
|
uint32_t numa_node; // the NUMA CPU node number for a device
|
|
} rdc_device_topology_t;
|
|
|
|
typedef enum {
|
|
RDC_LINK_STATE_DOWN = 0,
|
|
RDC_LINK_STATE_UP,
|
|
RDC_LINK_STATE_DISABLED,
|
|
} rdc_link_state_t;
|
|
|
|
#define RDC_MAX_NUM_OF_LINKS 16
|
|
|
|
typedef struct {
|
|
uint32_t gpu_index;
|
|
uint32_t num_of_links; // The size of the array link_states
|
|
rdc_topology_link_type_t link_types; // XGMI, PCIe, and so on
|
|
rdc_link_state_t link_states[RDC_MAX_NUM_OF_LINKS];
|
|
} rdc_gpu_link_status_t;
|
|
|
|
typedef struct {
|
|
int32_t num_of_gpus; // The size of gpus array
|
|
rdc_gpu_link_status_t gpus[RDC_MAX_NUM_DEVICES];
|
|
} rdc_link_status_t;
|
|
|
|
/**
|
|
* @brief type of health watches
|
|
*/
|
|
typedef enum {
|
|
RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches
|
|
RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches
|
|
RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches
|
|
RDC_HEALTH_WATCH_EEPROM = 0x8, //!< EEPROM watches
|
|
RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches
|
|
RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches
|
|
} rdc_health_system_t;
|
|
|
|
/**
|
|
* @brief type of health result
|
|
*/
|
|
typedef enum {
|
|
RDC_HEALTH_RESULT_PASS, //!< The health test pass
|
|
RDC_HEALTH_RESULT_WARN, //!< The health test has warnings
|
|
RDC_HEALTH_RESULT_FAIL //!< The health test fail
|
|
} rdc_health_result_t;
|
|
|
|
/**
|
|
* @brief The maximum length of the health messages
|
|
*/
|
|
#define MAX_HEALTH_MSG_LENGTH 4096
|
|
|
|
/**
|
|
* 8 replays per minute is the maximum recommended
|
|
*/
|
|
#define PCIE_MAX_REPLAYS_PERMIN 8
|
|
|
|
// The error code set at rdc_health_incidents_t.error.code
|
|
typedef enum {
|
|
RDC_FR_PCI_REPLAY_RATE = 1000,
|
|
RDC_FR_ECC_UNCORRECTABLE_DETECTED = 1001,
|
|
RDC_FR_PENDING_PAGE_RETIREMENTS = 1002,
|
|
RDC_FR_RETIRED_PAGES_LIMIT = 1003,
|
|
RDC_FR_RETIRED_PAGES_UNCORRECTABLE_LIMIT = 1004,
|
|
RDC_FR_CLOCKS_THROTTLE_THERMAL = 1005,
|
|
RDC_FR_CLOCKS_THROTTLE_POWER = 1006,
|
|
RDC_FR_XGMI_SINGLE_ERROR = 1007,
|
|
RDC_FR_XGMI_MULTIPLE_ERROR = 1008,
|
|
RDC_FR_CORRUPT_EEPROM = 1009
|
|
} rdc_health_error_code_t;
|
|
|
|
/**
|
|
* @brief details of the health errors
|
|
*/
|
|
typedef struct {
|
|
char msg[MAX_HEALTH_MSG_LENGTH]; //!< The test result details
|
|
uint32_t code; //!< The low level error code
|
|
} rdc_health_detail_t;
|
|
|
|
/**
|
|
* @brief details of the per health incidents
|
|
*/
|
|
typedef struct {
|
|
uint32_t gpu_index; //!< which GPU in this group have the issue
|
|
rdc_health_system_t component; //!< which components have the issue
|
|
rdc_health_result_t health; //!< health diagnosis of this incident
|
|
rdc_health_detail_t error; //!< The details of the error, rdc_health_error_code_t
|
|
} rdc_health_incidents_t;
|
|
|
|
#define HEALTH_MAX_ERROR_ITEMS 64
|
|
|
|
/**
|
|
* @brief The health responses for test cases
|
|
*/
|
|
typedef struct {
|
|
rdc_health_result_t overall_health; //!< The overall health of this entire host
|
|
unsigned int incidents_count; //!< The number of health incidents reported in this struct
|
|
rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]; //!< Report of the errors detected
|
|
} rdc_health_response_t;
|
|
|
|
/**
|
|
* @brief property id's for the configuration set/get
|
|
*/
|
|
typedef enum {
|
|
RDC_CFG_GFX_CLOCK_LIMIT,
|
|
RDC_CFG_MEMORY_CLOCK_LIMIT,
|
|
RDC_CFG_POWER_LIMIT
|
|
} rdc_config_type_t;
|
|
|
|
/**
|
|
* @brief Value mapped to rdc_config_type_t property id for the configuration set/get
|
|
*/
|
|
typedef struct {
|
|
rdc_config_type_t type;
|
|
uint64_t target_value;
|
|
} rdc_config_setting_t;
|
|
|
|
/**
|
|
* @brief Array of properties collected using the configuration get
|
|
*/
|
|
typedef struct {
|
|
uint32_t total_settings;
|
|
rdc_config_setting_t settings[RDC_MAX_CONFIG_SETTINGS];
|
|
} rdc_config_setting_list_t;
|
|
|
|
/**
|
|
* @brief Initialize ROCm RDC.
|
|
*
|
|
* @details When called, this initializes internal data structures,
|
|
* including those corresponding to sources of information that RDC provides.
|
|
* This must be called before rdc_start_embedded() or rdc_connect()
|
|
*
|
|
* @param[in] init_flags init_flags Bit flags that tell RDC how to initialize.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_init(uint64_t init_flags);
|
|
|
|
/**
|
|
* @brief Shutdown ROCm RDC.
|
|
*
|
|
* @details Do any necessary clean up.
|
|
*/
|
|
rdc_status_t rdc_shutdown();
|
|
|
|
/**
|
|
* @brief Start embedded RDC agent within this process.
|
|
*
|
|
* @details The RDC is loaded as library so that it does not require rdcd
|
|
* daemon. In this mode, the user has to periodically call
|
|
* rdc_field_update_all() when op_mode is RDC_OPERATION_MODE_MANUAL, which
|
|
* tells RDC to collect the stats.
|
|
*
|
|
* @param[in] op_mode Operation modes. When RDC_OPERATION_MODE_AUTO, RDC
|
|
* schedules background task to collect the stats. When
|
|
* RDC_OPERATION_MODE_MANUAL, the user needs to call rdc_field_update_all()
|
|
* periodically.
|
|
*
|
|
* @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon
|
|
* successful call, the value will contain the handler for following API calls.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_start_embedded(rdc_operation_mode_t op_mode, rdc_handle_t* p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Stop embedded RDC agent.
|
|
*
|
|
* @details Stop the embedded RDC agent, and p_rdc_handle becomes
|
|
* invalid after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler that come from
|
|
* rdc_start_embedded().
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_stop_embedded(rdc_handle_t p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Connect to rdcd daemon
|
|
*
|
|
* @details This method is used to connect to a remote stand-alone
|
|
* rdcd daemon.
|
|
*
|
|
* @param[in] ipAndPort The IP and port of the remote rdcd. The ipAndPort
|
|
* can be specified in this x.x.x.x:yyyy format, where x.x.x.x is the
|
|
* IP address and yyyy is the port.
|
|
*
|
|
* @param[inout] p_rdc_handle Caller provided pointer to rdc_handle_t. Upon
|
|
* successful call, the value will contain the handler
|
|
* for following API calls.
|
|
*
|
|
* @param [in] root_ca The root CA stored in the string in pem format. Set it
|
|
* as nullptr if the communication is not encrypted.
|
|
*
|
|
* @param [in] client_cert The client certificate stored in the string in pem
|
|
* format. Set it as nullptr if the communication is not encrypted.
|
|
*
|
|
* @param [in] client_key The client key stored in the string in pem format.
|
|
* Set it as nullptr if the communication is not encrypted.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_connect(const char* ipAndPort, rdc_handle_t* p_rdc_handle, const char* root_ca,
|
|
const char* client_cert, const char* client_key);
|
|
|
|
/**
|
|
* @brief Disconnect from rdcd daemon.
|
|
*
|
|
* @details Disconnect from rdcd daemon, and p_rdc_handle becomes invalid
|
|
* after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler that come from rdc_connect().
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_disconnect(rdc_handle_t p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Request the RDC to watch the job stats
|
|
*
|
|
* @details This should be executed as part of job prologue. The summary
|
|
* job stats can be retrieved using rdc_job_get_stats().
|
|
* In RDC_OPERATION_MODE_MANUAL, user must call rdc_field_update_all(1)
|
|
* at least once, before call rdc_job_get_stats()
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The group of GPUs to be watched.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @param[in] update_freq How often to update this field in usec.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_start_stats(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
const char job_id[64], uint64_t update_freq);
|
|
|
|
/**
|
|
* @brief Get the stats of the job using the job id.
|
|
*
|
|
* @details The stats can be retrieved at any point when the job is in
|
|
* process.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @param[inout] p_job_info Caller provided pointer to rdc_job_info_t. Upon
|
|
* successful call, the value will contain the stats of the job.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_get_stats(rdc_handle_t p_rdc_handle, const char job_id[64],
|
|
rdc_job_info_t* p_job_info);
|
|
|
|
/**
|
|
* @brief Request RDC to stop watching the stats of the job
|
|
*
|
|
* @details This should be execute as part of job epilogue. The job Id
|
|
* remains available to view the stats at any point. You must call
|
|
* rdc_watch_job_fields() before this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_stop_stats(rdc_handle_t p_rdc_handle, const char job_id[64]);
|
|
|
|
/**
|
|
* @brief Request RDC to stop tracking the job given by job_id
|
|
*
|
|
* @details After this call, you will no longer be able to call
|
|
* rdc_job_get_stats() on this job_id. But you will be able to reuse
|
|
* the job_id after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] job_id The name of the job.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_remove(rdc_handle_t p_rdc_handle, const char job_id[64]);
|
|
|
|
/**
|
|
* @brief Request RDC to stop tracking all the jobs
|
|
*
|
|
* @details After this call, you will no longer be able to call
|
|
* rdc_job_get_stats() on any job id. But you will be able to reuse
|
|
* the any previous used job id after this call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_job_remove_all(rdc_handle_t p_rdc_handle);
|
|
|
|
/**
|
|
* @brief Request RDC to update all fields to be watched.
|
|
*
|
|
* @details In RDC_OPERATION_MODE_MANUAL, the user must call this method
|
|
* periodically.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] wait_for_update Whether or not to wait for the update loop to
|
|
* complete before returning to the caller 1=wait. 0=do not wait.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_update_all(rdc_handle_t p_rdc_handle, uint32_t wait_for_update);
|
|
|
|
/**
|
|
* @brief Get indexes corresponding to all the devices on the system.
|
|
*
|
|
* @details Indexes represents RDC GPU Id corresponding to each GPU on the
|
|
* system and is immutable during the lifespan of the engine. The list
|
|
* should be queried again if the engine is restarted.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[out] gpu_index_list Array reference to fill GPU indexes present on
|
|
* the system.
|
|
*
|
|
* @param[out] count Number of GPUs returned in gpu_index_list.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
|
|
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count);
|
|
|
|
/**
|
|
* @brief Gets device attributes corresponding to the gpu_index.
|
|
*
|
|
* @details Fetch the attributes, such as device name, of a GPU.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] gpu_index GPU index corresponding to which the attributes
|
|
* should be fetched
|
|
*
|
|
* @param[out] p_rdc_attr GPU attribute corresponding to the gpu_index.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
|
|
rdc_device_attributes_t* p_rdc_attr);
|
|
|
|
/**
|
|
* @brief Get version information of components used by rdc.
|
|
*
|
|
* @details Given a component type, return its version information.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] component Type of Components. See rdc_component_t definition for details.
|
|
*
|
|
* @param[out] p_rdc_compv Version information of the corresponding component.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component,
|
|
rdc_component_version_t* p_rdc_compv);
|
|
|
|
/**
|
|
* @brief Create a group contains multiple GPUs
|
|
*
|
|
* @details This method can create a group contains multiple GPUs. Instead of
|
|
* executing an operation separately for each GPU, the RDC group enables
|
|
* the user to execute same operation on all the GPUs present in the group as
|
|
* a single API call.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] type The type of the group. RDC_GROUP_DEFAULT includes all the
|
|
* GPUs on the node, and RDC_GROUP_EMPTY creates an empty group.
|
|
*
|
|
* @param[in] group_name The group name specified as NULL terminated C String
|
|
*
|
|
* @param[inout] p_rdc_group_id Caller provided pointer to rdc_gpu_group_t.
|
|
* Upon successful call, the value will contain the group id for following
|
|
* group API calls.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_create(rdc_handle_t p_rdc_handle, rdc_group_type_t type,
|
|
const char* group_name, rdc_gpu_group_t* p_rdc_group_id);
|
|
|
|
/**
|
|
* @brief Add a GPU to the group
|
|
*
|
|
* @details This method can add a GPU to the group
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The group id to which the GPU will be added.
|
|
*
|
|
* @param[in] gpu_index The GPU index to be added to the group.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
uint32_t gpu_index);
|
|
|
|
/**
|
|
* @brief Get information about a GPU group
|
|
*
|
|
* @details Get detail information about a GPU group created by
|
|
* rdc_group_gpu_create
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] p_rdc_group_id The GPU group handler created by
|
|
* rdc_group_gpu_create
|
|
*
|
|
* @param[out] p_rdc_group_info The information of the GPU
|
|
* group p_rdc_group_id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_get_info(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id,
|
|
rdc_group_info_t* p_rdc_group_info);
|
|
|
|
/**
|
|
* @brief Used to get information about all GPU groups in the system.
|
|
*
|
|
* @details Get the list of GPU group ids in the system.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[out] group_id_list Array reference to fill GPU group
|
|
* ids in the system.
|
|
*
|
|
* @param[out] count Number of GPU group returned in group_id_list.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_get_all_ids(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id_list[],
|
|
uint32_t* count);
|
|
|
|
/**
|
|
* @brief Destroy GPU group represented by p_rdc_group_id
|
|
*
|
|
* @details Delete the logic group represented by p_rdc_group_id
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] p_rdc_group_id The group id
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_gpu_destroy(rdc_handle_t p_rdc_handle, rdc_gpu_group_t p_rdc_group_id);
|
|
|
|
/**
|
|
* @brief create a group of fields
|
|
*
|
|
* @details The user can create a group of fields and perform an operation
|
|
* on a group of fields at once.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] num_field_ids Number of field IDs that are being provided
|
|
* in field_ids.
|
|
*
|
|
* @param[in] field_ids Field IDs to be added to the newly-created
|
|
* field group.
|
|
*
|
|
* @param[in] field_group_name Unique name for this group of fields.
|
|
*
|
|
* @param[out] rdc_field_group_id Handle to the newly-created field group
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_create(rdc_handle_t p_rdc_handle, uint32_t num_field_ids,
|
|
rdc_field_t* field_ids, const char* field_group_name,
|
|
rdc_field_grp_t* rdc_field_group_id);
|
|
|
|
/**
|
|
* @brief Get information about a field group
|
|
*
|
|
* @details Get detail information about a field group created by
|
|
* rdc_group_field_create
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] rdc_field_group_id The field group handler created by
|
|
* rdc_group_field_create
|
|
*
|
|
* @param[out] field_group_info The information of the field group
|
|
* rdc_field_group_id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_get_info(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id,
|
|
rdc_field_group_info_t* field_group_info);
|
|
|
|
/**
|
|
* @brief Used to get information about all field groups in the system.
|
|
*
|
|
* @details Get the list of field group ids in the system.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[out] field_group_id_list Array reference to fill field group
|
|
* ids in the system.
|
|
*
|
|
* @param[out] count Number of field group returned in field_group_id_list.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_get_all_ids(rdc_handle_t p_rdc_handle,
|
|
rdc_field_grp_t field_group_id_list[], uint32_t* count);
|
|
|
|
/**
|
|
* @brief Destroy field group represented by rdc_field_group_id
|
|
*
|
|
* @details Delete the logic group represented by rdc_field_group_id
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] rdc_field_group_id The field group id
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_group_field_destroy(rdc_handle_t p_rdc_handle, rdc_field_grp_t rdc_field_group_id);
|
|
|
|
/**
|
|
* @brief Request the RDC start recording updates for a given field
|
|
* collection.
|
|
*
|
|
* @details Note that the first update of the field will not occur
|
|
* until the next field update cycle. To force a field update cycle,
|
|
* user must call rdc_field_update_all(1)
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The group of GPUs to be watched.
|
|
*
|
|
* @param[in] field_group_id The collection of fields to record
|
|
*
|
|
* @param[in] update_freq How often to update fields in usec.
|
|
*
|
|
* @param[in] max_keep_age How long to keep data for fields in seconds.
|
|
*
|
|
* @param[in] max_keep_samples Maximum number of samples to keep. 0=no limit.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_watch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_field_grp_t field_group_id, uint64_t update_freq,
|
|
double max_keep_age, uint32_t max_keep_samples);
|
|
|
|
/**
|
|
* @brief Request a latest cached field of a GPU
|
|
*
|
|
* @details Note that the field can be cached after called rdc_field_watch
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] gpu_index The GPU index.
|
|
*
|
|
* @param[in] field The field id
|
|
*
|
|
* @param[out] value The field value got from cache.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_get_latest_value(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
|
|
rdc_field_t field, rdc_field_value* value);
|
|
|
|
/**
|
|
* @brief Request a history cached field of a GPU
|
|
*
|
|
* @details Note that the field can be cached after called rdc_field_watch
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] gpu_index The GPU index.
|
|
*
|
|
* @param[in] field The field id
|
|
*
|
|
* @param[in] since_time_stamp Timestamp to request values since in
|
|
* usec since 1970.
|
|
*
|
|
* @param[out] next_since_time_stamp Timestamp to use for sinceTimestamp
|
|
* on next call to this function
|
|
*
|
|
* @param[out] value The field value got from cache.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_get_value_since(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
|
|
rdc_field_t field, uint64_t since_time_stamp,
|
|
uint64_t* next_since_time_stamp, rdc_field_value* value);
|
|
|
|
/**
|
|
* @brief Stop record updates for a given field collection.
|
|
*
|
|
* @details The cache of those fields will not be updated after this call
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] field_group_id The field group id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_field_grp_t field_group_id);
|
|
|
|
/**
|
|
* @brief Run the diagnostic test cases
|
|
*
|
|
* @details Run the diagnostic test cases at different levels.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] level The level decides how long the test will run.
|
|
* The RDC_DIAG_LVL_SHORT only take a few seconds, and the
|
|
* the RDC_DIAG_LVL_LONG may take up to 15 minutes.
|
|
*
|
|
* @param[in] config Implementation specific configuration.
|
|
*
|
|
* @param[in] config_size Length of the configuration.
|
|
*
|
|
* @param[inout] response The detail results of the tests run.
|
|
*
|
|
* @param[inout] callback Callback for realtime communication
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_diag_level_t level, const char* config, size_t config_size,
|
|
rdc_diag_response_t* response, rdc_diag_callback_t* callback);
|
|
|
|
/**
|
|
* @brief Run one diagnostic test case
|
|
*
|
|
* @details Run a specific diagnostic test case.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] test_case The test case to run.
|
|
*
|
|
* @param[in] config Implementation specific configuration.
|
|
*
|
|
* @param[in] config_size Length of the configuration.
|
|
*
|
|
* @param[inout] result The results of the test.
|
|
*
|
|
* @param[inout] callback Callback for realtime communication
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_diag_test_cases_t test_case, const char* config,
|
|
size_t config_size, rdc_diag_test_result_t* result,
|
|
rdc_diag_callback_t* callback);
|
|
|
|
/**
|
|
* @brief Get a description of a provided RDC error status
|
|
*
|
|
* @details return the string in human readable format.
|
|
*
|
|
* @param[in] status The RDC status.
|
|
*
|
|
* @retval The string to describe the RDC status.
|
|
*/
|
|
const char* rdc_status_string(rdc_status_t status);
|
|
|
|
/**
|
|
* @brief Get the name of a field
|
|
*
|
|
* @details return the string in human readable format.
|
|
*
|
|
* @param[in] field_id The field id.
|
|
*
|
|
* @retval The string to describe the field.
|
|
*/
|
|
const char* field_id_string(rdc_field_t field_id);
|
|
|
|
/**
|
|
* @brief Get the field id from name
|
|
*
|
|
* @details return the field id from field name.
|
|
*
|
|
* @param[in] name The field name.
|
|
*
|
|
* @retval return RDC_FI_INVALID if the field name is invalid.
|
|
*/
|
|
rdc_field_t get_field_id_from_name(const char* name);
|
|
|
|
/**
|
|
* @brief Get a description of a diagnostic result.
|
|
*
|
|
* @details return the string in human readable format.
|
|
*
|
|
* @param[in] result The RDC diagnostic result.
|
|
*
|
|
* @retval The string to describe the RDC diagnostic result.
|
|
*/
|
|
const char* rdc_diagnostic_result_string(rdc_diag_result_t result);
|
|
|
|
/**
|
|
* @brief Set the RDC policy. Each group has multiple policies, these policies can be set by this
|
|
* API one by one. Multiple calls of this API will override the existing policy.
|
|
*
|
|
* @details Set the RDC policy
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] policy The policy to set
|
|
*
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_policy_t policy);
|
|
|
|
#define RDC_MAX_POLICY_SETTINGS 32
|
|
|
|
/**
|
|
* @brief Get the RDC policy
|
|
*
|
|
* @details Get the RDC policy
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[out] count The size of policies array
|
|
*
|
|
* @param[out] policies The policies to get
|
|
*
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count,
|
|
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]);
|
|
|
|
/**
|
|
* @brief delete the RDC policy for this group based on condition type
|
|
*
|
|
* @details clear the RDC policy for this group based on condition type. In a GPU group, only one
|
|
* policy can be set for a specific rdc_policy_condition_type_t
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id
|
|
*
|
|
* @param[in] condition_type The condition type to delete
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
|
|
rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_policy_condition_type_t condition_type);
|
|
|
|
/**
|
|
* Define the structure is used in RDC policy callback
|
|
*/
|
|
typedef struct {
|
|
unsigned int version;
|
|
rdc_policy_condition_t condition; //!< the condition that is meet
|
|
rdc_gpu_group_t group_id; //!< The group id trigger this callback
|
|
int64_t value; //!< The current value that meet the condition
|
|
} rdc_policy_callback_response_t;
|
|
|
|
/**
|
|
* The user data is the rdc_policy_callback_response_t
|
|
*/
|
|
typedef int (*rdc_policy_register_callback)(rdc_policy_callback_response_t* userData);
|
|
|
|
/**
|
|
* @brief Register a function to be called when policy condition is meet.
|
|
*
|
|
* @details Register the RDC policy callback
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] callback The callback function to be called when condition meet.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_policy_register_callback callback);
|
|
|
|
/**
|
|
* @brief un-register a policy callback function for a conditioin.
|
|
*
|
|
* @details Un-register the policy callback for a condition.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
|
|
|
/**
|
|
* @brief enable the health check for a group
|
|
*
|
|
* @details For each group, only one parameter can be set. If you want to
|
|
* clear the setting for a group, set component == 0x0
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[in] components The list of components that should be enabled for health check
|
|
* for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
unsigned int components);
|
|
|
|
/**
|
|
* @brief get the health check settings of a group
|
|
*
|
|
* @details get the health check settings of a component
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[out] components The list of components that should be enabled for health check
|
|
* for example, RDC_HEALTH_WATCH_THERMAL | RDC_HEALTH_WATCH_POWER
|
|
* if it is 0x0, then the health check not set for the group yet.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
unsigned int* components);
|
|
|
|
/**
|
|
* @brief Check health watch results
|
|
*
|
|
* @details If it has incidents.
|
|
* For each incident, check the component and error message.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @param[inout] response The detail results of the health.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_health_response_t* response);
|
|
|
|
/**
|
|
* @brief clear the health watch
|
|
*
|
|
* @details For each group, clear the setting.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] group_id The GPU group id.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
|
|
|
/**
|
|
* @brief Get the topology of the device
|
|
*
|
|
* @details topology of the device
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] gpu_index The GPU gpu index.
|
|
*
|
|
* @param[out] results The device topology
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_device_topology_get(rdc_handle_t p_rdc_handle, uint32_t gpu_index,
|
|
rdc_device_topology_t* results);
|
|
/**
|
|
* @brief Get the link status
|
|
*
|
|
* @details the link is up or down
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
*
|
|
* @param[out] results
|
|
* lts The link up or down status
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* results);
|
|
/**
|
|
* @brief Set one configuration
|
|
*
|
|
* @details Set the given configuration to all nodes belong to the given group
|
|
*
|
|
* @param[in] p_rdc_handle Node handle
|
|
*
|
|
* @param[in] group_id Group id to which node belongs
|
|
*
|
|
* @param[in] setting Configuration to be set for the nodes
|
|
*
|
|
* @retval RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_config_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_config_setting_t setting);
|
|
|
|
/**
|
|
* @brief Get the configrations
|
|
*
|
|
* @details Get all the configurations for all nodes belong to the given group
|
|
*
|
|
* @param[in] p_rdc_handle Node handle
|
|
*
|
|
* @param[in] group_id Group id to which nodes belong
|
|
*
|
|
* @param[out] settings List of configurations returned.
|
|
*
|
|
* @retval RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_config_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
|
|
rdc_config_setting_list_t* settings);
|
|
|
|
/**
|
|
* @brief Clear the setting
|
|
*
|
|
* @details Clear all the configurations for the nodes belongs to the given group
|
|
*
|
|
* @param[in] p_rdc_handle Node handle
|
|
*
|
|
* @param[in] group_id Group id to which nodes belong
|
|
*
|
|
* @retval RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);
|
|
|
|
const char* get_rocm_path(const char* search_string);
|
|
|
|
/**
|
|
* @brief The device role
|
|
*/
|
|
typedef enum {
|
|
RDC_DEVICE_ROLE_PHYSICAL,
|
|
RDC_DEVICE_ROLE_PARTITION_INSTANCE //!< The partition instance
|
|
} rdc_device_role_t;
|
|
|
|
/**
|
|
* @brief The device type
|
|
*/
|
|
typedef enum { RDC_DEVICE_TYPE_GPU, RDC_DEVICE_TYPE_CPU } rdc_device_type_t;
|
|
|
|
typedef struct {
|
|
uint32_t device_index; //!< Physical device index
|
|
uint32_t instance_index; //!< Instance or core index
|
|
rdc_device_role_t entity_role; //!< Physical device or partition instance
|
|
rdc_device_type_t device_type; //!< Type
|
|
} rdc_entity_info_t;
|
|
|
|
/**
|
|
* @brief The function to decode the entity info from entity index
|
|
* @details
|
|
* | 31 30 29| 28 27 | 21 20 19 ... 12 11 | 10 9 8 7 6 5 4 3 2 1 0 |
|
|
* |---------|-------|--------------------|---------------------------|
|
|
* | Type | Role | Instance | Device |
|
|
* |---------|-------|--------------------|---------------------------|
|
|
* the 32 bit entity index is crafted based on above structure, this function
|
|
* will decode them into a data structure
|
|
*
|
|
* @param[in] entity_index The entity index.
|
|
*
|
|
* @retval rdc_entity_info_t is returned for decode structure
|
|
*/
|
|
|
|
rdc_entity_info_t rdc_get_info_from_entity_index(uint32_t entity_index);
|
|
|
|
/**
|
|
* @brief The function to encode the entity info to entity index
|
|
* @details
|
|
* | 31 30 29| 28 27 | 21 20 19 ... 12 11 | 10 9 8 7 6 5 4 3 2 1 0 |
|
|
* |---------|-------|--------------------|---------------------------|
|
|
* | Type | Role | Instance | Device |
|
|
* |---------|-------|--------------------|---------------------------|
|
|
* the 32 bit entity index is crafted based on above structure, this function
|
|
* will encode them to index
|
|
*
|
|
* @param[in] info The entity info to encode.
|
|
*
|
|
* @retval entity_index is returned
|
|
*/
|
|
uint32_t rdc_get_entity_index_from_info(rdc_entity_info_t info);
|
|
|
|
// map from amdsmi_accelerator_partition_resource_type_t
|
|
typedef enum {
|
|
RDC_ACCELERATOR_XCC = 0,
|
|
RDC_ACCELERATOR_ENCODER,
|
|
RDC_ACCELERATOR_DECODER,
|
|
RDC_ACCELERATOR_DMA,
|
|
RDC_ACCELERATOR_JPEG,
|
|
RDC_ACCELERATOR_RESOURCE_MAX,
|
|
RDC_ACCELERATOR_LAST = RDC_ACCELERATOR_RESOURCE_MAX
|
|
} rdc_instance_resource_type_t;
|
|
|
|
// map from amdsmi_accelerator_partition_resource_profile_t
|
|
typedef struct {
|
|
rdc_instance_resource_type_t resource_type;
|
|
uint32_t partition_resource; // The resources a partition can be used, which may be shared
|
|
uint32_t num_partitions_share_resource; // If it is greater than 1, then resource is shared.
|
|
} rdc_resource_profile_t;
|
|
|
|
/**
|
|
* @brief Query the resource allocation for a device/instance
|
|
*
|
|
* @details The profile contains detail information how resource is allocated.
|
|
*
|
|
* As an example, MI300X has 8 XCCs and 4 Decoders, in DPX mode, the physical device is
|
|
* partitioned to 2 instances, so each instance will have 4 XCC and 2 Decoder and they are
|
|
* not shared.
|
|
* [XCC, 4, 0], [DECODER, 2, 0]
|
|
*
|
|
* If it is CPX mode, the physical device is partitioned to 8 instances, and each instance
|
|
* have 1 XCC and 2 instances are sharing the same decoder.
|
|
* [XCC, 1, 0], [DECODER, 1, 1]
|
|
*
|
|
* If entity_index is the physical device, it should return all resources of the device:
|
|
* [XCC, 8, 0], [DECODER, 4, 0]
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
*
|
|
* @param[in] entity_index The GPU index to query. It can be physical device or instance.
|
|
*
|
|
* @param[in] resource_type Which resource type to query
|
|
*
|
|
* @param[out] profile The details how the resource is allocated.
|
|
*
|
|
* @retval ::RDC_ST_OK is returned upon successful call.
|
|
*/
|
|
rdc_status_t rdc_instance_profile_get(rdc_handle_t p_rdc_handle, uint32_t entity_index,
|
|
rdc_instance_resource_type_t resource_type,
|
|
rdc_resource_profile_t* profile);
|
|
|
|
/**
|
|
* @brief Get the number of partitions for the specified GPU index.
|
|
*
|
|
* @param[in] p_rdc_handle The RDC handler.
|
|
* @param[in] index The GPU index to query.
|
|
* @param[out] num_partition Pointer to a variable to receive the number of partitions.
|
|
*
|
|
* @retval ::RDC_ST_OK on success.
|
|
*/
|
|
rdc_status_t rdc_get_num_partition(rdc_handle_t p_rdc_handle, uint32_t index,
|
|
uint16_t* num_partition);
|
|
|
|
/**
|
|
* @brief Check if gpuid is partition string
|
|
*
|
|
* @param[in] s - singular partition string
|
|
* @retval bool - if partition string or not
|
|
*/
|
|
bool rdc_is_partition_string(const char* s);
|
|
|
|
/**
|
|
* @brief Parse partition id into physical gpu and partition
|
|
*
|
|
* @param[in] s - singular partition string
|
|
* @param[out] physicalGpu - socket id
|
|
* @param[out] partition - partition id
|
|
*
|
|
* @retval bool - success
|
|
*/
|
|
bool rdc_parse_partition_string(const char* s, uint32_t* physicalGpu, uint32_t* partition);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif // __cplusplus
|
|
|
|
#endif // INCLUDE_RDC_RDC_H_
|