Merge rocm-smi/amd-staging into amd-dev 20240119
Change-Id: Ie706473ff92a91b19e95d2d58f64904cad73a89a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/amdsmi commit: 6132074089]
Tá an tiomantas seo le fáil i:
@@ -0,0 +1,4 @@
|
||||
---
|
||||
Language: Cpp
|
||||
BasedOnStyle: Google
|
||||
ColumnLimit: 100
|
||||
@@ -0,0 +1,25 @@
|
||||
# THIS FILE IS GENERATED FROM .clangd!
|
||||
# Run ./.update-clang-tidy.sh to regenerate.
|
||||
Checks:
|
||||
bugprone*,
|
||||
clang-analyzer*,
|
||||
google*,
|
||||
misc*,
|
||||
modernize*,
|
||||
-abseil*,
|
||||
-bugprone-easily-swappable-parameters,
|
||||
-bugprone-reserved-identifier,
|
||||
-clang-analyzer-security.insecureAPI.strcpy,
|
||||
-cppcoreguidelines*,
|
||||
-cppcoreguidelines-pro*,
|
||||
-misc-non-copyable-objects,
|
||||
-misc-use-anonymous-namespace,
|
||||
-modernize-avoid-c-arrays,
|
||||
-modernize-redundant-void-arg,
|
||||
-modernize-use-auto,
|
||||
-modernize-use-nodiscard,
|
||||
-modernize-use-noexcept,
|
||||
-modernize-use-trailing-return-type,
|
||||
-modernize-use-using,
|
||||
-performance*,
|
||||
-readability*,
|
||||
@@ -0,0 +1,37 @@
|
||||
CompileFlags:
|
||||
Remove: -W*
|
||||
Add: [-Wall, -pedantic, -I/opt/rocm/include, -I/opt/rocm/include/hsa, -I/opt/rocm/include/rocprofiler]
|
||||
Compiler: clang++
|
||||
|
||||
# list here: https://clang.llvm.org/extra/clang-tidy/checks/list.html
|
||||
Diagnostics:
|
||||
UnusedIncludes: Strict
|
||||
# rules below are copied into .clang-tidy using ./.update-clang-tidy.sh
|
||||
# please keep the rules sorted alphabetically
|
||||
ClangTidy:
|
||||
Add: [
|
||||
bugprone*,
|
||||
clang-analyzer*,
|
||||
google*,
|
||||
misc*,
|
||||
modernize*,
|
||||
]
|
||||
Remove: [
|
||||
abseil*,
|
||||
bugprone-easily-swappable-parameters,
|
||||
bugprone-reserved-identifier,
|
||||
clang-analyzer-security.insecureAPI.strcpy,
|
||||
cppcoreguidelines*,
|
||||
cppcoreguidelines-pro*,
|
||||
misc-non-copyable-objects,
|
||||
misc-use-anonymous-namespace,
|
||||
modernize-avoid-c-arrays,
|
||||
modernize-redundant-void-arg,
|
||||
modernize-use-auto,
|
||||
modernize-use-nodiscard,
|
||||
modernize-use-noexcept,
|
||||
modernize-use-trailing-return-type,
|
||||
modernize-use-using,
|
||||
performance*,
|
||||
readability*,
|
||||
]
|
||||
@@ -13,3 +13,4 @@ indent_style = space
|
||||
charset = utf-8
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
max_line_length = 100
|
||||
|
||||
@@ -34,3 +34,8 @@ device/
|
||||
|
||||
# misc
|
||||
esmi_ib_library/
|
||||
|
||||
# do NOT ignore these files
|
||||
!.clang-format
|
||||
!.clang-tidy
|
||||
!.clangd
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
# - How to use:
|
||||
# python3 -m pip install pre-commit
|
||||
# pre-commit install --install-hooks
|
||||
# Upon a new commit - the hooks should automagically run
|
||||
#
|
||||
# - How to skip:
|
||||
# git commit --no-verify
|
||||
# or
|
||||
# SKIP=clang-format-docker git commit
|
||||
# SKIP=cpplint-docker git commit
|
||||
|
||||
fail_fast: false
|
||||
repos:
|
||||
# For portability I decided to use Docker containers
|
||||
- repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint
|
||||
rev: 0.0.3
|
||||
hooks:
|
||||
- id: clang-format-docker
|
||||
- id: cpplint-docker
|
||||
# Below is a local way of running formatters and linters
|
||||
# NOTE: clang-tidy is not used in the above tests
|
||||
# - repo: https://github.com/pocc/pre-commit-hooks
|
||||
# rev: v1.3.5
|
||||
# hooks:
|
||||
# - id: clang-format
|
||||
# args: [--no-diff, -i]
|
||||
# - id: clang-tidy
|
||||
# args: [-p=build, --quiet]
|
||||
# - id: cpplint
|
||||
# args: [--verbose=5]
|
||||
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -x # trace
|
||||
set -e # exit immediately if command fails
|
||||
set -u # exit if an undefined variable is found
|
||||
|
||||
awk '
|
||||
BEGIN {
|
||||
print "# THIS FILE IS GENERATED FROM .clangd!"
|
||||
print "# Run ./.update-clang-tidy.sh to regenerate."
|
||||
print "Checks:"
|
||||
}
|
||||
/Add: \[$/{
|
||||
a=1
|
||||
next
|
||||
}
|
||||
/]/{
|
||||
a=0
|
||||
}
|
||||
a{
|
||||
gsub(/^\s+/," ")
|
||||
print
|
||||
}
|
||||
|
||||
/Remove: \[$/{
|
||||
r=1
|
||||
next
|
||||
}
|
||||
/]/{
|
||||
r=0
|
||||
}
|
||||
r{
|
||||
gsub(/^\s+/," -")
|
||||
print
|
||||
}
|
||||
' .clangd | tee .clang-tidy
|
||||
@@ -1,21 +1,25 @@
|
||||
# Change Log for ROCm SMI Library
|
||||
|
||||
Full documentation for rocm_smi_lib is available at [https://docs.amd.com/](https://docs.amd.com/category/SMI%20API%20Guides).
|
||||
Full documentation for rocm_smi_lib is available at [https://docs.amd.com/](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/).
|
||||
|
||||
## rocm_smi_lib for ROCm 5.5.0
|
||||
|
||||
### Optimizations
|
||||
|
||||
- Add new test to measure api execution time.
|
||||
- Remove the shared mutex if no process is using it.
|
||||
|
||||
### Added
|
||||
|
||||
- ROCm SMI CLI: Add --showtempgraph Feature.
|
||||
|
||||
### Changed
|
||||
|
||||
- Relying on vendor ID to detect AMDGPU.
|
||||
- Change pragma message to warning for backward compatibility.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix --showproductname when device's SKU cannot be parsed out of the VBIOS string.
|
||||
- Fix compile error: ‘memcpy’ was not declared.
|
||||
- Fix order of CE and UE reporting in ROCm SMI CLI.
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
set noparent
|
||||
linelength=100
|
||||
filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard
|
||||
@@ -919,16 +919,6 @@ amdsmi_process_handle_t = ctypes.c_uint32
|
||||
class struct_amdsmi_proc_info_t(Structure):
|
||||
pass
|
||||
|
||||
class struct_engine_usage_(Structure):
|
||||
pass
|
||||
|
||||
struct_engine_usage_._pack_ = 1 # source:False
|
||||
struct_engine_usage_._fields_ = [
|
||||
('gfx', ctypes.c_uint64),
|
||||
('enc', ctypes.c_uint64),
|
||||
('reserved', ctypes.c_uint32 * 12),
|
||||
]
|
||||
|
||||
class struct_memory_usage_(Structure):
|
||||
pass
|
||||
|
||||
@@ -940,6 +930,16 @@ struct_memory_usage_._fields_ = [
|
||||
('reserved', ctypes.c_uint32 * 10),
|
||||
]
|
||||
|
||||
class struct_engine_usage_(Structure):
|
||||
pass
|
||||
|
||||
struct_engine_usage_._pack_ = 1 # source:False
|
||||
struct_engine_usage_._fields_ = [
|
||||
('gfx', ctypes.c_uint64),
|
||||
('enc', ctypes.c_uint64),
|
||||
('reserved', ctypes.c_uint32 * 12),
|
||||
]
|
||||
|
||||
struct_amdsmi_proc_info_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_proc_info_t._fields_ = [
|
||||
('name', ctypes.c_char * 32),
|
||||
|
||||
@@ -40,11 +40,12 @@ if(${ROCM_PATCH_VERSION})
|
||||
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
|
||||
else()
|
||||
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
|
||||
endif()
|
||||
set(${ROCM_SMI}_VERSION_MAJOR "${VERSION_MAJOR}")
|
||||
set(${ROCM_SMI}_VERSION_MINOR "${VERSION_MINOR}")
|
||||
set(${ROCM_SMI}_VERSION_PATCH "0")
|
||||
endif ()
|
||||
set(${ROCM_SMI}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
|
||||
set(${ROCM_SMI}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
|
||||
set(${ROCM_SMI}_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}")
|
||||
set(${ROCM_SMI}_VERSION_BUILD "0")
|
||||
set(${ROCM_SMI}_VERSION_HASH "${PKG_VERSION_HASH}")
|
||||
message("SOVERSION: ${SO_VERSION_STRING}")
|
||||
|
||||
# Create a configure file to get version info from within library
|
||||
|
||||
@@ -745,8 +745,8 @@ auto print_error_or_value(rsmi_status_t status_code, const T& metric) {
|
||||
return str_values;
|
||||
}
|
||||
else if constexpr ((std::is_same_v<T, std::uint16_t>) ||
|
||||
(std::is_same_v<T, std::uint32_t>) ||
|
||||
(std::is_same_v<T, std::uint64_t>)) {
|
||||
(std::is_same_v<T, std::uint32_t>) ||
|
||||
(std::is_same_v<T, std::uint64_t>)) {
|
||||
return std::to_string(metric);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,6 +80,7 @@ extern "C" {
|
||||
//! The number of points that make up a voltage-frequency curve definition
|
||||
#define RSMI_NUM_VOLTAGE_CURVE_POINTS 3
|
||||
|
||||
|
||||
/**
|
||||
* @brief Error codes returned by rocm_smi_lib functions
|
||||
*/
|
||||
@@ -353,7 +354,7 @@ typedef struct {
|
||||
* Clock types
|
||||
*/
|
||||
typedef enum {
|
||||
RSMI_CLK_TYPE_SYS = 0x0, //!< System clock
|
||||
RSMI_CLK_TYPE_SYS = 0x0, //!< System clock
|
||||
RSMI_CLK_TYPE_FIRST = RSMI_CLK_TYPE_SYS,
|
||||
RSMI_CLK_TYPE_DF, //!< Data Fabric clock (for ASICs
|
||||
//!< running on a separate clock)
|
||||
@@ -970,6 +971,9 @@ struct metrics_table_header_t {
|
||||
uint8_t content_revision;
|
||||
/// \endcond
|
||||
};
|
||||
/// \cond Ignore in docs.
|
||||
typedef struct metrics_table_header_t metrics_table_header_t;
|
||||
/// \endcond
|
||||
|
||||
/**
|
||||
* @brief The following structure holds the gpu metrics values for a device.
|
||||
@@ -986,9 +990,14 @@ struct metrics_table_header_t {
|
||||
#define RSMI_NUM_HBM_INSTANCES 4
|
||||
|
||||
/**
|
||||
* @brief This should match kRSMI_MAX_NUM_VCN
|
||||
* @brief This should match kRSMI_MAX_NUM_VCNS
|
||||
*/
|
||||
#define RSMI_MAX_NUM_VCN 4
|
||||
#define RSMI_MAX_NUM_VCNS 4
|
||||
|
||||
/**
|
||||
* @brief This should match kRSMI_MAX_JPEG_ENGINES
|
||||
*/
|
||||
#define RSMI_MAX_NUM_JPEG_ENGS 32
|
||||
|
||||
/**
|
||||
* @brief This should match kRSMI_MAX_NUM_CLKS
|
||||
@@ -1109,7 +1118,7 @@ typedef struct {
|
||||
uint16_t current_socket_power;
|
||||
|
||||
// Utilization (%)
|
||||
uint16_t vcn_activity[RSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
|
||||
uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)
|
||||
|
||||
// Clock Lock Status. Each bit corresponds to clock instance
|
||||
uint32_t gfxclk_lock_status;
|
||||
@@ -1143,6 +1152,19 @@ typedef struct {
|
||||
uint16_t current_vclk0s[RSMI_MAX_NUM_CLKS];
|
||||
uint16_t current_dclk0s[RSMI_MAX_NUM_CLKS];
|
||||
|
||||
/*
|
||||
* v1.5 additions
|
||||
*/
|
||||
// JPEG activity percent (encode/decode)
|
||||
uint16_t jpeg_activity[RSMI_MAX_NUM_JPEG_ENGS];
|
||||
|
||||
// PCIE NAK sent accumulated count
|
||||
uint32_t pcie_nak_sent_count_acc;
|
||||
|
||||
// PCIE NAK received accumulated count
|
||||
uint32_t pcie_nak_rcvd_count_acc;
|
||||
|
||||
|
||||
/// \endcond
|
||||
} rsmi_gpu_metrics_t;
|
||||
|
||||
@@ -1358,7 +1380,7 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision);
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku);
|
||||
rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku);
|
||||
|
||||
/**
|
||||
* @brief Get the device vendor id associated with the device with provided
|
||||
@@ -1733,7 +1755,6 @@ rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id);
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get the XGMI physical id associated with the device
|
||||
*
|
||||
@@ -4097,7 +4118,7 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst,
|
||||
/** @} */ // end of HWTopo
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup compute_partition Compute Partition Functions
|
||||
/** @defgroup ComputePartition Compute Partition Functions
|
||||
* These functions are used to configure and query the device's
|
||||
* compute partition setting.
|
||||
* @{
|
||||
@@ -4182,10 +4203,10 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind);
|
||||
|
||||
/** @} */ // end of compute_partition
|
||||
/** @} */ // end of ComputePartition
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup memory_partition Memory Partition Functions
|
||||
/** @defgroup memory_partition The Memory Partition Functions
|
||||
* These functions are used to query and set the device's current memory
|
||||
* partition.
|
||||
* @{
|
||||
@@ -4627,7 +4648,8 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind);
|
||||
* Metric multi-valued counter types
|
||||
*/
|
||||
typedef uint16_t GPUMetricTempHbm_t[RSMI_NUM_HBM_INSTANCES];
|
||||
typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCN];
|
||||
typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCNS];
|
||||
typedef uint16_t GPUMetricJpegActivity_t[RSMI_MAX_NUM_JPEG_ENGS];
|
||||
typedef uint64_t GPUMetricXgmiReadDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS];
|
||||
typedef uint64_t GPUMetricXgmiWriteDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS];
|
||||
typedef uint16_t GPUMetricCurrGfxClk_t[RSMI_MAX_NUM_GFX_CLKS];
|
||||
@@ -5113,7 +5135,7 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu
|
||||
*
|
||||
* @param[inout] vcn_activity_value a pointer to uint16_t to which the device gpu
|
||||
* metric unit will be stored
|
||||
* - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCN)
|
||||
* - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCNS)
|
||||
* element array (GPUMetricVcnActivity_t)
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
|
||||
@@ -255,6 +255,7 @@ class Device {
|
||||
rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics);
|
||||
AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics();
|
||||
|
||||
|
||||
private:
|
||||
std::shared_ptr<Monitor> monitor_;
|
||||
std::shared_ptr<PowerMon> power_monitor_;
|
||||
@@ -277,7 +278,6 @@ class Device {
|
||||
bool returnWriteErr = false);
|
||||
rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query);
|
||||
|
||||
|
||||
uint64_t bdfid_;
|
||||
uint64_t kfd_gpu_id_;
|
||||
std::unordered_set<rsmi_event_group_t,
|
||||
|
||||
@@ -92,7 +92,10 @@ constexpr uint32_t kRSMI_MAX_NUM_GFX_CLKS = 8;
|
||||
constexpr uint32_t kRSMI_MAX_NUM_CLKS = 4;
|
||||
|
||||
// Note: This *must* match NUM_VCN
|
||||
constexpr uint32_t kRSMI_MAX_NUM_VCN = 4;
|
||||
constexpr uint32_t kRSMI_MAX_NUM_VCNS = 4;
|
||||
|
||||
// Note: This *must* match NUM_JPEG_ENG
|
||||
constexpr uint32_t kRSMI_MAX_JPEG_ENGINES = 32;
|
||||
|
||||
|
||||
struct AMDGpuMetricsHeader_v1_t
|
||||
@@ -326,7 +329,7 @@ struct AMDGpuMetrics_v14_t
|
||||
// Utilization (%)
|
||||
uint16_t m_average_gfx_activity;
|
||||
uint16_t m_average_umc_activity; // memory controller
|
||||
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
|
||||
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)
|
||||
|
||||
// Energy (15.259uJ (2^-16) units)
|
||||
uint64_t m_energy_accumulator;
|
||||
@@ -383,7 +386,89 @@ struct AMDGpuMetrics_v14_t
|
||||
|
||||
uint16_t m_padding;
|
||||
};
|
||||
using AMGpuMetricsLatest_t = AMDGpuMetrics_v14_t;
|
||||
|
||||
// Gpu metrics table, format version 1.5.
// Alongside the temperature/power/utilization/PCIe/XGMI/clock fields it carries
// the per-engine JPEG activity array and the PCIe NAK sent/received accumulators.
// NOTE(review): presumably this mirrors the binary table exposed by the driver,
// so field order, types and array extents should not be changed - confirm.
struct AMDGpuMetrics_v15_t
|
||||
{
|
||||
~AMDGpuMetrics_v15_t() = default;
|
||||
|
||||
// Common header placed at the start of the table.
struct AMDGpuMetricsHeader_v1_t m_common_header;
|
||||
|
||||
// Temperature (Celsius). It will be zero (0) if unsupported.
|
||||
uint16_t m_temperature_hotspot;
|
||||
uint16_t m_temperature_mem;
|
||||
uint16_t m_temperature_vrsoc;
|
||||
|
||||
// Power (Watts)
|
||||
uint16_t m_current_socket_power;
|
||||
|
||||
// Utilization (%)
|
||||
uint16_t m_average_gfx_activity;
|
||||
uint16_t m_average_umc_activity; // memory controller
|
||||
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)
|
||||
uint16_t m_jpeg_activity[kRSMI_MAX_JPEG_ENGINES]; // JPEG activity percent (encode/decode)
|
||||
|
||||
// Energy (15.259uJ (2^-16) units)
|
||||
uint64_t m_energy_accumulator;
|
||||
|
||||
// Driver attached timestamp (in ns)
|
||||
uint64_t m_system_clock_counter;
|
||||
|
||||
// Throttle status
|
||||
uint32_t m_throttle_status;
|
||||
|
||||
// Clock Lock Status. Each bit corresponds to clock instance
|
||||
uint32_t m_gfxclk_lock_status;
|
||||
|
||||
// Link width (number of lanes) and speed (in 0.1 GT/s)
|
||||
uint16_t m_pcie_link_width;
|
||||
uint16_t m_pcie_link_speed; // in 0.1 GT/s
|
||||
|
||||
// XGMI bus width and bitrate (in Gbps)
|
||||
uint16_t m_xgmi_link_width;
|
||||
uint16_t m_xgmi_link_speed;
|
||||
|
||||
// Utilization Accumulated (%)
|
||||
uint32_t m_gfx_activity_acc;
|
||||
uint32_t m_mem_activity_acc;
|
||||
|
||||
// PCIE accumulated bandwidth (GB/sec)
|
||||
uint64_t m_pcie_bandwidth_acc;
|
||||
|
||||
// PCIE instantaneous bandwidth (GB/sec)
|
||||
uint64_t m_pcie_bandwidth_inst;
|
||||
|
||||
// PCIE L0 to recovery state transition accumulated count
|
||||
uint64_t m_pcie_l0_to_recov_count_acc;
|
||||
|
||||
// PCIE replay accumulated count
|
||||
uint64_t m_pcie_replay_count_acc;
|
||||
|
||||
// PCIE replay rollover accumulated count
|
||||
uint64_t m_pcie_replay_rover_count_acc;
|
||||
|
||||
// PCIE NAK sent accumulated count
|
||||
uint32_t m_pcie_nak_sent_count_acc;
|
||||
|
||||
// PCIE NAK received accumulated count
|
||||
uint32_t m_pcie_nak_rcvd_count_acc;
|
||||
|
||||
// XGMI accumulated data transfer size (KiloBytes), one entry per XGMI link
|
||||
uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
|
||||
uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
|
||||
|
||||
// PMFW attached timestamp (10ns resolution)
|
||||
uint64_t m_firmware_timestamp;
|
||||
|
||||
// Current clocks (MHz)
|
||||
uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS];
|
||||
uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS];
|
||||
uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS];
|
||||
uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS];
|
||||
uint16_t m_current_uclk;
|
||||
|
||||
// NOTE(review): looks like explicit tail padding for alignment - confirm.
uint16_t m_padding;
|
||||
};
|
||||
using AMGpuMetricsLatest_t = AMDGpuMetrics_v15_t;
|
||||
|
||||
/**
|
||||
* This is GPU Metrics version that gets to public access.
|
||||
@@ -410,6 +495,9 @@ using GPUMetricTempHbmTbl_t = GpuMetricU16Tbl_t;
|
||||
using GPUMetricVcnActivity_t = decltype(AMDGpuMetrics_v14_t::m_vcn_activity);
|
||||
using GPUMetricVcnActivityTbl_t = GpuMetricU16Tbl_t;
|
||||
|
||||
using GPUMetricJpegActivity_t = decltype(AMDGpuMetrics_v15_t::m_jpeg_activity);
|
||||
using GPUMetricJpegActivityTbl_t = GpuMetricU16Tbl_t;
|
||||
|
||||
using GPUMetricXgmiReadDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_read_data_acc);
|
||||
using GPUMetricXgmiWriteDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_write_data_acc);
|
||||
using GPUMetricXgmiAccTbl_t = GpuMetricU64Tbl_t;
|
||||
@@ -518,6 +606,7 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
|
||||
kMetricGfxActivityAccumulator,
|
||||
kMetricMemActivityAccumulator,
|
||||
kMetricVcnActivity, //v1.4
|
||||
kMetricJpegActivity, //v1.5
|
||||
|
||||
// kGpuMetricAverageClock counters
|
||||
kMetricAvgGfxClockFrequency,
|
||||
@@ -559,6 +648,8 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
|
||||
kMetricPcieL0RecovCountAccumulator, //v1.4
|
||||
kMetricPcieReplayCountAccumulator, //v1.4
|
||||
kMetricPcieReplayRollOverCountAccumulator, //v1.4
|
||||
kMetricPcieNakSentCountAccumulator, //v1.5
|
||||
kMetricPcieNakReceivedCountAccumulator, //v1.5
|
||||
|
||||
// kGpuMetricPowerEnergy counters
|
||||
kMetricAvgSocketPower,
|
||||
@@ -608,6 +699,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t
|
||||
kGpuMetricV12 = (0x1 << 2),
|
||||
kGpuMetricV13 = (0x1 << 3),
|
||||
kGpuMetricV14 = (0x1 << 4),
|
||||
kGpuMetricV15 = (0x1 << 5),
|
||||
};
|
||||
using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>;
|
||||
using GpuMetricTypePtr_t = std::shared_ptr<void>;
|
||||
@@ -780,6 +872,40 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t
|
||||
|
||||
};
|
||||
|
||||
// GpuMetricsBase_t implementation for the v1.5 gpu metrics table
// (AMDGpuMetrics_v15_t). The table is stored by value inside this object.
class GpuMetricsBase_v15_t final : public GpuMetricsBase_t
|
||||
{
|
||||
public:
|
||||
~GpuMetricsBase_v15_t() = default;
|
||||
|
||||
// Returns the size, in bytes, of the v1.5 metrics table structure.
size_t sizeof_metric_table() override {
|
||||
return sizeof(AMDGpuMetrics_v15_t);
|
||||
}
|
||||
|
||||
// Returns a shared pointer to the internal metrics table.
// The pointer is created on first use and cached afterwards.
GpuMetricTypePtr_t get_metrics_table() override
|
||||
{
|
||||
if (!m_gpu_metric_ptr) {
|
||||
// The deleter is intentionally empty: the pointer refers to the
// m_gpu_metrics_tbl member, so it must never be freed through the
// shared_ptr. The pointee is only valid while this object is alive.
m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v15_t*){});
|
||||
}
|
||||
assert(m_gpu_metric_ptr != nullptr);
|
||||
return m_gpu_metric_ptr;
|
||||
}
|
||||
|
||||
// Defined out of line (not visible in this view).
void dump_internal_metrics_table() override;
|
||||
|
||||
// Identifies this handler as the v1.5 metrics implementation.
AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override
|
||||
{
|
||||
return AMDGpuMetricVersionFlags_t::kGpuMetricV15;
|
||||
}
|
||||
|
||||
// Both defined out of line (not visible in this view).
rsmi_status_t populate_metrics_dynamic_tbl() override;
|
||||
AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;
|
||||
|
||||
|
||||
private:
|
||||
// Storage for the v1.5 metrics table.
AMDGpuMetrics_v15_t m_gpu_metrics_tbl;
|
||||
// Non-owning alias of m_gpu_metrics_tbl, set lazily by get_metrics_table().
std::shared_ptr<AMDGpuMetrics_v15_t> m_gpu_metric_ptr;
|
||||
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value);
|
||||
|
||||
@@ -29,10 +29,12 @@ from rsmiBindings import *
|
||||
# Major version - Increment when backwards-compatibility breaks
|
||||
# Minor version - Increment when adding a new feature, set to 0 when major is incremented
|
||||
# Patch version - Increment when adding a fix, set to 0 when minor is incremented
|
||||
SMI_MAJ = 1
|
||||
SMI_MIN = 5
|
||||
# Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi
|
||||
SMI_MAJ = 2
|
||||
SMI_MIN = 0
|
||||
SMI_PAT = 0
|
||||
__version__ = '%s.%s.%s' % (SMI_MAJ, SMI_MIN, SMI_PAT)
|
||||
# SMI_HASH is provided by rsmiBindings
|
||||
__version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH)
|
||||
|
||||
# Set to 1 if an error occurs
|
||||
RETCODE = 0
|
||||
@@ -828,23 +830,20 @@ def printTableRow(space, displayString, v_delim=" "):
|
||||
|
||||
def checkIfSecondaryDie(device):
|
||||
""" Checks if GCD(die) is the secondary die in a MCM.
|
||||
MI200 device specific feature check.
|
||||
The secondary dies lacks power management features.
|
||||
|
||||
Secondary dies lack power management features.
|
||||
TODO: switch to more robust way to check for primary/secondary die, when implemented in Kernel and rocm_smi_lib.
|
||||
@param device: The device to check
|
||||
"""
|
||||
power_cap = c_uint64()
|
||||
# secondary die can currently be determined by checking if all power1_* (power cap) values are equal to zero.
|
||||
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
|
||||
if not (rsmi_ret_ok(ret, None, 'get_power_cap', False) and power_cap.value == 0):
|
||||
return False
|
||||
ret = rocmsmi.rsmi_dev_power_cap_default_get(device, byref(power_cap))
|
||||
if not (rsmi_ret_ok(ret, None, 'get_power_cap_default', False) and power_cap.value == 0):
|
||||
return False
|
||||
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power_cap))
|
||||
if not (rsmi_ret_ok(ret, None, 'get_power_avg', False) and power_cap.value == 0):
|
||||
return False
|
||||
return True
|
||||
energy_count = c_uint64()
|
||||
counter_resoution = c_float()
|
||||
timestamp = c_uint64()
|
||||
|
||||
# secondary die can be determined by checking if energy counter == 0
|
||||
ret = rocmsmi.rsmi_dev_energy_count_get(device, byref(energy_count), byref(counter_resoution), byref(timestamp))
|
||||
if (rsmi_ret_ok(ret, None, 'energy_count_secondary_die_check', silent=False)) and (energy_count.value == 0):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def resetClocks(deviceList):
|
||||
|
||||
@@ -55,6 +55,8 @@ dv_id = c_uint64()
|
||||
# GPU ID
|
||||
gpu_id = c_uint32(0)
|
||||
|
||||
SMI_HASH = '@PKG_VERSION_HASH@'
|
||||
|
||||
|
||||
# Policy enums
|
||||
RSMI_MAX_NUM_FREQUENCIES = 33
|
||||
|
||||
@@ -594,7 +594,7 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", returning get_dev_value_line() response = "
|
||||
<< getRSMIStatusString(ret);
|
||||
<< amd::smi::getRSMIStatusString(ret);
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
@@ -613,7 +613,7 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", returning strtoul() response = "
|
||||
<< getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(errno));
|
||||
<< amd::smi::getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(errno));
|
||||
LOG_TRACE(ss);
|
||||
|
||||
return amd::smi::ErrnoToRsmiStatus(errno);
|
||||
@@ -667,7 +667,7 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", returning rsmi_dev_ecc_enabled_get() response = "
|
||||
<< getRSMIStatusString(ret);
|
||||
<< amd::smi::getRSMIStatusString(ret);
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
@@ -728,7 +728,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
default:
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", default case -> reporting "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED);
|
||||
<< amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED);
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
@@ -748,7 +748,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS"
|
||||
<< " -> reporting " << getRSMIStatusString(ret);
|
||||
<< " -> reporting " << amd::smi::getRSMIStatusString(ret);
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
@@ -767,7 +767,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
|
||||
fs2 >> ec->correctable_err;
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", reporting " << getRSMIStatusString(ret);;
|
||||
<< ", reporting " << amd::smi::getRSMIStatusString(ret);;
|
||||
LOG_TRACE(ss);
|
||||
return ret;
|
||||
CATCH
|
||||
@@ -935,7 +935,7 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
|
||||
|
||||
ret = get_id(dv_ind, amd::smi::kDevDevID, id);
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", reporting " << getRSMIStatusString(ret);
|
||||
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
|
||||
LOG_TRACE(ss);
|
||||
return ret;
|
||||
}
|
||||
@@ -950,7 +950,7 @@ rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id) {
|
||||
|
||||
ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id);
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", reporting " << getRSMIStatusString(ret);
|
||||
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
|
||||
LOG_TRACE(ss);
|
||||
return ret;
|
||||
}
|
||||
@@ -965,7 +965,7 @@ rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) {
|
||||
|
||||
ret = get_id(dv_ind, amd::smi::kDevDevRevID, revision);
|
||||
outss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", reporting " << getRSMIStatusString(ret);
|
||||
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
|
||||
LOG_TRACE(outss);
|
||||
return ret;
|
||||
}
|
||||
@@ -980,7 +980,7 @@ rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) {
|
||||
CHK_SUPPORT_NAME_ONLY(id)
|
||||
ret = get_id(dv_ind, amd::smi::kDevDevProdNum, id);
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", reporting " << getRSMIStatusString(ret);
|
||||
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
|
||||
LOG_TRACE(ss);
|
||||
return ret;
|
||||
CATCH
|
||||
@@ -4045,6 +4045,7 @@ rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind,
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
uint32_t ln = static_cast<uint32_t>(val_str.copy(serial_num, len));
|
||||
|
||||
serial_num[std::min(len - 1, ln)] = '\0';
|
||||
@@ -5125,15 +5126,11 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
<< devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
|
||||
<< " | Cause: device board name does not support this action"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
std::string newMemoryPartition
|
||||
= mapRSMIToStringMemoryPartitionTypes.at(memory_partition);
|
||||
std::string currentMemoryPartition;
|
||||
|
||||
switch (memory_partition) {
|
||||
case RSMI_MEMORY_PARTITION_NPS1:
|
||||
case RSMI_MEMORY_PARTITION_NPS2:
|
||||
@@ -5154,6 +5151,9 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
std::string newMemoryPartition
|
||||
= mapRSMIToStringMemoryPartitionTypes.at(memory_partition);
|
||||
std::string currentMemoryPartition;
|
||||
|
||||
// do nothing if memory_partition is the current mode
|
||||
rsmi_status_t ret_get = get_memory_partition(dv_ind, currentMemoryPartition);
|
||||
@@ -5196,13 +5196,16 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
|
||||
if (amd::smi::ErrnoToRsmiStatus(ret) != RSMI_STATUS_SUCCESS) {
|
||||
rsmi_status_t err = amd::smi::ErrnoToRsmiStatus(ret);
|
||||
if (ret == EACCES) {
|
||||
err = RSMI_STATUS_NOT_SUPPORTED; // already verified permissions
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: "
|
||||
<< devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
|
||||
<< " | Cause: issue writing requested setting of " + newMemoryPartition
|
||||
<< " | Cause: issue writing reqested setting of " + newMemoryPartition
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(err) << " |";
|
||||
LOG_ERROR(ss);
|
||||
@@ -6027,7 +6030,6 @@ rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_v
|
||||
CATCH
|
||||
}
|
||||
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value)
|
||||
{
|
||||
@@ -6541,7 +6543,6 @@ rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwid
|
||||
CATCH
|
||||
}
|
||||
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value)
|
||||
{
|
||||
@@ -6666,19 +6667,24 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm);
|
||||
amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{};
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value));
|
||||
std::copy_n(std::begin(tmp_hbl_tbl), max_num_elems, *temp_hbm_value);
|
||||
}
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_hbl_tbl.size()) ? max_num_elems : tmp_hbl_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << tmp_hbl_tbl.size()
|
||||
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_hbl_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Copy size: " << copy_size
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(temp_hbm_value, 0, sizeof(*temp_hbm_value));
|
||||
std::copy_n(std::begin(tmp_hbl_tbl), copy_size, *temp_hbm_value);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
CATCH
|
||||
@@ -6700,19 +6706,24 @@ rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_a
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity);
|
||||
amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{};
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value));
|
||||
std::copy_n(std::begin(tmp_vcn_tbl), max_num_elems, *vcn_activity_value);
|
||||
}
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_vcn_tbl.size()) ? max_num_elems : tmp_vcn_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << tmp_vcn_tbl.size()
|
||||
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_vcn_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Copy size: " << copy_size
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(vcn_activity_value, 0, sizeof(*vcn_activity_value));
|
||||
std::copy_n(std::begin(tmp_vcn_tbl), copy_size, *vcn_activity_value);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
CATCH
|
||||
@@ -6734,19 +6745,24 @@ rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t*
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator);
|
||||
amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{};
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value));
|
||||
std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_read_data_acc_value);
|
||||
}
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_xgmi_acc_tbl.size()) ? max_num_elems : tmp_xgmi_acc_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << tmp_xgmi_acc_tbl.size()
|
||||
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_xgmi_acc_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Copy size: " << copy_size
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(xgmi_read_data_acc_value, 0, sizeof(*xgmi_read_data_acc_value));
|
||||
std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_read_data_acc_value);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
CATCH
|
||||
@@ -6768,19 +6784,24 @@ rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator);
|
||||
amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{};
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value));
|
||||
std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_write_data_acc_value);
|
||||
}
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_xgmi_acc_tbl.size()) ? max_num_elems : tmp_xgmi_acc_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << tmp_xgmi_acc_tbl.size()
|
||||
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_xgmi_acc_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Copy size: " << copy_size
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(xgmi_write_data_acc_value, 0, sizeof(*xgmi_write_data_acc_value));
|
||||
std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_write_data_acc_value);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
CATCH
|
||||
@@ -6800,26 +6821,28 @@ rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current
|
||||
}
|
||||
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock);
|
||||
rsmi_gpu_metrics_t gpu = {};
|
||||
auto status = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu);
|
||||
if (status == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::copy_n(std::begin(gpu.current_gfxclks),
|
||||
static_cast<uint16_t>(
|
||||
sizeof(gpu.current_gfxclks)/sizeof(gpu.current_gfxclks[0])),
|
||||
*current_gfxclk_value);
|
||||
}
|
||||
amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{};
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl);
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_curr_gfxclk_tbl.size()) ? max_num_elems : tmp_curr_gfxclk_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << static_cast<uint16_t>(
|
||||
sizeof(gpu.current_gfxclks)/sizeof(gpu.current_gfxclks[0]))
|
||||
<< " | Returning = " << status << " "
|
||||
<< getRSMIStatusString(status) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_curr_gfxclk_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Copy size: " << copy_size
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(current_gfxclk_value, 0, sizeof(*current_gfxclk_value));
|
||||
std::copy_n(std::begin(tmp_curr_gfxclk_tbl), copy_size, *current_gfxclk_value);
|
||||
}
|
||||
|
||||
return status;
|
||||
return status_code;
|
||||
CATCH
|
||||
}
|
||||
|
||||
@@ -6839,19 +6862,23 @@ rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock);
|
||||
amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{};
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*current_socclk_value) - std::begin(*current_socclk_value));
|
||||
std::copy_n(std::begin(tmp_curr_socclk_tbl), max_num_elems, *current_socclk_value);
|
||||
}
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*current_socclk_value) - std::begin(*current_socclk_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_curr_socclk_tbl.size()) ? max_num_elems : tmp_curr_socclk_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << tmp_curr_socclk_tbl.size()
|
||||
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_curr_socclk_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(current_socclk_value, 0, sizeof(*current_socclk_value));
|
||||
std::copy_n(std::begin(tmp_curr_socclk_tbl), copy_size, *current_socclk_value);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
CATCH
|
||||
@@ -6873,19 +6900,24 @@ rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_v
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0);
|
||||
amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{};
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*current_vclk_value) - std::begin(*current_vclk_value));
|
||||
std::copy_n(std::begin(tmp_curr_vclk0_tbl), max_num_elems, *current_vclk_value);
|
||||
}
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*current_vclk_value) - std::begin(*current_vclk_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_curr_vclk0_tbl.size()) ? max_num_elems : tmp_curr_vclk0_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << tmp_curr_vclk0_tbl.size()
|
||||
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_curr_vclk0_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Copy size: " << copy_size
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(current_vclk_value, 0, sizeof(*current_vclk_value));
|
||||
std::copy_n(std::begin(tmp_curr_vclk0_tbl), copy_size, *current_vclk_value);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
CATCH
|
||||
@@ -6934,19 +6966,24 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d
|
||||
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0);
|
||||
amd::smi::GPUMetricCurrDClkTbl_t tmp_curr_dclk0_tbl;
|
||||
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*current_dclk_value) - std::begin(*current_dclk_value));
|
||||
std::copy_n(std::begin(tmp_curr_dclk0_tbl), max_num_elems, *current_dclk_value);
|
||||
}
|
||||
const auto max_num_elems =
|
||||
static_cast<uint16_t>(std::end(*current_dclk_value) - std::begin(*current_dclk_value));
|
||||
const auto copy_size =
|
||||
static_cast<uint16_t>((max_num_elems < tmp_curr_dclk0_tbl.size()) ? max_num_elems : tmp_curr_dclk0_tbl.size());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | End Result "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< " | Metric Size: " << tmp_curr_dclk0_tbl.size()
|
||||
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
<< "\n | ======= end ======= "
|
||||
<< "\n | End Result "
|
||||
<< "\n | Device #: " << dv_ind
|
||||
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
|
||||
<< "\n | Metric Size: " << tmp_curr_dclk0_tbl.size()
|
||||
<< "\n | Max num of elements: " << max_num_elems
|
||||
<< "\n | Copy size: " << copy_size
|
||||
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
|
||||
LOG_INFO(ostrstream);
|
||||
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
std::memset(current_dclk_value, 0, sizeof(*current_dclk_value));
|
||||
std::copy_n(std::begin(tmp_curr_dclk0_tbl), copy_size, *current_dclk_value);
|
||||
}
|
||||
|
||||
return status_code;
|
||||
CATCH
|
||||
@@ -7277,6 +7314,7 @@ rsmi_dev_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t* header
|
||||
CATCH
|
||||
}
|
||||
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value)
|
||||
{
|
||||
@@ -7335,6 +7373,7 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind)
|
||||
return status_code;
|
||||
CATCH
|
||||
}
|
||||
|
||||
//
|
||||
// End of: new GPU Metrics related work.
|
||||
//
|
||||
|
||||
@@ -52,5 +52,6 @@
|
||||
#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@
|
||||
#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
|
||||
#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
|
||||
#define rocm_smi_VERSION_HASH "@rocm_smi_VERSION_HASH@"
|
||||
|
||||
#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
|
||||
#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
|
||||
|
||||
@@ -738,7 +738,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
|
||||
<< " | " << (fs.fail() ? "[ERROR] Failed read - format error" :
|
||||
"[GOOD] No fail - Successful read operation")
|
||||
<< " | " << (fs.eof() ? "[ERROR] Failed read - EOF error" :
|
||||
"[GOOD] No eof error - Successful read operation")
|
||||
"[GOOD] No eof - Successful read operation")
|
||||
<< " | " << (fs.good() ? "[GOOD] read good - Successful read operation" :
|
||||
"[ERROR] Failed read - good error");
|
||||
LOG_INFO(ss);
|
||||
@@ -800,7 +800,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr,
|
||||
<< " | " << (fs.fail() ? "[ERROR] Failed write - format error" :
|
||||
"[GOOD] No fail - Successful write operation")
|
||||
<< " | " << (fs.eof() ? "[ERROR] Failed write - EOF error" :
|
||||
"[GOOD] No eof error - Successful write operation")
|
||||
"[GOOD] No eof - Successful write operation")
|
||||
<< " | " << (fs.good() ?
|
||||
"[GOOD] Write good - Successful write operation" :
|
||||
"[ERROR] Failed write - good error");
|
||||
|
||||
@@ -163,6 +163,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl
|
||||
{join_metrics_version(1, 2), AMDGpuMetricVersionFlags_t::kGpuMetricV12},
|
||||
{join_metrics_version(1, 3), AMDGpuMetricVersionFlags_t::kGpuMetricV13},
|
||||
{join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14},
|
||||
{join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15},
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -201,7 +202,8 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
|
||||
{AMDGpuMetricsUnitType_t::kMetricAvgMmActivity, "AvgMmActivity"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, "GfxActivityAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, "MemActivityAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricVcnActivity, "VcnActivity"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricVcnActivity, "VcnActivity"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricJpegActivity, "JpegActivity"}, /* v1.5 */
|
||||
|
||||
// kGpuMetricAverageClock counters
|
||||
{AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency, "AvgGfxClockFrequency"},
|
||||
@@ -213,11 +215,11 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
|
||||
{AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency, "AvgDClock1Frequency"},
|
||||
|
||||
// kGpuMetricCurrentClock counters
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, "CurrGfxClock"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrSocClock, "CurrSocClock"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, "CurrGfxClock"}, /* v1.4: Changed to array */
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrSocClock, "CurrSocClock"}, /* v1.4: Changed to array */
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrUClock, "CurrUClock"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrVClock0, "CurrVClock0"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrDClock0, "CurrDClock0"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrVClock0, "CurrVClock0"}, /* v1.4: Changed to array */
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrDClock0, "CurrDClock0"}, /* v1.4: Changed to array */
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrVClock1, "CurrVClock1"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrDClock1, "CurrDClock1"},
|
||||
|
||||
@@ -226,7 +228,7 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
|
||||
{AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus, "IndepThrottleStatus"},
|
||||
|
||||
// kGpuMetricGfxClkLockStatus counters
|
||||
{AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, "GfxClkLockStatus"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, "GfxClkLockStatus"}, /* v1.4 */
|
||||
|
||||
// kGpuMetricCurrentFanSpeed counters
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed, "CurrFanSpeed"},
|
||||
@@ -234,19 +236,21 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
|
||||
// kGpuMetricLinkWidthSpeed counters
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, "PcieLinkWidth"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, "PcieLinkSpeed"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, "PcieBandwidthAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, "PcieBandwidthInst"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, "XgmiLinkWidth"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, "XgmiWriteDataAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, "PcieL0RecovCountAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, "PcieReplayCountAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, "PcieReplayRollOverCountAcc"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, "PcieBandwidthAcc"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, "PcieBandwidthInst"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, "XgmiLinkWidth"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, "XgmiWriteDataAcc"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, "PcieL0RecovCountAcc"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, "PcieReplayCountAcc"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, "PcieReplayRollOverCountAcc"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, "PcieNakSentCountAcc"}, /* v1.5 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, "PcieNakRcvdCountAcc"}, /* v1.5 */
|
||||
|
||||
// kGpuMetricPowerEnergy counters
|
||||
{AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, "AvgSocketPower"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, "CurrSocketPower"},
|
||||
{AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, "CurrSocketPower"}, /* v1.4 */
|
||||
{AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, "EnergyAcc"},
|
||||
|
||||
// kGpuMetricVoltage counters
|
||||
@@ -343,6 +347,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV12, std::make_shared<GpuMetricsBase_v12_t>(GpuMetricsBase_v12_t{})},
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV13, std::make_shared<GpuMetricsBase_v13_t>(GpuMetricsBase_v13_t{})},
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared<GpuMetricsBase_v14_t>(GpuMetricsBase_v14_t{})},
|
||||
{AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared<GpuMetricsBase_v15_t>(GpuMetricsBase_v15_t{})},
|
||||
};
|
||||
|
||||
GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version)
|
||||
@@ -462,6 +467,341 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str
|
||||
return multi_values;
|
||||
}
|
||||
|
||||
void GpuMetricsBase_v15_t::dump_internal_metrics_table()
|
||||
{
|
||||
std::ostringstream ostrstream;
|
||||
std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= DEBUG ======= "
|
||||
<< " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
|
||||
<< " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
|
||||
<< " |"
|
||||
<< "\n";
|
||||
ostrstream << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
|
||||
<< " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
|
||||
<< " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
|
||||
|
||||
<< " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
|
||||
|
||||
<< " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
|
||||
<< " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";
|
||||
|
||||
ostrstream << " vcn_activity: " << "\n";
|
||||
auto idx = uint64_t(0);
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " jpeg_activity: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_jpeg_activity) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
|
||||
<< " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
|
||||
|
||||
<< " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n"
|
||||
|
||||
<< " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
|
||||
<< " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"
|
||||
|
||||
<< " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
|
||||
|
||||
<< " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
|
||||
<< " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
|
||||
|
||||
<< " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
|
||||
<< " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
|
||||
|
||||
<< " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
|
||||
<< " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
|
||||
|
||||
<< " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
|
||||
<< " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
|
||||
<< " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
|
||||
<< " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
|
||||
<< " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"
|
||||
<< " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n"
|
||||
<< " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n";
|
||||
|
||||
ostrstream << " xgmi_read_data_acc: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " xgmi_write_data_acc: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n";
|
||||
|
||||
ostrstream << " current_gfxclk: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " current_socclk: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " current_vclk0: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " current_dclk0: " << "\n";
|
||||
idx = 0;
|
||||
for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) {
|
||||
ostrstream << "\t [" << idx << "]: " << temp << "\n";
|
||||
++idx;
|
||||
}
|
||||
|
||||
ostrstream << " padding: " << m_gpu_metrics_tbl.m_padding << "\n";
|
||||
LOG_DEBUG(ostrstream);
|
||||
}
|
||||
|
||||
rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl()
|
||||
{
|
||||
std::ostringstream ostrstream;
|
||||
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
|
||||
ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ostrstream);
|
||||
|
||||
//
|
||||
// Note: Any metric treatment/changes (if any) should happen before they
|
||||
// get written to internal/external tables.
|
||||
//
|
||||
auto run_metric_adjustments_v15 = [&]() {
|
||||
ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
const auto gpu_metrics_version = translate_flag_to_metric_version(get_gpu_metrics_version_used());
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= info ======= "
|
||||
<< " | Applying adjustments "
|
||||
<< " | Metric Version: " << stringfy_metric_header_version(
|
||||
disjoin_metrics_version(gpu_metrics_version))
|
||||
<< " |";
|
||||
LOG_TRACE(ostrstream);
|
||||
|
||||
// firmware_timestamp is at 10ns resolution
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= Changes ======= "
|
||||
<< " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp
|
||||
<< " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
|
||||
m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
|
||||
LOG_DEBUG(ostrstream);
|
||||
};
|
||||
|
||||
|
||||
// Adjustments/Changes specific to this version
|
||||
run_metric_adjustments_v15();
|
||||
|
||||
// Temperature Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot,
|
||||
"temperature_hotspot"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_temperature_mem,
|
||||
"temperature_mem"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc,
|
||||
"temperature_vrsoc"))
|
||||
);
|
||||
|
||||
// Power/Energy Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_current_socket_power,
|
||||
"curr_socket_power"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator,
|
||||
"energy_acc"))
|
||||
);
|
||||
|
||||
// Utilization Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity,
|
||||
"average_gfx_activity"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity,
|
||||
"average_umc_activity"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnActivity,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_vcn_activity,
|
||||
"[average_vcn_activity]"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegActivity,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_jpeg_activity,
|
||||
"[average_jpeg_activity]"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc,
|
||||
"gfx_activity_acc"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc,
|
||||
"mem_activity_acc"))
|
||||
);
|
||||
|
||||
// Timestamp Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp,
|
||||
"firmware_timestamp"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter,
|
||||
"system_clock_counter"))
|
||||
);
|
||||
|
||||
// Throttle Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricThrottleStatus,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_throttle_status,
|
||||
"throttle_status"))
|
||||
);
|
||||
|
||||
// GfxLock Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status,
|
||||
"gfxclk_lock_status"))
|
||||
);
|
||||
|
||||
// Link/Width/Speed Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width,
|
||||
"pcie_link_width"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed,
|
||||
"pcie_link_speed"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width,
|
||||
"xgmi_link_width"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed,
|
||||
"xgmi_link_speed"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc,
|
||||
"pcie_bandwidth_acc"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst,
|
||||
"pcie_bandwidth_inst"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc,
|
||||
"pcie_l0_recov_count_acc"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc,
|
||||
"pcie_replay_count_acc"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc,
|
||||
"pcie_replay_rollover_count_acc"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc,
|
||||
"pcie_nak_sent_count_acc"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc,
|
||||
"pcie_nak_rcvd_count_acc"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc,
|
||||
"[xgmi_read_data_acc]"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc,
|
||||
"[xgmi_write_data_acc]"))
|
||||
);
|
||||
|
||||
// CurrentClock Info
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk,
|
||||
"[current_gfxclk]"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_current_socclk,
|
||||
"[current_socclk]"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_current_vclk0,
|
||||
"[current_vclk0]"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_current_dclk0,
|
||||
"[current_dclk0]"))
|
||||
);
|
||||
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock]
|
||||
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock,
|
||||
format_metric_row(m_gpu_metrics_tbl.m_current_uclk,
|
||||
"current_uclk"))
|
||||
);
|
||||
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Success "
|
||||
<< " | Returning = " << getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_TRACE(ostrstream);
|
||||
|
||||
return status_code;
|
||||
}
|
||||
|
||||
|
||||
void GpuMetricsBase_v14_t::dump_internal_metrics_table()
|
||||
{
|
||||
std::ostringstream ostrstream;
|
||||
@@ -827,6 +1167,10 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
|
||||
std::end(rsmi_gpu_metrics.vcn_activity),
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
|
||||
std::fill(std::begin(rsmi_gpu_metrics.jpeg_activity),
|
||||
std::end(rsmi_gpu_metrics.jpeg_activity),
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
|
||||
rsmi_gpu_metrics.gfxclk_lock_status = init_max_uint_types<decltype(rsmi_gpu_metrics.gfxclk_lock_status)>();
|
||||
rsmi_gpu_metrics.xgmi_link_width = init_max_uint_types<decltype(rsmi_gpu_metrics.xgmi_link_width)>();
|
||||
rsmi_gpu_metrics.xgmi_link_speed = init_max_uint_types<decltype(rsmi_gpu_metrics.xgmi_link_speed)>();
|
||||
@@ -836,35 +1180,33 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
|
||||
rsmi_gpu_metrics.pcie_replay_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_replay_count_acc)>();
|
||||
rsmi_gpu_metrics.pcie_replay_rover_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_replay_rover_count_acc)>();
|
||||
|
||||
std::fill_n(&rsmi_gpu_metrics.xgmi_read_data_acc[0],
|
||||
(sizeof(rsmi_gpu_metrics.xgmi_read_data_acc) /
|
||||
sizeof(rsmi_gpu_metrics.xgmi_read_data_acc[0])),
|
||||
std::numeric_limits<uint64_t>::max());
|
||||
std::fill(std::begin(rsmi_gpu_metrics.xgmi_read_data_acc),
|
||||
std::end(rsmi_gpu_metrics.xgmi_read_data_acc),
|
||||
init_max_uint_types<std::uint64_t>());
|
||||
|
||||
std::fill_n(&rsmi_gpu_metrics.xgmi_write_data_acc[0],
|
||||
(sizeof(rsmi_gpu_metrics.xgmi_write_data_acc) /
|
||||
sizeof(rsmi_gpu_metrics.xgmi_write_data_acc[0])),
|
||||
std::numeric_limits<uint64_t>::max());
|
||||
std::fill(std::begin(rsmi_gpu_metrics.xgmi_write_data_acc),
|
||||
std::end(rsmi_gpu_metrics.xgmi_write_data_acc),
|
||||
init_max_uint_types<std::uint64_t>());
|
||||
|
||||
std::fill_n(&rsmi_gpu_metrics.current_gfxclks[0],
|
||||
(sizeof(rsmi_gpu_metrics.current_gfxclks) /
|
||||
sizeof(rsmi_gpu_metrics.current_gfxclks[0])),
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
std::fill(std::begin(rsmi_gpu_metrics.current_gfxclks),
|
||||
std::end(rsmi_gpu_metrics.current_gfxclks),
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
|
||||
std::fill_n(&rsmi_gpu_metrics.current_socclks[0],
|
||||
(sizeof(rsmi_gpu_metrics.current_socclks) /
|
||||
sizeof(rsmi_gpu_metrics.current_socclks[0])),
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
std::fill(std::begin(rsmi_gpu_metrics.current_socclks),
|
||||
std::end(rsmi_gpu_metrics.current_socclks),
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
|
||||
std::fill_n(&rsmi_gpu_metrics.current_vclk0s[0],
|
||||
(sizeof(rsmi_gpu_metrics.current_vclk0s) /
|
||||
sizeof(rsmi_gpu_metrics.current_vclk0s[0])),
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
std::fill(std::begin(rsmi_gpu_metrics.current_vclk0s),
|
||||
std::end(rsmi_gpu_metrics.current_vclk0s),
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
|
||||
std::fill(std::begin(rsmi_gpu_metrics.current_dclk0s),
|
||||
std::end(rsmi_gpu_metrics.current_dclk0s),
|
||||
init_max_uint_types<std::uint16_t>());
|
||||
|
||||
rsmi_gpu_metrics.pcie_nak_sent_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_sent_count_acc)>();
|
||||
rsmi_gpu_metrics.pcie_nak_rcvd_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_rcvd_count_acc)>();
|
||||
|
||||
std::fill_n(&rsmi_gpu_metrics.current_dclk0s[0],
|
||||
(sizeof(rsmi_gpu_metrics.current_dclk0s) /
|
||||
sizeof(rsmi_gpu_metrics.current_dclk0s[0])),
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
@@ -876,6 +1218,195 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
|
||||
return status_code;
|
||||
}
|
||||
|
||||
/**
 * Builds the public (external) GPU metrics structure from the internal
 * metrics table held in m_gpu_metrics_tbl.
 *
 * The public structure is first handed to init_max_public_gpu_matrics(),
 * then every field carried by this table version is copied across, and
 * finally the legacy scalar clock fields are filled from the first entries
 * of the per-instance clock arrays.
 *
 * @return Tuple of (status code, populated public metrics). The status code
 *         is never modified after initialization, so it is always
 *         RSMI_STATUS_SUCCESS here.
 */
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v15_t::copy_internal_to_external_metrics()
{
  std::ostringstream ostrstream;
  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ostrstream);

  // Short alias for the internal table we are reading from.
  const auto& tbl = m_gpu_metrics_tbl;

  // Fields are initialized to their max value first; a field that still
  // holds its max afterwards had no data assigned to it.
  AMGpuMetricsPublicLatest_t public_metrics{};
  init_max_public_gpu_matrics(public_metrics);

  // ---- Common header ----
  public_metrics.common_header.structure_size = tbl.m_common_header.m_structure_size;
  public_metrics.common_header.format_revision = tbl.m_common_header.m_format_revision;
  public_metrics.common_header.content_revision = tbl.m_common_header.m_content_revision;

  // ---- Temperature ----
  public_metrics.temperature_hotspot = tbl.m_temperature_hotspot;
  public_metrics.temperature_mem = tbl.m_temperature_mem;
  public_metrics.temperature_vrsoc = tbl.m_temperature_vrsoc;

  // ---- Power ----
  public_metrics.current_socket_power = tbl.m_current_socket_power;

  // ---- Utilization ----
  public_metrics.average_gfx_activity = tbl.m_average_gfx_activity;
  public_metrics.average_umc_activity = tbl.m_average_umc_activity;

  // Per-instance VCN activity: copy every element of the internal array.
  std::copy(std::begin(tbl.m_vcn_activity),
            std::end(tbl.m_vcn_activity),
            std::begin(public_metrics.vcn_activity));

  // Per-instance JPEG activity: copy every element of the internal array.
  std::copy(std::begin(tbl.m_jpeg_activity),
            std::end(tbl.m_jpeg_activity),
            std::begin(public_metrics.jpeg_activity));

  // ---- Power/Energy ----
  public_metrics.energy_accumulator = tbl.m_energy_accumulator;

  // ---- Driver attached timestamp (in ns) ----
  public_metrics.system_clock_counter = tbl.m_system_clock_counter;

  // ---- Throttle status ----
  public_metrics.throttle_status = tbl.m_throttle_status;

  // ---- Clock lock status; each bit corresponds to a clock instance ----
  public_metrics.gfxclk_lock_status = tbl.m_gfxclk_lock_status;

  // ---- PCIe link width (number of lanes) and speed ----
  public_metrics.pcie_link_width = tbl.m_pcie_link_width;
  public_metrics.pcie_link_speed = tbl.m_pcie_link_speed;

  // ---- XGMI bus width and bitrate ----
  public_metrics.xgmi_link_width = tbl.m_xgmi_link_width;
  public_metrics.xgmi_link_speed = tbl.m_xgmi_link_speed;

  // ---- Accumulated utilization ----
  public_metrics.gfx_activity_acc = tbl.m_gfx_activity_acc;
  public_metrics.mem_activity_acc = tbl.m_mem_activity_acc;

  // ---- PCIe bandwidth: accumulated, then instantaneous ----
  public_metrics.pcie_bandwidth_acc = tbl.m_pcie_bandwidth_acc;
  public_metrics.pcie_bandwidth_inst = tbl.m_pcie_bandwidth_inst;

  // ---- PCIe accumulated counters ----
  // L0 to recovery state transitions
  public_metrics.pcie_l0_to_recov_count_acc = tbl.m_pcie_l0_to_recov_count_acc;
  // Replays
  public_metrics.pcie_replay_count_acc = tbl.m_pcie_replay_count_acc;
  // Replay rollovers
  public_metrics.pcie_replay_rover_count_acc = tbl.m_pcie_replay_rover_count_acc;
  // NAKs sent
  public_metrics.pcie_nak_sent_count_acc = tbl.m_pcie_nak_sent_count_acc;
  // NAKs received
  public_metrics.pcie_nak_rcvd_count_acc = tbl.m_pcie_nak_rcvd_count_acc;

  // ---- XGMI accumulated data transfer size (read, then write) ----
  std::copy(std::begin(tbl.m_xgmi_read_data_acc),
            std::end(tbl.m_xgmi_read_data_acc),
            std::begin(public_metrics.xgmi_read_data_acc));
  std::copy(std::begin(tbl.m_xgmi_write_data_acc),
            std::end(tbl.m_xgmi_write_data_acc),
            std::begin(public_metrics.xgmi_write_data_acc));

  // ---- PMFW attached timestamp (10ns resolution) ----
  public_metrics.firmware_timestamp = tbl.m_firmware_timestamp;

  // ---- Current clocks (per-instance arrays) ----
  std::copy(std::begin(tbl.m_current_gfxclk),
            std::end(tbl.m_current_gfxclk),
            std::begin(public_metrics.current_gfxclks));
  std::copy(std::begin(tbl.m_current_socclk),
            std::end(tbl.m_current_socclk),
            std::begin(public_metrics.current_socclks));
  std::copy(std::begin(tbl.m_current_vclk0),
            std::end(tbl.m_current_vclk0),
            std::begin(public_metrics.current_vclk0s));
  std::copy(std::begin(tbl.m_current_dclk0),
            std::end(tbl.m_current_dclk0),
            std::begin(public_metrics.current_dclk0s));

  public_metrics.current_uclk = tbl.m_current_uclk;

  // ---- Backwards compatibility with earlier versions (1.3) ----
  // The legacy scalar clock fields mirror entries of the per-instance
  // arrays filled in above. The matching average_*_frequency fields are
  // deliberately not assigned here.
  public_metrics.current_gfxclk = public_metrics.current_gfxclks[0];
  public_metrics.current_socclk = public_metrics.current_socclks[0];
  public_metrics.current_vclk0 = public_metrics.current_vclk0s[0];
  public_metrics.current_vclk1 = public_metrics.current_vclk0s[1];
  public_metrics.current_dclk0 = public_metrics.current_dclk0s[0];
  public_metrics.current_dclk1 = public_metrics.current_dclk0s[1];

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= end ======= "
             << " | Success "
             << " | Returning = " << getRSMIStatusString(status_code)
             << " |";
  LOG_TRACE(ostrstream);

  return std::make_tuple(status_code, public_metrics);
}
|
||||
|
||||
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v14_t::copy_internal_to_external_metrics()
|
||||
{
|
||||
@@ -2154,11 +2685,9 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data()
|
||||
|
||||
// Check if/when metrics table needs to be refreshed.
|
||||
auto now_ts = actual_timestamp_in_secs();
|
||||
if (((!m_gpu_metrics_header.m_structure_size) ||
|
||||
(!m_gpu_metrics_header.m_format_revision) ||
|
||||
(!m_gpu_metrics_header.m_content_revision)) ||
|
||||
((now_ts - m_gpu_metrics_updated_timestamp) >=
|
||||
kRSMI_GPU_METRICS_EXPIRATION_SECS)) {
|
||||
if ((!m_gpu_metrics_header.m_structure_size) ||
|
||||
(!m_gpu_metrics_header.m_format_revision) ||
|
||||
(!m_gpu_metrics_header.m_content_revision)) {
|
||||
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
|
||||
sizeof(AMDGpuMetricsHeader_v1_t),
|
||||
&m_gpu_metrics_header);
|
||||
@@ -2617,7 +3146,7 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
LOG_TRACE(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
}
|
||||
@@ -2727,7 +3256,7 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
LOG_TRACE(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
|
||||
@@ -52,8 +52,8 @@
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
@@ -391,10 +391,6 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
<< "\n | final update: device->bdfid() holds correct device bdf";
|
||||
LOG_TRACE(ss);
|
||||
}
|
||||
if (ret != 0) {
|
||||
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
|
||||
"Failed to initialize rocm_smi library (amdgpu node discovery).");
|
||||
}
|
||||
|
||||
std::shared_ptr<amd::smi::Device> dev;
|
||||
// Sort index based on the BDF, collect BDF id firstly.
|
||||
@@ -437,6 +433,7 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++)
|
||||
io_link_map_[it->first] = it->second;
|
||||
|
||||
|
||||
// Remove any drm nodes that don't have a corresponding readable kfd node.
|
||||
// kfd nodes will not be added if their properties file is not readable.
|
||||
auto dev_iter = devices_.begin();
|
||||
@@ -480,6 +477,7 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
|
||||
logSystemDetails();
|
||||
}
|
||||
|
||||
// Leaving below to help debug temp file issues
|
||||
// displayAppTmpFilesContent();
|
||||
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
|
||||
|
||||
@@ -42,11 +42,14 @@
|
||||
*/
|
||||
#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see
|
||||
// _GNU_SOURCE functions which check
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <dirent.h>
|
||||
#include <dlfcn.h>
|
||||
#include <glob.h>
|
||||
#include <sys/utsname.h>
|
||||
#include <unistd.h>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
||||
Tagairt in Eagrán Nua
Cuir bac ar úsáideoir