[SWDEV-547088] Dynamic GPU Metrics Implementation (#692)

* Added ability to format gpu_metrics v1_9
* The new gpu_metrics format provided by the driver allows amd-smi to parse metrics in a forward-compatible way

---------

Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
Signed-off-by: adapryor <Adam.pryor@amd.com>
Co-authored-by: Oliveira, Daniel <daniel.oliveira@amd.com>
Tá an tiomantas seo le fáil i:
Pryor, Adam
2025-10-01 15:46:10 -05:00
tiomanta ag GitHub
tuismitheoir b727fe1f8b
tiomantas 5ef0b3c34d
D'athraigh 7 comhad le 2165 breiseanna agus 24 scriosta
+2
Féach ar an gComhad
@@ -246,6 +246,7 @@ set(CMN_SRC_LIST
"${ROCM_SRC_DIR}/rocm_smi_kfd.cc"
"${ROCM_SRC_DIR}/rocm_smi_io_link.cc"
"${ROCM_SRC_DIR}/rocm_smi_gpu_metrics.cc"
"${ROCM_SRC_DIR}/rocm_smi_dyn_gpu_metrics.cc"
"${ROCM_SRC_DIR}/rocm_smi.cc"
"${ROCM_SRC_DIR}/rocm_smi_logger.cc"
"${SHR_MUTEX_DIR}/shared_mutex.cc"
@@ -271,6 +272,7 @@ set(CMN_INC_LIST
"${ROCM_INC_DIR}/rocm_smi_kfd.h"
"${ROCM_INC_DIR}/rocm_smi_io_link.h"
"${ROCM_INC_DIR}/rocm_smi_gpu_metrics.h"
"${ROCM_INC_DIR}/rocm_smi_dyn_gpu_metrics.h"
"${ROCM_INC_DIR}/rocm_smi.h"
"${ROCM_INC_DIR}/rocm_smi_logger.h"
"${SHR_MUTEX_DIR}/shared_mutex.h"
+1 -1
Féach ar an gComhad
@@ -1698,7 +1698,7 @@ class AMDSMICommands():
partition_id = "N/A"
num_partition = gpu_metric['num_partition']
if num_partition == "N/A" and isinstance(partition_id, int) and partition_id > 0:
if num_partition == "N/A":
num_partition = 1 # Workaround for XCP metrics not providing num_partition in v1.0
logging.debug(f"num_partition is N/A and partition_id: {partition_id} (greater > 0).\nModified num_partition: {num_partition} to adjust for XCP metrics.")
+5
Féach ar an gComhad
@@ -271,6 +271,8 @@ class Device {
rsmi_status_t get_smi_device_identifiers(uint32_t device_id,
rsmi_device_identifiers_t *device_identifiers);
// Returns the cached m_is_dynamic_gpu_metrics_supported flag for this device.
bool is_dynamic_gpu_metrics_supported() const {
  return m_is_dynamic_gpu_metrics_supported;
}
private:
std::shared_ptr<Monitor> monitor_;
std::shared_ptr<PowerMon> power_monitor_;
@@ -308,6 +310,9 @@ class Device {
uint64_t m_gpu_metrics_updated_timestamp;
uint32_t m_device_id;
uint32_t m_partition_id;
// New dynamic GPU metrics support
bool m_is_dynamic_gpu_metrics_supported = false;
};
Tá difríocht comhad cosc orthu toisc go bhfuil sé ró-mhór Difríocht Luchtaigh
+40 -4
Féach ar an gComhad
@@ -25,6 +25,7 @@
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_dyn_gpu_metrics.h"
#include <array>
#include <algorithm>
@@ -42,7 +43,6 @@
#include <variant>
#include <vector>
/**
* All 1.4 and newer GPU metrics are now defined in this header.
*
@@ -89,12 +89,12 @@ constexpr uint32_t kRSMI_MAX_NUM_XCC = 8;
// Note: This *must* match MAX_XCP
constexpr uint32_t kRSMI_MAX_NUM_XCP = 8;
// Version-1 metrics header: a 16-bit structure size followed by two 8-bit
// revision numbers (format, then content).
// NOTE(review): the field order and widths presumably mirror the driver's
// binary layout -- confirm before reordering or resizing any member.
struct AMDGpuMetricsHeader_v1_t {
// NOTE(review): presumably the size in bytes of the whole metrics table -- confirm.
uint16_t m_structure_size;
// Format revision number of the table.
uint8_t m_format_revision;
// Content revision number of the table.
uint8_t m_content_revision;
};
struct amdgpu_xcp_metrics {
/* Utilization Instantaneous (%) */
uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC];
@@ -788,7 +788,7 @@ struct AMDGpuMetrics_v18_t {
/* PCIE other end recovery counter */
uint32_t m_pcie_lc_perf_other_end_recovery;
};
using AMGpuMetricsLatest_t = AMDGpuMetrics_v18_t;
using AMGpuMetricsLatest_t = AMDGpuDynamicMetrics_t;
/**
* This is GPU Metrics version that gets to public access.
@@ -1053,7 +1053,8 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t
kGpuMetricV15 = (0x1 << 5),
kGpuMetricV16 = (0x1 << 6),
kGpuMetricV17 = (0x1 << 7),
kGpuMetricV18 = (0x1 << 8), // Added new version flag
kGpuMetricV18 = (0x1 << 8), // Added new version flag: Last static GPU Metrics
kGpuMetricV19 = (0x1 << 9), // Dyn.GPU Metrics
};
using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>;
using GpuMetricTypePtr_t = std::shared_ptr<void>;
@@ -1311,6 +1312,41 @@ class GpuMetricsBase_v18_t final : public GpuMetricsBase_t {
std::shared_ptr<AMDGpuMetrics_v18_t> m_gpu_metric_ptr;
};
/**
 * @brief Metrics backend for the dynamic gpu_metrics format (gpu_metrics v1.9).
 *
 * Unlike the fixed-layout GpuMetricsBase_vNN_t classes, this class holds a
 * parsed AMDGpuDynamicMetrics_t plus a copy of its header instead of a static
 * metrics struct, so the static-table accessors are stubs.
 */
class GpuMetricsBaseDynamic_t final : public GpuMetricsBase_t {
 public:
  ~GpuMetricsBaseDynamic_t() = default;

  /// Unused for dynamic metrics: there is no fixed-size metrics struct.
  size_t sizeof_metric_table() override { return 0; }

  /// Unused for dynamic metrics: there is no static metrics table to expose.
  GpuMetricTypePtr_t get_metrics_table() override { return nullptr; }

  /**
   * @brief Translates the stored header into a version flag.
   *
   * The flag for content revision N is bit N (1 << N). kGpuMetricNone is
   * returned when the format revision is not 1, or when N is not smaller than
   * the bit width of the flag's underlying type: shifting by the full width or
   * more is undefined behaviour, and the revision comes straight from the
   * driver-provided header.
   */
  AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override {
    constexpr auto kFlagBits =
        static_cast<unsigned>(sizeof(AMDGpuMetricVersionFlagId_t) * 8u);
    const auto revision = static_cast<unsigned>(m_header.m_content_revision);
    if (m_header.m_format_revision != 1 || revision >= kFlagBits) {
      return AMDGpuMetricVersionFlags_t::kGpuMetricNone;
    }
    // Shift in the flag's own underlying type so every valid bit is reachable.
    return static_cast<AMDGpuMetricVersionFlags_t>(
        static_cast<AMDGpuMetricVersionFlagId_t>(1) << revision);
  }

  /**
   * @brief Takes ownership of an already parsed dynamic metrics object.
   *
   * Moves @p parsed into this instance and caches its header.
   * @return Always RSMI_STATUS_SUCCESS.
   */
  inline rsmi_status_t set_parsed_dynamic(AMDGpuDynamicMetrics_t&& parsed) noexcept {
    m_dyn = std::move(parsed);
    m_header = m_dyn.get_header();
    return rsmi_status_t::RSMI_STATUS_SUCCESS;
  }

  rsmi_status_t populate_metrics_dynamic_tbl() override;
  AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;

 private:
  /// Parsed dynamic metrics (rows are read via get_metric_rows()).
  AMDGpuDynamicMetrics_t m_dyn;
  /// Copy of the header taken from m_dyn in set_parsed_dynamic().
  details::AMDGpuDynamicMetricsHeader_v1_t m_header{};
};
template<typename T>
rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind,
AMDGpuMetricsUnitType_t metric_counter, T& metric_value);
+333
Féach ar an gComhad
@@ -0,0 +1,333 @@
/*
* MIT License
*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Developed by:
*
* AMD ML Software Engineering
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of Advanced Micro Devices, Inc,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*
*/
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_dyn_gpu_metrics.h"
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi_utils.h"
#include <cstddef>
#include <cstring>
#include <shared_mutex>
#include <optional>
namespace amd::smi
{
using namespace details;
// Read position over a raw byte buffer. The readers in this file advance
// `byte_ptr` and decrease `remainder` by the same number of bytes each time
// they consume data, and refuse to read when `remainder` is too small.
struct Cursor {
// Next unread byte.
const std::byte* byte_ptr;
// Number of bytes that may still be consumed starting at byte_ptr.
std::size_t remainder;
};
// Steps the cursor over the payload of an attribute that will not be decoded
// (used on a schema mismatch). The payload is `instances` elements of the
// width reported by get_metric_bytes(t). Returns false, leaving the cursor
// untouched, when the width is zero, the byte count would overflow size_t, or
// the buffer does not hold that many bytes.
static inline bool skip_payload(Cursor& cur,
                                AMDGpuMetricAttributeType_t t,
                                uint64_t instances) {
  const std::size_t width = get_metric_bytes(t);
  if (width == 0) {
    return false;
  }
  const bool would_overflow = instances > std::numeric_limits<size_t>::max() / width;
  if (would_overflow) {
    return false;
  }

  const std::size_t payload = static_cast<std::size_t>(instances) * width;
  const bool fits = !(cur.remainder < payload);
  if (fits) {
    cur.byte_ptr += payload;
    cur.remainder -= payload;
  }
  return fits;
}
// Resolves (attr_id, attr_type) against AMDGpuMetricsBaseSchema.
//   RSMI_STATUS_NOT_FOUND     - attr_id has no schema entry
//   RSMI_STATUS_NOT_SUPPORTED - entry exists but its attribute type differs
//   RSMI_STATUS_SUCCESS       - schema_inst receives a copy of the schema instance
// schema_inst is only written on success.
static inline rsmi_status_t schema_lookup_instance( AMDGpuMetricAttributeId_t attr_id,
                                                    AMDGpuMetricAttributeType_t attr_type,
                                                    AMDGpuMetricAttributeInstance_t& schema_inst) {
  const auto entry = AMDGpuMetricsBaseSchema.find(attr_id);
  if (entry == AMDGpuMetricsBaseSchema.end()) {
    return RSMI_STATUS_NOT_FOUND;
  }

  const auto& known = entry->second.m_instance;
  if (!(known.m_attribute_type == attr_type)) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  schema_inst = known;
  return RSMI_STATUS_SUCCESS;
}
// Copies one T out of the buffer at the cursor and advances the cursor by
// sizeof(T). Yields std::nullopt, without moving the cursor, when fewer than
// sizeof(T) bytes remain. memcpy is used so the source need not be aligned.
template <class T>
static inline std::optional<T> read_scalar(Cursor& c) {
  std::optional<T> result;
  if (!(c.remainder < sizeof(T))) {
    T decoded{};
    std::memcpy(&decoded, c.byte_ptr, sizeof(T));
    c.byte_ptr += sizeof(T);
    c.remainder -= sizeof(T);
    result = decoded;
  }
  return result;
}
// Copies `count` consecutive T elements out of the buffer at the cursor and
// advances the cursor past them. Yields std::nullopt, without moving the
// cursor, when count is zero, when count * sizeof(T) would overflow size_t,
// or when the buffer holds fewer bytes than requested.
template <class T>
static inline std::optional<std::vector<T>> read_vector(Cursor& c, std::size_t count) {
  static_assert(std::is_integral_v<T> && std::is_trivially_copyable_v<T>,
                "metrics expect integral element types");

  if (count == 0) {
    return std::nullopt;
  }
  // Guard the multiplication below against size_t overflow.
  if (count > SIZE_MAX / sizeof(T)) {
    return std::nullopt;
  }

  const std::size_t total = count * sizeof(T);
  if (c.remainder < total) {
    return std::nullopt;
  }

  std::vector<T> values(count);
  std::memcpy(values.data(), c.byte_ptr, total);
  c.byte_ptr += total;
  c.remainder -= total;
  return values;
}
// Reads an attribute payload of element type T into an
// AMDGpuMetricAttributeValue_t: a single T when `instances` is exactly 1,
// otherwise a std::vector<T> holding `instances` elements. Yields
// std::nullopt when the underlying read fails.
template <typename T>
static inline std::optional<AMDGpuMetricAttributeValue_t> read_metric_value(Cursor& c,
                                                                           uint64_t instances) {
  if (instances != 1) {
    auto many = read_vector<T>(c, static_cast<std::size_t>(instances));
    if (!many) {
      return std::nullopt;
    }
    return AMDGpuMetricAttributeValue_t{std::move(*many)};
  }

  auto single = read_scalar<T>(c);
  if (!single) {
    return std::nullopt;
  }
  return AMDGpuMetricAttributeValue_t{*single};
}
// Parses a dynamic gpu_metrics blob held in memory.
//
// Layout consumed by this function, in order:
//   1. an AMDGpuDynamicMetricsHeader_v1_t,
//   2. a 32-bit attribute count,
//   3. for each attribute: a 64-bit encoded descriptor (decoded by
//      amdgpu_metrics_decode_attr into type, id and instance count) followed
//      by `instances` values of that type.
//
// Attributes that are missing from the schema, or whose type differs from the
// schema, are logged and skipped. Attributes that match are stored as rows.
// The object's state is only replaced (under m_mutex) after the whole buffer
// parsed successfully, so a failed parse leaves the previous state in place.
//
// Returns:
//   RSMI_STATUS_INSUFFICIENT_SIZE - data is null, or size is smaller than the
//                                   header plus the attribute count; also the
//                                   `default:` case of the type switch
//   RSMI_STATUS_UNEXPECTED_SIZE   - truncated or inconsistent contents
//   schema lookup status          - payload of an unknown attribute could not
//                                   be skipped
//   RSMI_STATUS_SUCCESS           - otherwise
//
// NOTE(review): the function is declared noexcept but allocates (reserve,
// emplace_back, ostringstream); an allocation failure would terminate the
// process rather than surface as a status code -- confirm this is intended.
auto AMDGpuDynamicMetrics_t::parse_from_buffer(const std::byte* data,
std::size_t size) noexcept -> rsmi_status_t {
rsmi_status_t status = RSMI_STATUS_SUCCESS;
// Need at least the header and the 32-bit attribute count.
if (!data || (size < (sizeof(AMDGpuDynamicMetricsHeader_v1_t) + sizeof(uint32_t)))) {
return RSMI_STATUS_INSUFFICIENT_SIZE;
}
// Grab header (memcpy: the buffer is not assumed to be suitably aligned)
details::AMDGpuDynamicMetricsHeader_v1_t hdr{};
std::memcpy(&hdr, data, sizeof(hdr));
// Advance metrics pointer past header and keep track of remaining file size
Cursor cur{ (data + sizeof(hdr)), (size - sizeof(hdr)) };
// Grab attribute count, directly after header and increment
auto attr_count_opt = read_scalar<uint32_t>(cur);
if (!attr_count_opt) {
return RSMI_STATUS_UNEXPECTED_SIZE;
}
uint32_t attr_count = *attr_count_opt;
// Sanity bound: a count larger than the buffer size in bytes cannot be valid,
// and it also caps the reserve() below.
if (attr_count == 0 || attr_count > size){
return RSMI_STATUS_UNEXPECTED_SIZE;
}
// Parse into locals first; they are swapped into the members at the end.
details::AMDGpuMetricSchemaType_t metrics_data;
metrics_data.reserve(attr_count);
AMDGpuDynamicMetricsOffsetMap_t offsets;
for (uint32_t i = 0; i < attr_count; ++i) {
// Each attribute starts with a 64-bit encoded descriptor.
if (cur.remainder < sizeof(uint64_t)) {
return RSMI_STATUS_UNEXPECTED_SIZE;
}
// Absolute offset for attribute start in file
const std::size_t entry_start = static_cast<std::size_t>(cur.byte_ptr - data);
// Read attribute instance and increment
auto enc_opt = read_scalar<uint64_t>(cur);
if (!enc_opt) {
return RSMI_STATUS_UNEXPECTED_SIZE;
}
const uint64_t enc = *enc_opt;
const auto dec = amdgpu_metrics_decode_attr(enc);
const auto attr_type = static_cast<AMDGpuMetricAttributeType_t>(dec.m_attr_type);
const auto attr_id = static_cast<AMDGpuMetricAttributeId_t>(dec.m_attr_id);
const auto instances = static_cast<uint64_t>(dec.m_attr_instance);
// An attribute with no values is treated as malformed input.
if (instances == 0) {
return RSMI_STATUS_UNEXPECTED_SIZE;
}
// Schema lookup
AMDGpuMetricAttributeInstance_t inst{};
status = schema_lookup_instance(attr_id, attr_type, inst);
if (status != RSMI_STATUS_SUCCESS){
// Unknown id or mismatching type: log it, step over the payload and carry on
// with the next attribute. If the payload cannot be skipped the cursor can no
// longer be trusted, so parsing stops with the lookup status.
std::ostringstream ss;
ss << __PRETTY_FUNCTION__
<< " | Warn: schema lookup miss"
<< " | Attr ID: " << static_cast<std::underlying_type_t<AMDGpuMetricAttributeId_t>>(attr_id)
<< " | Attr Type: " << static_cast<std::underlying_type_t<AMDGpuMetricAttributeType_t>>(attr_type)
<< " | Returning = " << getRSMIStatusString(status)
<< " |";
LOG_TRACE(ss);
if (!skip_payload(cur, attr_type, instances)){
return status;
}
continue; // Do not emit row, go to next attribute
}
// Read scalar or all vector values after attribute instance
AMDGpuMetricAttributeValue_t val{};
std::optional<AMDGpuMetricAttributeValue_t> mv;
switch (attr_type) {
case AMDGpuMetricAttributeType_t::TYPE_UINT8: {
mv = read_metric_value<std::uint8_t>(cur, instances);
break;
}
case AMDGpuMetricAttributeType_t::TYPE_INT8: {
mv = read_metric_value<std::int8_t>(cur, instances);
break;
}
case AMDGpuMetricAttributeType_t::TYPE_UINT16: {
mv = read_metric_value<std::uint16_t>(cur, instances);
break;
}
case AMDGpuMetricAttributeType_t::TYPE_INT16: {
mv = read_metric_value<std::int16_t>(cur, instances);
break;
}
case AMDGpuMetricAttributeType_t::TYPE_UINT32: {
mv = read_metric_value<std::uint32_t>(cur, instances);
break;
}
case AMDGpuMetricAttributeType_t::TYPE_INT32: {
mv = read_metric_value<std::int32_t>(cur, instances);
break;
}
case AMDGpuMetricAttributeType_t::TYPE_UINT64: {
mv = read_metric_value<std::uint64_t>(cur, instances);
break;
}
case AMDGpuMetricAttributeType_t::TYPE_INT64: {
mv = read_metric_value<std::int64_t>(cur, instances);
break;
}
// NOTE(review): this is reached for a type that is in the schema but not
// handled here; INSUFFICIENT_SIZE reads oddly for that case -- confirm.
default: return RSMI_STATUS_INSUFFICIENT_SIZE;
}
// Payload shorter than the descriptor promised.
if (!mv) {
return RSMI_STATUS_UNEXPECTED_SIZE;
}
val = std::move(*mv); // safely set val
// Record the row and remember which buffer offset it came from.
const uint32_t row_index = static_cast<uint32_t>(metrics_data.size());
metrics_data.emplace_back(inst, val);
offsets.try_emplace(entry_start, row_index);
}
{
// Publish the parsed state in one step while holding the writer lock.
// NOTE(review): m_attr_count stores the count read from the buffer, which
// includes attributes skipped above, so it can exceed the number of rows.
std::unique_lock<std::shared_mutex> lk(m_mutex);
m_header = hdr;
m_attr_count = attr_count;
m_dynamic_metrics_data.swap(metrics_data);
m_dynamic_metrics_data_offsets.swap(offsets);
}
return RSMI_STATUS_SUCCESS;
}
// Reads up to read_size bytes from metrics_file_path and parses them with
// parse_from_buffer(). A failure while reading the file is returned as-is;
// otherwise the result of the parse is returned.
auto AMDGpuDynamicMetrics_t::parse_from_file(const std::string& metrics_file_path,
                                             std::size_t read_size) -> rsmi_status_t {
  AMDGPUMetricsDynDataBuffer_t raw;
  const auto read_status = read_dynamic_gpu_metrics_file(metrics_file_path, read_size, raw);
  if (read_status == RSMI_STATUS_SUCCESS) {
    const auto* bytes = reinterpret_cast<const std::byte*>(raw.data());
    return parse_from_buffer(bytes, raw.size());
  }
  return read_status;
}
// Reads at most read_size bytes of the binary file at metrics_file_path into
// `out`. `out` is always cleared first and, on success, is sized to the number
// of bytes actually read (which may be less than read_size).
//   RSMI_STATUS_NOT_FOUND       - the file could not be opened
//   RSMI_STATUS_UNEXPECTED_SIZE - read_size is zero
//   RSMI_STATUS_NO_DATA         - nothing could be read; `out` is left empty
//   RSMI_STATUS_SUCCESS         - at least one byte was read
rsmi_status_t read_dynamic_gpu_metrics_file(const std::string& metrics_file_path,
                                            const size_t read_size,
                                            AMDGPUMetricsDynDataBuffer_t& out) {
  out.clear();

  std::ifstream input(metrics_file_path, std::ios::binary);
  if (!input.is_open()) {
    return RSMI_STATUS_NOT_FOUND;
  }
  // read_size is unsigned, so "not positive" means exactly zero.
  if (read_size == 0) {
    return RSMI_STATUS_UNEXPECTED_SIZE;
  }

  out.resize(read_size);
  input.read(reinterpret_cast<char*>(out.data()),
             static_cast<std::streamsize>(read_size));

  const std::streamsize bytes_read = input.gcount();
  if (bytes_read > 0) {
    // Trim the buffer down to what the file actually provided.
    out.resize(static_cast<std::size_t>(bytes_read));
    return RSMI_STATUS_SUCCESS;
  }

  out.clear();
  return RSMI_STATUS_NO_DATA;
}
} // namespace amd::smi
+531 -19
Féach ar an gComhad
@@ -21,6 +21,7 @@
*/
#include "rocm_smi/rocm_smi_gpu_metrics.h"
#include "rocm_smi/rocm_smi_dyn_gpu_metrics.h" // Dynamic metrics
#include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_main.h"
@@ -46,6 +47,8 @@
#include <tuple>
#include <type_traits>
#include <vector>
#include <cstddef>
#include <variant>
using namespace amd::smi;
@@ -145,6 +148,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl
{join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16},
{join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17},
{join_metrics_version(1, 8), AMDGpuMetricVersionFlags_t::kGpuMetricV18},
{join_metrics_version(1, 9), AMDGpuMetricVersionFlags_t::kGpuMetricV19}, // Dynamic GPU Metrics
};
/**
@@ -365,6 +369,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table
{AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared<GpuMetricsBase_v16_t>(GpuMetricsBase_v16_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared<GpuMetricsBase_v17_t>(GpuMetricsBase_v17_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV18, std::make_shared<GpuMetricsBase_v18_t>(GpuMetricsBase_v18_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV19, std::make_shared<GpuMetricsBaseDynamic_t>()},
};
GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version)
@@ -417,6 +422,58 @@ constexpr T init_max_uint_types()
}
}
// Maps a dynamic-metrics attribute type onto the public metric data type.
// TYPE_UINT8/16/32 map to the data type of the same width; every other
// attribute type (including all signed types) maps to kUInt64.
AMDGpuMetricsDataType_t dtype_from_attr(details::AMDGpuMetricAttributeType_t t) {
  using AttrType = details::AMDGpuMetricAttributeType_t;

  if (t == AttrType::TYPE_UINT8) {
    return AMDGpuMetricsDataType_t::kUInt8;
  }
  if (t == AttrType::TYPE_UINT16) {
    return AMDGpuMetricsDataType_t::kUInt16;
  }
  if (t == AttrType::TYPE_UINT32) {
    return AMDGpuMetricsDataType_t::kUInt32;
  }
  return AMDGpuMetricsDataType_t::kUInt64;
}
// Converts an integral value to uint64_t. Unsigned inputs are widened
// directly. Signed inputs are first converted to a signed type of at least
// 64 bits and then to uint64_t, so a negative value becomes its 64-bit
// two's-complement bit pattern (e.g. int8_t(-1) -> 0xFFFFFFFFFFFFFFFF).
template<typename Tp>
constexpr uint64_t safe_way_to_uint64(Tp value) {
  if constexpr (!std::is_signed_v<Tp>) {
    return static_cast<uint64_t>(value);
  } else {
    using widened_type = std::conditional_t<sizeof(Tp) <= sizeof(int64_t), int64_t, std::make_signed_t<Tp>>;
    const widened_type widened = static_cast<widened_type>(value);
    return static_cast<uint64_t>(widened);
  }
}
// Vector overload of format_metric_row (the scalar overload does not accept
// vectors). Produces one table value per element: the element widened to
// uint64_t, an info string "<value_title> : <index>", and the data type
// derived from attr_type. At most 65535 (uint16_t max) elements are emitted.
template<typename T, typename A>
AMDGpuDynamicMetricTblValues_t
format_metric_row(const std::vector<T, A>& vec, const std::string& value_title, details::AMDGpuMetricAttributeType_t attr_type)
{
  const auto row_count = static_cast<uint16_t>(
      std::min<std::size_t>(vec.size(), std::numeric_limits<uint16_t>::max()));
  // The data type is the same for every element, so resolve it once.
  const auto original_type = dtype_from_attr(attr_type);

  AMDGpuDynamicMetricTblValues_t rows;
  rows.reserve(vec.size());

  uint16_t element = 0;
  while (element < row_count) {
    AMDGpuDynamicMetricsValue_t entry{};
    entry.m_value = safe_way_to_uint64(vec[element]);
    entry.m_info = value_title + " : " + std::to_string(element);
    entry.m_original_type = original_type;
    rows.emplace_back(std::move(entry));
    ++element;
  }
  return rows;
}
// Type trait: value is true only for (cv-unqualified) std::vector
// specializations, false for every other type.
template<class T>
struct is_vector : std::bool_constant<false> {};

template<class Elem, class Alloc>
struct is_vector<std::vector<Elem, Alloc>> : std::bool_constant<true> {};
template<typename T>
AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::string& value_title)
{
@@ -483,6 +540,235 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str
return multi_values;
}
/**
 * @brief Rebuilds the dynamic metrics table from the parsed dynamic rows.
 *
 * Every row returned by m_dyn.get_metric_rows() is looked up by attribute id
 * in a fixed mapping table that supplies the metric class, the metric unit and
 * the label used for the formatted values. Scalar values go through the scalar
 * format_metric_row overload and vector values through the vector overload.
 * Rows whose attribute id is not in the mapping are logged and ignored.
 *
 * The table is assembled in a local and then moved into
 * m_base_metrics_dynamic_tbl while holding s_base_tbl_mu, so the lock is held
 * only for the hand-over, not for the formatting work.
 *
 * @return Always RSMI_STATUS_SUCCESS.
 */
rsmi_status_t GpuMetricsBaseDynamic_t::populate_metrics_dynamic_tbl() {
  using AttrId = details::AMDGpuMetricAttributeId_t;
  using ClassId = AMDGpuMetricsClassId_t;
  using Unit = AMDGpuMetricsUnitType_t;

  // One entry per attribute this backend knows how to publish.
  struct AttrMapping {
    AttrId attr_id;
    ClassId class_id;
    Unit unit;
    const char* label;
  };

  static constexpr AttrMapping kAttrMappings[] = {
    // Power energy and temperature
    {AttrId::TEMPERATURE_HOTSPOT, ClassId::kGpuMetricTemperature, Unit::kMetricTempHotspot, "temperature_hotspot"},
    {AttrId::TEMPERATURE_MEM, ClassId::kGpuMetricTemperature, Unit::kMetricTempMem, "temperature_mem"},
    {AttrId::TEMPERATURE_VRSOC, ClassId::kGpuMetricTemperature, Unit::kMetricTempVrSoc, "temperature_vrsoc"},
    {AttrId::CURR_SOCKET_POWER, ClassId::kGpuMetricPowerEnergy, Unit::kMetricCurrSocketPower, "curr_socket_power"},
    {AttrId::ENERGY_ACCUMULATOR, ClassId::kGpuMetricPowerEnergy, Unit::kMetricEnergyAccumulator, "energy_acc"},
    // Utilization
    {AttrId::AVERAGE_GFX_ACTIVITY, ClassId::kGpuMetricUtilization, Unit::kMetricAvgGfxActivity, "average_gfx_activity"},
    {AttrId::AVERAGE_UMC_ACTIVITY, ClassId::kGpuMetricUtilization, Unit::kMetricAvgUmcActivity, "average_umc_activity"},
    {AttrId::GFX_ACTIVITY_ACC, ClassId::kGpuMetricUtilization, Unit::kMetricGfxActivityAccumulator, "gfx_activity_acc"},
    {AttrId::MEM_ACTIVITY_ACC, ClassId::kGpuMetricUtilization, Unit::kMetricMemActivityAccumulator, "mem_activity_acc"},
    {AttrId::GFXCLK_LOCK_STATUS, ClassId::kGpuMetricGfxClkLockStatus, Unit::kMetricGfxClkLockStatus, "gfxclk_lock_status"},
    // Metric Timestamp
    {AttrId::FIRMWARE_TIMESTAMP, ClassId::kGpuMetricTimestamp, Unit::kMetricTSFirmware, "firmware_timestamp"},
    {AttrId::SYSTEM_CLOCK_COUNTER, ClassId::kGpuMetricTimestamp, Unit::kMetricTSClockCounter, "system_clock_counter"},
    // Throttle Residency
    {AttrId::ACCUMULATION_COUNTER, ClassId::kGpuMetricThrottleResidency, Unit::kMetricAccumulationCounter, "accumulation_counter"},
    // Link Width Speed
    {AttrId::PCIE_LINK_WIDTH, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieLinkWidth, "pcie_link_width"},
    {AttrId::PCIE_LINK_SPEED, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieLinkSpeed, "pcie_link_speed"},
    {AttrId::XGMI_LINK_WIDTH, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiLinkWidth, "xgmi_link_width"},
    {AttrId::XGMI_LINK_SPEED, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiLinkSpeed, "xgmi_link_speed"},
    {AttrId::PCIE_BANDWIDTH_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieBandwidthAccumulator, "pcie_bandwidth_acc"},
    {AttrId::PCIE_BANDWIDTH_INST, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieBandwidthInst, "pcie_bandwidth_inst"},
    {AttrId::PCIE_L0_TO_RECOV_COUNT_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieL0RecovCountAccumulator, "pcie_l0_recov_count_acc"},
    {AttrId::PCIE_REPLAY_COUNT_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieReplayCountAccumulator, "pcie_replay_count_acc"},
    {AttrId::PCIE_REPLAY_ROVER_COUNT_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieReplayRollOverCountAccumulator, "pcie_replay_rollover_count_acc"},
    {AttrId::PCIE_NAK_SENT_COUNT_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieNakSentCountAccumulator, "pcie_nak_sent_count_acc"},
    {AttrId::PCIE_NAK_RCVD_COUNT_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricPcieNakReceivedCountAccumulator, "pcie_nak_rcvd_count_acc"},
    {AttrId::XGMI_READ_DATA_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiReadDataAccumulator, "xgmi_read_data_acc"},
    {AttrId::XGMI_WRITE_DATA_ACC, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiWriteDataAccumulator, "xgmi_write_data_acc"},
    {AttrId::XGMI_LINK_STATUS, ClassId::kGpuMetricLinkWidthSpeed, Unit::kMetricXgmiLinkStatus, "xgmi_link_status"},
    // Current Clock
    {AttrId::CURRENT_GFXCLK, ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrGfxClock, "current_gfxclk"},
    {AttrId::CURRENT_SOCCLK, ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrSocClock, "current_socclk"},
    {AttrId::CURRENT_VCLK0, ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrVClock0, "current_vclk0"},
    {AttrId::CURRENT_DCLK0, ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrDClock0, "current_dclk0"},
    {AttrId::CURRENT_UCLK, ClassId::kGpuMetricCurrentClock, Unit::kMetricCurrUClock, "current_uclk"},
    // Throttle Residency
    {AttrId::PROCHOT_RESIDENCY_ACC, ClassId::kGpuMetricThrottleResidency, Unit::kMetricProchotResidencyAccumulator, "prochot_residency_acc"},
    {AttrId::PPT_RESIDENCY_ACC, ClassId::kGpuMetricThrottleResidency, Unit::kMetricPPTResidencyAccumulator, "ppt_residency_acc"},
    {AttrId::SOCKET_THM_RESIDENCY_ACC, ClassId::kGpuMetricThrottleResidency, Unit::kMetricSocketThmResidencyAccumulator, "socket_thm_residency_acc"},
    {AttrId::VR_THM_RESIDENCY_ACC, ClassId::kGpuMetricThrottleResidency, Unit::kMetricVRThmResidencyAccumulator, "vr_thm_residency_acc"},
    {AttrId::HBM_THM_RESIDENCY_ACC, ClassId::kGpuMetricThrottleResidency, Unit::kMetricHBMThmResidencyAccumulator, "hbm_thm_residency_acc"},
    // XCP stats
    {AttrId::GFX_BUSY_INST, ClassId::kGpuMetricXcpStats, Unit::kMetricGfxBusyInst, "xcp_stats->gfx_busy_inst"},
    {AttrId::JPEG_BUSY, ClassId::kGpuMetricXcpStats, Unit::kMetricJpegBusy, "xcp_stats->jpeg_busy"},
    {AttrId::VCN_BUSY, ClassId::kGpuMetricXcpStats, Unit::kMetricVcnBusy, "xcp_stats->vcn_busy"},
    {AttrId::GFX_BUSY_ACC, ClassId::kGpuMetricXcpStats, Unit::kMetricGfxBusyAcc, "xcp_stats->gfx_busy_acc"},
  };

  std::ostringstream ss;
  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  ss << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ss);

  // Built locally and handed over to the base-class table at the end.
  auto dynamic_tbl = AMDGpuDynamicMetricsTbl_t{};

  // Formats one parsed row and files it under the mapping's class and unit.
  auto emit = [&](const AttrMapping& mapping,
                  const details::AMDGpuMetricAttributeData_t& row) {
    auto rows = std::visit([&](const auto& x) -> AMDGpuDynamicMetricTblValues_t {
      using S = std::decay_t<decltype(x)>;
      // Both branches must be well-formed per alternative, hence if constexpr.
      if constexpr (is_vector<S>::value) {
        return format_metric_row(x, std::string(mapping.label), row.m_instance.m_attribute_type);
      } else {
        return format_metric_row(x, std::string(mapping.label));
      }
    }, row.m_value);
    dynamic_tbl[mapping.class_id].insert({mapping.unit, std::move(rows)});
  };

  for (const auto& r : m_dyn.get_metric_rows()) {
    const AttrMapping* match = nullptr;
    for (const auto& candidate : kAttrMappings) {
      if (candidate.attr_id == r.m_instance.m_attribute_id) {
        match = &candidate;
        break;
      }
    }

    if (match == nullptr) {
      ss << __PRETTY_FUNCTION__
         << " UNKNOWN Attribute "
         << static_cast<uint32_t>(r.m_instance.m_attribute_id)
         << " |";
      LOG_ERROR(ss);
      continue;
    }
    emit(*match, r);
  }

  ss << __PRETTY_FUNCTION__ << " | ======= end ======= "
     << " | Success "
     << " | Returning = " << getRSMIStatusString(status_code) << " |";
  LOG_TRACE(ss);

  {
    std::lock_guard<std::mutex> lk(s_base_tbl_mu);
    // The local table is not used afterwards, so move instead of deep-copying.
    this->m_base_metrics_dynamic_tbl = std::move(dynamic_tbl);
  }

  return status_code;
}
rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() {
std::ostringstream ss;
@@ -1843,6 +2129,199 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
return status_code;
}
AMGpuMetricsPublicLatestTupl_t GpuMetricsBaseDynamic_t::copy_internal_to_external_metrics() {
std::ostringstream ss;
auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
AMGpuMetricsPublicLatest_t out{};
init_max_public_gpu_matrics(out);
out.common_header.structure_size = m_header.m_structure_size;
out.common_header.format_revision = m_header.m_format_revision;
out.common_header.content_revision= m_header.m_content_revision;
auto assign_by_type = [&](auto& dst,
const details::AMDGpuMetricAttributeData_t& r) {
using D = std::decay_t<decltype(dst)>;
std::visit([&](const auto& x) {
using S = std::decay_t<decltype(x)>;
if constexpr (std::is_integral_v<S>) {
dst = static_cast<D>(x);
}
}, r.m_value);
};
auto assign_vector = [&]( auto& dst,
const details::AMDGpuMetricAttributeData_t& r,
std::size_t cap) {
using Dst = std::remove_reference_t<decltype(dst)>;
using T = std::remove_cv_t<std::remove_extent_t<Dst>>;
auto v = std::get_if<std::vector<T>>(&r.m_value);
const std::size_t n = std::min<std::size_t>(v->size(), cap);
std::copy_n(v->data(), n, dst);
};
for (const auto& r : m_dyn.get_metric_rows()) {
switch (r.m_instance.m_attribute_id) {
// Temps
case details::AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT:
assign_by_type(out.temperature_hotspot, r); break;
case details::AMDGpuMetricAttributeId_t::TEMPERATURE_MEM:
assign_by_type(out.temperature_mem, r); break;
case details::AMDGpuMetricAttributeId_t::TEMPERATURE_VRSOC:
assign_by_type(out.temperature_vrsoc, r); break;
// Power/Energy
case details::AMDGpuMetricAttributeId_t::CURR_SOCKET_POWER:
assign_by_type(out.current_socket_power, r); break;
case details::AMDGpuMetricAttributeId_t::ENERGY_ACCUMULATOR:
assign_by_type(out.energy_accumulator, r); break;
// Utilization
case details::AMDGpuMetricAttributeId_t::AVERAGE_GFX_ACTIVITY:
assign_by_type(out.average_gfx_activity, r); break;
case details::AMDGpuMetricAttributeId_t::AVERAGE_UMC_ACTIVITY:
assign_by_type(out.average_umc_activity, r); break;
case details::AMDGpuMetricAttributeId_t::GFX_ACTIVITY_ACC:
assign_by_type(out.gfx_activity_acc, r); break;
case details::AMDGpuMetricAttributeId_t::MEM_ACTIVITY_ACC:
assign_by_type(out.mem_activity_acc, r); break;
// Timestamps / Lock
case details::AMDGpuMetricAttributeId_t::SYSTEM_CLOCK_COUNTER:
assign_by_type(out.system_clock_counter, r); break;
case details::AMDGpuMetricAttributeId_t::FIRMWARE_TIMESTAMP:
assign_by_type(out.firmware_timestamp, r); break;
case details::AMDGpuMetricAttributeId_t::GFXCLK_LOCK_STATUS:
assign_by_type(out.gfxclk_lock_status, r); break;
// Link width/speed, bandwidth, counts
case details::AMDGpuMetricAttributeId_t::PCIE_LINK_WIDTH:
assign_by_type(out.pcie_link_width, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_LINK_SPEED:
assign_by_type(out.pcie_link_speed, r); break;
case details::AMDGpuMetricAttributeId_t::XGMI_LINK_WIDTH:
assign_by_type(out.xgmi_link_width, r); break;
case details::AMDGpuMetricAttributeId_t::XGMI_LINK_SPEED:
assign_by_type(out.xgmi_link_speed, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_ACC:
assign_by_type(out.pcie_bandwidth_acc, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_BANDWIDTH_INST:
assign_by_type(out.pcie_bandwidth_inst, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_L0_TO_RECOV_COUNT_ACC:
assign_by_type(out.pcie_l0_to_recov_count_acc, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_COUNT_ACC:
assign_by_type(out.pcie_replay_count_acc, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_REPLAY_ROVER_COUNT_ACC:
assign_by_type(out.pcie_replay_rover_count_acc, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_NAK_SENT_COUNT_ACC:
assign_by_type(out.pcie_nak_sent_count_acc, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_NAK_RCVD_COUNT_ACC:
assign_by_type(out.pcie_nak_rcvd_count_acc, r); break;
// Residency / counters
case details::AMDGpuMetricAttributeId_t::ACCUMULATION_COUNTER:
assign_by_type(out.accumulation_counter, r); break;
case details::AMDGpuMetricAttributeId_t::PROCHOT_RESIDENCY_ACC:
assign_by_type(out.prochot_residency_acc, r); break;
case details::AMDGpuMetricAttributeId_t::PPT_RESIDENCY_ACC:
assign_by_type(out.ppt_residency_acc, r); break;
case details::AMDGpuMetricAttributeId_t::SOCKET_THM_RESIDENCY_ACC:
assign_by_type(out.socket_thm_residency_acc, r); break;
case details::AMDGpuMetricAttributeId_t::VR_THM_RESIDENCY_ACC:
assign_by_type(out.vr_thm_residency_acc, r); break;
case details::AMDGpuMetricAttributeId_t::HBM_THM_RESIDENCY_ACC:
assign_by_type(out.hbm_thm_residency_acc, r); break;
// VRAM max bandwidth
case details::AMDGpuMetricAttributeId_t::MEM_MAX_BANDWIDTH:
assign_by_type(out.vram_max_bandwidth, r); break;
// XGMI accumulators / link status (arrays)
case details::AMDGpuMetricAttributeId_t::XGMI_READ_DATA_ACC: {
assign_vector(out.xgmi_read_data_acc, r, RSMI_MAX_NUM_XGMI_LINKS); break;
}
case details::AMDGpuMetricAttributeId_t::XGMI_WRITE_DATA_ACC: {
assign_vector(out.xgmi_write_data_acc, r, RSMI_MAX_NUM_XGMI_LINKS); break;
}
case details::AMDGpuMetricAttributeId_t::XGMI_LINK_STATUS: {
assign_vector(out.xgmi_link_status, r, RSMI_MAX_NUM_XGMI_LINKS); break;
}
// Current clocks (arrays) + uclk (scalar)
case details::AMDGpuMetricAttributeId_t::CURRENT_GFXCLK: {
assign_vector(out.current_gfxclks, r, RSMI_MAX_NUM_GFX_CLKS); break;
}
case details::AMDGpuMetricAttributeId_t::CURRENT_SOCCLK: {
assign_vector(out.current_socclks, r, RSMI_MAX_NUM_CLKS); break;
}
case details::AMDGpuMetricAttributeId_t::CURRENT_VCLK0: {
assign_vector(out.current_vclk0s, r, RSMI_MAX_NUM_CLKS); break;
}
case details::AMDGpuMetricAttributeId_t::CURRENT_DCLK0: {
assign_vector(out.current_dclk0s, r, RSMI_MAX_NUM_CLKS); break;
}
case details::AMDGpuMetricAttributeId_t::CURRENT_UCLK:
assign_by_type(out.current_uclk, r); break;
case details::AMDGpuMetricAttributeId_t::PCIE_LC_PERF_OTHER_END_RECOVERY:
assign_by_type(out.pcie_lc_perf_other_end_recovery, r); break;
// XCP stats
// Only fill in entry 0
case details::AMDGpuMetricAttributeId_t::GFX_BUSY_INST: {
assign_vector(out.xcp_stats[0].gfx_busy_inst, r, RSMI_MAX_NUM_XCC); break;
}
case details::AMDGpuMetricAttributeId_t::JPEG_BUSY: {
assign_vector(out.xcp_stats[0].jpeg_busy, r, RSMI_MAX_NUM_JPEG_ENG_V1); break;
}
case details::AMDGpuMetricAttributeId_t::VCN_BUSY: {
assign_vector(out.xcp_stats[0].vcn_busy, r, RSMI_MAX_NUM_VCNS); break;
}
case details::AMDGpuMetricAttributeId_t::GFX_BUSY_ACC: {
assign_vector(out.xcp_stats[0].gfx_busy_acc, r, RSMI_MAX_NUM_XCC); break;
}
case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_PPT_ACC: {
assign_vector(out.xcp_stats[0].gfx_below_host_limit_ppt_acc, r, RSMI_MAX_NUM_XCC); break;
}
case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_THM_ACC: {
assign_vector(out.xcp_stats[0].gfx_below_host_limit_thm_acc, r, RSMI_MAX_NUM_XCC); break;
}
case details::AMDGpuMetricAttributeId_t::GFX_LOW_UTILIZATION_ACC: {
assign_vector(out.xcp_stats[0].gfx_low_utilization_acc, r, RSMI_MAX_NUM_XCC); break;
}
case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_TOTAL_ACC: {
assign_vector(out.xcp_stats[0].gfx_below_host_limit_total_acc, r, RSMI_MAX_NUM_XCC); break;
}
default: break;
}
}
out.current_gfxclk = out.current_gfxclks[0];
out.current_socclk = out.current_socclks[0];
out.current_vclk0 = out.current_vclk0s[0];
out.current_vclk1 = out.current_vclk0s[1];
out.current_dclk0 = out.current_dclk0s[0];
out.current_dclk1 = out.current_dclk0s[1];
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Success "
<< " | Returning = " << getRSMIStatusString(status_code)
<< " |";
LOG_TRACE(ss);
return std::make_tuple(status_code, out);
}
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_metrics()
{
std::ostringstream ss;
@@ -3874,6 +4353,7 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data()
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
sizeof(AMDGpuMetricsHeader_v1_t),
&m_gpu_metrics_header);
if ((status_code = ErrnoToRsmiStatus(op_result)) !=
rsmi_status_t::RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
@@ -3948,24 +4428,54 @@ rsmi_status_t Device::dev_read_gpu_metrics_all_data()
return status_code;
}
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
m_gpu_metrics_header.m_structure_size,
m_gpu_metrics_ptr->get_metrics_table().get());
if ((status_code = ErrnoToRsmiStatus(op_result)) !=
rsmi_status_t::RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << index()
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
<< " | Cause: readDevInfo(kDevGpuMetrics)"
<< " | Returning = "
<< getRSMIStatusString(status_code)
<< " Could not read Metrics Header: "
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
<< " |";
LOG_ERROR(ss);
return status_code;
// Fill the metrics object using one of two paths, selected by
// m_is_dynamic_gpu_metrics_supported (set in setup_gpu_metrics_reading() when
// the detected version flag is >= kGpuMetricV19):
//   - dynamic path: parse the gpu_metrics sysfs file into an
//     AMDGpuDynamicMetrics_t and hand it to the metrics object;
//   - legacy path: read the table through readDevInfo() into the buffer
//     returned by get_metrics_table().
if (m_is_dynamic_gpu_metrics_supported){
// NOTE(review): the path is built from index(); this assumes the DRM card
// number equals this device's index -- confirm against how other sysfs
// paths are resolved for this Device.
std::string file_name = "/sys/class/drm/card"
+ std::to_string(index())
+ "/device/gpu_metrics";
// Parse the gpu_metrics blob into schema rows (AMDGpuDynamicMetrics_t).
// The structure size from the previously read header is passed along;
// presumably it bounds the number of bytes parsed -- TODO confirm.
AMDGpuDynamicMetrics_t parsed;
rsmi_status_t st = parsed.parse_from_file(file_name, m_gpu_metrics_header.m_structure_size);
if (st != RSMI_STATUS_SUCCESS) {
// NOTE(review): the log line reports `st` as the value being returned and
// names read_dynamic_gpu_metrics_file() as the cause, but the call above is
// parse_from_file() and the status actually returned below is always
// RSMI_STATUS_UNEXPECTED_DATA, whatever `st` was.
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << index()
<< " | Cause: read_dynamic_gpu_metrics_file()"
<< " | Returning rocmsmi_status = " << getRSMIStatusString(st) << " |";
LOG_ERROR(ss);
return rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
}
// Store the parsed AMDGpuDynamicMetrics_t in the metrics object.
// NOTE(review): unchecked static_cast downcast; assumes the factory produced
// a GpuMetricsBaseDynamic_t (or derived) object whenever the dynamic flag is
// set, and that m_gpu_metrics_ptr is non-null here -- TODO confirm.
auto* dyn = static_cast<GpuMetricsBaseDynamic_t*>(m_gpu_metrics_ptr.get());
status_code = dyn->set_parsed_dynamic(std::move(parsed));
if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){
// Unlike the other failure paths in this statement, nothing is logged here.
return status_code;
}
} else {
// Legacy path: read the metrics table via readDevInfo(), passing the
// structure size from the header and the metrics object's table buffer.
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
m_gpu_metrics_header.m_structure_size,
m_gpu_metrics_ptr->get_metrics_table().get());
// op_result is converted to an rsmi_status_t; any non-success value is
// logged and returned unchanged.
if ((status_code = ErrnoToRsmiStatus(op_result)) !=
rsmi_status_t::RSMI_STATUS_SUCCESS) {
// NOTE(review): the message text says "Metrics Header" although this call
// reads into the metrics table buffer; the value printed after it is the
// header's structure size.
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << index()
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
<< " | Cause: readDevInfo(kDevGpuMetrics)"
<< " | Returning = "
<< getRSMIStatusString(status_code)
<< " Could not read Metrics Header: "
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
<< " |";
LOG_ERROR(ss);
return status_code;
}
}
// All metric units are pushed in.
@@ -4027,6 +4537,9 @@ rsmi_status_t Device::setup_gpu_metrics_reading()
return status_code;
}
m_is_dynamic_gpu_metrics_supported = (static_cast<std::underlying_type_t<AMDGpuMetricVersionFlags_t>>(gpu_metrics_flag_version) >=
static_cast<std::underlying_type_t<AMDGpuMetricVersionFlags_t>>(AMDGpuMetricVersionFlags_t::kGpuMetricV19));
//
m_gpu_metrics_ptr.reset();
m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version);
@@ -4047,7 +4560,6 @@ rsmi_status_t Device::setup_gpu_metrics_reading()
m_gpu_metrics_ptr->set_device_id(m_device_id);
m_gpu_metrics_ptr->set_partition_id(m_partition_id);
//
// m_gpu_metrics_ptr has the pointer to the proper object type/version.
status_code = dev_read_gpu_metrics_all_data();
if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {