fix: [rocm/amd_smi_lib] amdsmi_get_gpu_activity gfx/memory activity does not update
Checks and forces re-reading of the GPU metrics unconditionally (a previously read copy is no longer reused)
Code changes related to the following:
* Device::dev_log_gpu_metrics()
* amdsmi_get_gpu_metrics_header_info()
Removed unintentionally during work on 'header cleanup Remove non-unified headers'
* Examples
* Unit tests
Change-Id: I83710e173c0f7102d0b7f865c18474c979a95cd8
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
[ROCm/amdsmi commit: 78074d7d77]
Esse commit está contido em:
@@ -51,6 +51,7 @@
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
|
||||
@@ -212,6 +213,13 @@ void getFWNameFromId(int id, char *name)
|
||||
}
|
||||
}
|
||||
|
||||
/// Formats an integral value as decimal text.
///
/// The argument is promoted with `| 0` and then widened to uint64_t before
/// being streamed, so narrow types such as uint8_t are rendered as numbers
/// instead of as characters.
///
/// @param value  Integral value to format.
/// @return       Decimal representation of the widened value.
template <typename T>
std::string print_unsigned_int(T value) {
  const auto widened = static_cast<uint64_t>(value | 0);

  std::ostringstream formatter;
  formatter << widened;
  return formatter.str();
}
|
||||
|
||||
int main() {
|
||||
amdsmi_status_t ret;
|
||||
@@ -655,6 +663,177 @@ int main() {
|
||||
<< "\n\n";
|
||||
std::cout << "\t\t Max Power Cap: " << cap_info.max_power_cap
|
||||
<< "\n\n";
|
||||
|
||||
/// Get GPU Metrics info
|
||||
std::cout << "\n\n";
|
||||
amdsmi_gpu_metrics_t gpu_metrics;
|
||||
ret = amdsmi_get_gpu_metrics_info(processor_handles[j], &gpu_metrics);
|
||||
CHK_AMDSMI_RET(ret)
|
||||
printf(" Output of amdsmi_get_gpu_metrics_info:\n");
|
||||
printf("\tDevice[%d] BDF %04lx:%02x:%02x.%d\n\n", i,
|
||||
bdf.fields.domain_number,
|
||||
bdf.fields.bus_number,
|
||||
bdf.fields.device_number,
|
||||
bdf.fields.function_number);
|
||||
|
||||
std::cout << "\t**.common_header.format_revision : "
|
||||
<< print_unsigned_int(gpu_metrics.common_header.format_revision) << "\n";
|
||||
std::cout << "\t**.common_header.content_revision : "
|
||||
<< print_unsigned_int(gpu_metrics.common_header.content_revision) << "\n";
|
||||
std::cout << "\t**.temperature_edge : " << std::dec
|
||||
<< gpu_metrics.temperature_edge << "\n";
|
||||
std::cout << "\t**.temperature_hotspot : " << std::dec
|
||||
<< gpu_metrics.temperature_hotspot << "\n";
|
||||
std::cout << "\t**.temperature_mem : " << std::dec
|
||||
<< gpu_metrics.temperature_mem << "\n";
|
||||
std::cout << "\t**.temperature_vrgfx : " << std::dec
|
||||
<< gpu_metrics.temperature_vrgfx << "\n";
|
||||
std::cout << "\t**.temperature_vrsoc : " << std::dec
|
||||
<< gpu_metrics.temperature_vrsoc << "\n";
|
||||
std::cout << "\t**.temperature_vrmem : " << std::dec
|
||||
<< gpu_metrics.temperature_vrmem << "\n";
|
||||
std::cout << "\t**.average_gfx_activity : " << std::dec
|
||||
<< gpu_metrics.average_gfx_activity << "\n";
|
||||
std::cout << "\t**.average_umc_activity : " << std::dec
|
||||
<< gpu_metrics.average_umc_activity << "\n";
|
||||
std::cout << "\t**.average_mm_activity : " << std::dec
|
||||
<< gpu_metrics.average_mm_activity << "\n";
|
||||
std::cout << "\t**.average_socket_power : " << std::dec
|
||||
<< gpu_metrics.average_socket_power << "\n";
|
||||
std::cout << "\t**.energy_accumulator : " << std::dec
|
||||
<< gpu_metrics.energy_accumulator << "\n";
|
||||
std::cout << "\t**.system_clock_counter : " << std::dec
|
||||
<< gpu_metrics.system_clock_counter << "\n";
|
||||
std::cout << "\t**.average_gfxclk_frequency : " << std::dec
|
||||
<< gpu_metrics.average_gfxclk_frequency << "\n";
|
||||
std::cout << "\t**.average_socclk_frequency : " << std::dec
|
||||
<< gpu_metrics.average_socclk_frequency << "\n";
|
||||
std::cout << "\t**.average_uclk_frequency : " << std::dec
|
||||
<< gpu_metrics.average_uclk_frequency << "\n";
|
||||
std::cout << "\t**.average_vclk0_frequency : " << std::dec
|
||||
<< gpu_metrics.average_vclk0_frequency<< "\n";
|
||||
std::cout << "\t**.average_dclk0_frequency : " << std::dec
|
||||
<< gpu_metrics.average_dclk0_frequency << "\n";
|
||||
std::cout << "\t**.average_vclk1_frequency : " << std::dec
|
||||
<< gpu_metrics.average_vclk1_frequency << "\n";
|
||||
std::cout << "\t**.average_dclk1_frequency : " << std::dec
|
||||
<< gpu_metrics.average_dclk1_frequency << "\n";
|
||||
std::cout << "\t**.current_gfxclk : " << std::dec
|
||||
<< gpu_metrics.current_gfxclk << "\n";
|
||||
std::cout << "\t**.current_socclk : " << std::dec
|
||||
<< gpu_metrics.current_socclk << "\n";
|
||||
std::cout << "\t**.current_uclk : " << std::dec
|
||||
<< gpu_metrics.current_uclk << "\n";
|
||||
std::cout << "\t**.current_vclk0 : " << std::dec
|
||||
<< gpu_metrics.current_vclk0 << "\n";
|
||||
std::cout << "\t**.current_dclk0 : " << std::dec
|
||||
<< gpu_metrics.current_dclk0 << "\n";
|
||||
std::cout << "\t**.current_vclk1 : " << std::dec
|
||||
<< gpu_metrics.current_vclk1 << "\n";
|
||||
std::cout << "\t**.current_dclk1 : " << std::dec
|
||||
<< gpu_metrics.current_dclk1 << "\n";
|
||||
std::cout << "\t**.throttle_status : " << std::dec
|
||||
<< gpu_metrics.throttle_status << "\n";
|
||||
std::cout << "\t**.current_fan_speed : " << std::dec
|
||||
<< gpu_metrics.current_fan_speed << "\n";
|
||||
std::cout << "\t**.pcie_link_width : " << std::dec
|
||||
<< gpu_metrics.pcie_link_width << "\n";
|
||||
std::cout << "\t**.pcie_link_speed : " << std::dec
|
||||
<< gpu_metrics.pcie_link_speed << "\n";
|
||||
std::cout << "\t**.gfx_activity_acc : " << std::dec
|
||||
<< gpu_metrics.gfx_activity_acc << "\n";
|
||||
std::cout << "\t**.mem_activity_acc : " << std::dec
|
||||
<< gpu_metrics.mem_activity_acc << "\n";
|
||||
std::cout << "\t**.firmware_timestamp : " << std::dec
|
||||
<< gpu_metrics.firmware_timestamp << "\n";
|
||||
std::cout << "\t**.voltage_soc : " << std::dec
|
||||
<< gpu_metrics.voltage_soc << "\n";
|
||||
std::cout << "\t**.voltage_gfx : " << std::dec
|
||||
<< gpu_metrics.voltage_gfx << "\n";
|
||||
std::cout << "\t**.voltage_mem : " << std::dec
|
||||
<< gpu_metrics.voltage_mem << "\n";
|
||||
std::cout << "\t**.indep_throttle_status : " << std::dec
|
||||
<< gpu_metrics.indep_throttle_status << "\n";
|
||||
std::cout << "\t**.current_socket_power : " << std::dec
|
||||
<< gpu_metrics.current_socket_power << "\n";
|
||||
std::cout << "\t**.gfxclk_lock_status : " << std::dec
|
||||
<< gpu_metrics.gfxclk_lock_status << "\n";
|
||||
std::cout << "\t**.xgmi_link_width : " << std::dec
|
||||
<< gpu_metrics.xgmi_link_width << "\n";
|
||||
std::cout << "\t**.xgmi_link_speed : " << std::dec
|
||||
<< gpu_metrics.xgmi_link_speed << "\n";
|
||||
std::cout << "\t**.pcie_bandwidth_acc : " << std::dec
|
||||
<< gpu_metrics.pcie_bandwidth_acc << "\n";
|
||||
std::cout << "\t**.pcie_bandwidth_inst : " << std::dec
|
||||
<< gpu_metrics.pcie_bandwidth_inst << "\n";
|
||||
std::cout << "\t**.pcie_l0_to_recov_count_acc : " << std::dec
|
||||
<< gpu_metrics.pcie_l0_to_recov_count_acc << "\n";
|
||||
std::cout << "\t**.pcie_replay_count_acc : " << std::dec
|
||||
<< gpu_metrics.pcie_replay_count_acc << "\n";
|
||||
std::cout << "\t**.pcie_replay_rover_count_acc : " << std::dec
|
||||
<< gpu_metrics.pcie_replay_rover_count_acc << "\n";
|
||||
|
||||
std::cout << "\t**.temperature_hbm[] : " << std::dec << "\n";
|
||||
for (const auto& temp : gpu_metrics.temperature_hbm) {
|
||||
std::cout << "\t -> " << std::dec << temp << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\t**.vcn_activity[] : " << std::dec << "\n";
|
||||
for (const auto& vcn : gpu_metrics.vcn_activity) {
|
||||
std::cout << "\t -> " << std::dec << vcn << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\t**.xgmi_read_data_acc[] : " << std::dec << "\n";
|
||||
for (const auto& read_data : gpu_metrics.xgmi_read_data_acc) {
|
||||
std::cout << "\t -> " << std::dec << read_data << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\t**.xgmi_write_data_acc[] : " << std::dec << "\n";
|
||||
for (const auto& write_data : gpu_metrics.xgmi_write_data_acc) {
|
||||
std::cout << "\t -> " << std::dec << write_data << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\t**.current_gfxclks[] : " << std::dec << "\n";
|
||||
for (const auto& gfxclk : gpu_metrics.current_gfxclks) {
|
||||
std::cout << "\t -> " << std::dec << gfxclk << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\t**.current_socclks[] : " << std::dec << "\n";
|
||||
for (const auto& socclk : gpu_metrics.current_socclks) {
|
||||
std::cout << "\t -> " << std::dec << socclk << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\t**.current_vclk0s[] : " << std::dec << "\n";
|
||||
for (const auto& vclk : gpu_metrics.current_vclk0s) {
|
||||
std::cout << "\t -> " << std::dec << vclk << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\t**.current_dclk0s[] : " << std::dec << "\n";
|
||||
for (const auto& dclk : gpu_metrics.current_dclk0s) {
|
||||
std::cout << "\t -> " << std::dec << dclk << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
|
||||
constexpr uint16_t kMAX_ITER_TEST = 10;
|
||||
amdsmi_gpu_metrics_t gpu_metrics_check;
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
amdsmi_get_gpu_metrics_info(processor_handles[j], &gpu_metrics_check);
|
||||
std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
amdsmi_get_gpu_metrics_info(processor_handles[j], &gpu_metrics_check);
|
||||
std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n";
|
||||
}
|
||||
std::cout << "\n";
|
||||
|
||||
std::cout << "\n";
|
||||
std::cout << "\t ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n";
|
||||
std::cout << "\n";
|
||||
std::cout << "+=======+==================+============+=============="
|
||||
<< "+=============+=============+=============+============+\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1619,7 +1619,7 @@ typedef struct __attribute__((__packed__)){
|
||||
/**
|
||||
* @brief Initialize the AMD SMI library
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details This function initializes the library and the internal data structures,
|
||||
@@ -1642,7 +1642,7 @@ amdsmi_status_t amdsmi_init(uint64_t init_flags);
|
||||
/**
|
||||
* @brief Shutdown the AMD SMI library
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details This function shuts down the library and internal data structures and
|
||||
@@ -1663,7 +1663,7 @@ amdsmi_status_t amdsmi_shut_down(void);
|
||||
/**
|
||||
* @brief Get the list of socket handles in the system.
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details Depends on what flag is passed to ::amdsmi_init. AMDSMI_INIT_AMD_GPUS
|
||||
@@ -1726,7 +1726,7 @@ amdsmi_status_t amdsmi_get_cpusocket_handles(uint32_t *socket_count,
|
||||
/**
|
||||
* @brief Get information about the given socket
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details This function retrieves socket information. The @p socket_handle must
|
||||
@@ -1818,7 +1818,7 @@ amdsmi_status_t amdsmi_get_processor_handles_by_type(amdsmi_socket_handle socket
|
||||
/**
|
||||
* @brief Get the list of the processor handles associated to a socket.
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details This function retrieves the processor handles of a socket. The
|
||||
@@ -1886,7 +1886,7 @@ amdsmi_status_t amdsmi_get_cpucore_handles(amdsmi_cpusocket_handle socket_handle
|
||||
/**
|
||||
* @brief Get the processor type of the processor_handle
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details This function retrieves the processor type. A processor_handle must be provided
|
||||
@@ -1906,7 +1906,7 @@ amdsmi_status_t amdsmi_get_processor_type(amdsmi_processor_handle processor_hand
|
||||
/**
|
||||
* @brief Get processor handle with the matching bdf.
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf}
|
||||
* @platform{guest_mvf} @platform{guest_windows}
|
||||
*
|
||||
* @details Given bdf info @p bdf, this function will get
|
||||
@@ -2462,7 +2462,7 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
|
||||
* @brief Returns RAS features info.
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
*
|
||||
* @param[in] processor_handle Device handle which to query
|
||||
*
|
||||
* @param[out] ras_feature RAS features that are currently enabled and supported on
|
||||
@@ -2635,7 +2635,7 @@ amdsmi_status_t amdsmi_get_gpu_fan_speed_max(amdsmi_processor_handle processor_h
|
||||
* specified temperature sensor on the specified device. It is not supported on
|
||||
* virtual machine guest
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @details Given a processor handle @p processor_handle, a sensor type @p sensor_type, a
|
||||
* ::amdsmi_temperature_metric_t @p metric and a pointer to an int64_t @p
|
||||
@@ -2666,7 +2666,7 @@ amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle,
|
||||
/**
|
||||
* @brief Returns gpu cache info.
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
* @platform{gpu_bm_linux} @platform{host}
|
||||
*
|
||||
* @param[in] processor_handle PF of a processor for which to query
|
||||
*
|
||||
@@ -2935,6 +2935,27 @@ amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle);
|
||||
amdsmi_status_t amdsmi_get_gpu_od_volt_info(amdsmi_processor_handle processor_handle,
|
||||
amdsmi_od_volt_freq_data_t *odv);
|
||||
|
||||
/**
|
||||
* @brief Get the 'metrics_header_info' from the GPU metrics associated with the device
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{guest_1vf}
|
||||
*
|
||||
* @details Given a processor handle @p processor_handle and a pointer to an amd_metrics_table_header_t in which
|
||||
* the 'metrics_header_info' will be stored, this function retrieves that header for the device
|
||||
*
|
||||
* @param[in] processor_handle Device which to query
|
||||
*
|
||||
* @param[inout] header_value a pointer to amd_metrics_table_header_t to which the device gpu
|
||||
* metrics header will be stored
|
||||
*
|
||||
* @retval ::AMDSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
* ::AMDSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit
|
||||
* does not exist for the given device
|
||||
*
|
||||
*/
|
||||
amdsmi_status_t
|
||||
amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle, amd_metrics_table_header_t* header_value);
|
||||
|
||||
/**
|
||||
* @brief This function retrieves the gpu metrics information. It is not supported
|
||||
* on virtual machine guest
|
||||
@@ -4375,7 +4396,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
/**
|
||||
* @brief Returns the board part number and board information for the requested device
|
||||
*
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf}
|
||||
* @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf}
|
||||
*
|
||||
* @param[in] processor_handle Device which to query
|
||||
*
|
||||
|
||||
@@ -746,19 +746,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
|
||||
class struct_amdsmi_pcie_info_t(Structure):
|
||||
pass
|
||||
|
||||
class struct_pcie_static_(Structure):
|
||||
pass
|
||||
|
||||
struct_pcie_static_._pack_ = 1 # source:False
|
||||
struct_pcie_static_._fields_ = [
|
||||
('max_pcie_lanes', ctypes.c_uint16),
|
||||
('PADDING_0', ctypes.c_ubyte * 2),
|
||||
('max_pcie_speed', ctypes.c_uint32),
|
||||
('pcie_interface_version', ctypes.c_uint32),
|
||||
('slot_type', amdsmi_card_form_factor_t),
|
||||
('reserved', ctypes.c_uint64 * 10),
|
||||
]
|
||||
|
||||
class struct_pcie_metric_(Structure):
|
||||
pass
|
||||
|
||||
@@ -777,6 +764,19 @@ struct_pcie_metric_._fields_ = [
|
||||
('reserved', ctypes.c_uint64 * 13),
|
||||
]
|
||||
|
||||
class struct_pcie_static_(Structure):
|
||||
pass
|
||||
|
||||
struct_pcie_static_._pack_ = 1 # source:False
|
||||
struct_pcie_static_._fields_ = [
|
||||
('max_pcie_lanes', ctypes.c_uint16),
|
||||
('PADDING_0', ctypes.c_ubyte * 2),
|
||||
('max_pcie_speed', ctypes.c_uint32),
|
||||
('pcie_interface_version', ctypes.c_uint32),
|
||||
('slot_type', amdsmi_card_form_factor_t),
|
||||
('reserved', ctypes.c_uint64 * 10),
|
||||
]
|
||||
|
||||
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_pcie_info_t._fields_ = [
|
||||
('pcie_static', struct_pcie_static_),
|
||||
@@ -1993,6 +1993,9 @@ amdsmi_reset_gpu.argtypes = [amdsmi_processor_handle]
|
||||
amdsmi_get_gpu_od_volt_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_od_volt_info
|
||||
amdsmi_get_gpu_od_volt_info.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_od_volt_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_od_volt_freq_data_t)]
|
||||
amdsmi_get_gpu_metrics_header_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_metrics_header_info
|
||||
amdsmi_get_gpu_metrics_header_info.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_metrics_header_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amd_metrics_table_header_t)]
|
||||
amdsmi_get_gpu_metrics_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_metrics_info
|
||||
amdsmi_get_gpu_metrics_info.restype = amdsmi_status_t
|
||||
amdsmi_get_gpu_metrics_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_gpu_metrics_t)]
|
||||
@@ -2527,6 +2530,7 @@ __all__ = \
|
||||
'amdsmi_get_gpu_id', 'amdsmi_get_gpu_memory_partition',
|
||||
'amdsmi_get_gpu_memory_reserved_pages',
|
||||
'amdsmi_get_gpu_memory_total', 'amdsmi_get_gpu_memory_usage',
|
||||
'amdsmi_get_gpu_metrics_header_info',
|
||||
'amdsmi_get_gpu_metrics_info',
|
||||
'amdsmi_get_gpu_od_volt_curve_regions',
|
||||
'amdsmi_get_gpu_od_volt_info', 'amdsmi_get_gpu_overdrive_level',
|
||||
|
||||
@@ -2685,47 +2685,42 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data()
|
||||
|
||||
// Check if/when metrics table needs to be refreshed.
|
||||
auto now_ts = actual_timestamp_in_secs();
|
||||
if ((!m_gpu_metrics_header.m_structure_size) ||
|
||||
(!m_gpu_metrics_header.m_format_revision) ||
|
||||
(!m_gpu_metrics_header.m_content_revision)) {
|
||||
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
|
||||
sizeof(AMDGpuMetricsHeader_v1_t),
|
||||
&m_gpu_metrics_header);
|
||||
if ((status_code = ErrnoToRsmiStatus(op_result)) !=
|
||||
rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: readDevInfo(kDevGpuMetrics)"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) ==
|
||||
rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: gpu metric file version is not supported: "
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs();
|
||||
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
|
||||
sizeof(AMDGpuMetricsHeader_v1_t),
|
||||
&m_gpu_metrics_header);
|
||||
if ((status_code = ErrnoToRsmiStatus(op_result)) !=
|
||||
rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: readDevInfo(kDevGpuMetrics)"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) ==
|
||||
rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) {
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header)
|
||||
<< " | Cause: gpu metric file version is not supported: "
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " Could not read Metrics Header: "
|
||||
<< print_unsigned_int(m_gpu_metrics_header.m_structure_size)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs();
|
||||
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
@@ -2847,23 +2842,21 @@ rsmi_status_t Device::setup_gpu_metrics_reading()
|
||||
}
|
||||
|
||||
//
|
||||
// if/in case setup_gpu_metrics_reading() was called already use the same pointer
|
||||
m_gpu_metrics_ptr.reset();
|
||||
m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version);
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version);
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
//
|
||||
@@ -2943,23 +2936,21 @@ rsmi_status_t Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics)
|
||||
// meaning, we didn't run any queries, and just want to
|
||||
// print all the gpu metrics content, we need to setup
|
||||
// the environment first.
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
// At this point we should have a valid gpu_metrics pointer.
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
// At this point we should have a valid gpu_metrics pointer.
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
// Header info
|
||||
@@ -3105,22 +3096,20 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met
|
||||
ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ostrstream);
|
||||
|
||||
if (!m_gpu_metrics_ptr) {
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
status_code = setup_gpu_metrics_reading();
|
||||
if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) {
|
||||
status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA;
|
||||
ostrstream << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << index()
|
||||
<< " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header())
|
||||
<< " | Cause: Couldn't get a valid metric object"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code)
|
||||
<< " |";
|
||||
LOG_ERROR(ostrstream);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
// Lookup the dynamic table
|
||||
|
||||
@@ -1108,6 +1108,17 @@ amdsmi_status_t amdsmi_get_gpu_ecc_status(amdsmi_processor_handle processor_han
|
||||
reinterpret_cast<rsmi_ras_err_state_t*>(state));
|
||||
}
|
||||
|
||||
/// Retrieves the GPU metrics table header for the given processor.
///
/// AMDSMI_CHECK_INIT() runs first (presumably rejecting the call when the
/// library has not been initialised -- confirm against the macro). The request
/// is then forwarded to rsmi_dev_metrics_header_info_get via rsmi_wrapper.
///
/// @param processor_handle  Processor to query.
/// @param header_value      Destination for the header. It is not null-checked
///                          here; a nullptr is passed through to the wrapped
///                          call ("nullptr api supported").
/// @return                  Whatever status rsmi_wrapper reports.
amdsmi_status_t
amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle,
                                   amd_metrics_table_header_t *header_value)
{
    AMDSMI_CHECK_INIT();

    // nullptr api supported: the pointer is forwarded as-is, only re-typed
    // to the rocm_smi header structure expected by the wrapped function.
    auto *rsmi_header = reinterpret_cast<metrics_table_header_t*>(header_value);

    return rsmi_wrapper(rsmi_dev_metrics_header_info_get, processor_handle,
                        rsmi_header);
}
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_metrics_info(
|
||||
amdsmi_processor_handle processor_handle,
|
||||
amdsmi_gpu_metrics_t *pgpu_metrics) {
|
||||
|
||||
@@ -349,6 +349,23 @@ void TestGpuMetricsRead::Run(void) {
|
||||
<< static_cast<uint32_t>(smu.pcie_nak_rcvd_count_acc) << "\n";
|
||||
std::cout << "pcie_replay_rover_count_acc= " << std::dec
|
||||
<< static_cast<uint64_t>(smu.pcie_replay_rover_count_acc) << "\n";
|
||||
|
||||
// Check for constant changes/refresh metrics
|
||||
std::cout << "\n";
|
||||
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
|
||||
constexpr uint16_t kMAX_ITER_TEST = 10;
|
||||
amdsmi_gpu_metrics_t gpu_metrics_check;
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
amdsmi_get_gpu_metrics_info(processor_handles_[i], &gpu_metrics_check);
|
||||
std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n";
|
||||
for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) {
|
||||
amdsmi_get_gpu_metrics_info(processor_handles_[i], &gpu_metrics_check);
|
||||
std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n";
|
||||
}
|
||||
std::cout << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário