diff --git a/projects/amdsmi/example/amd_smi_drm_example.cc b/projects/amdsmi/example/amd_smi_drm_example.cc index 076aeb3942..53a1863240 100644 --- a/projects/amdsmi/example/amd_smi_drm_example.cc +++ b/projects/amdsmi/example/amd_smi_drm_example.cc @@ -51,6 +51,7 @@ #include #include #include +#include #include "amd_smi/amdsmi.h" @@ -212,6 +213,13 @@ void getFWNameFromId(int id, char *name) } } +template +std::string print_unsigned_int(T value) { + std::stringstream ss; + ss << static_cast(value | 0); + + return ss.str(); +} int main() { amdsmi_status_t ret; @@ -655,6 +663,177 @@ int main() { << "\n\n"; std::cout << "\t\t Max Power Cap: " << cap_info.max_power_cap << "\n\n"; + + /// Get GPU Metrics info + std::cout << "\n\n"; + amdsmi_gpu_metrics_t gpu_metrics; + ret = amdsmi_get_gpu_metrics_info(processor_handles[j], &gpu_metrics); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_gpu_metrics_info:\n"); + printf("\tDevice[%d] BDF %04lx:%02x:%02x.%d\n\n", i, + bdf.fields.domain_number, + bdf.fields.bus_number, + bdf.fields.device_number, + bdf.fields.function_number); + + std::cout << "\t**.common_header.format_revision : " + << print_unsigned_int(gpu_metrics.common_header.format_revision) << "\n"; + std::cout << "\t**.common_header.content_revision : " + << print_unsigned_int(gpu_metrics.common_header.content_revision) << "\n"; + std::cout << "\t**.temperature_edge : " << std::dec + << gpu_metrics.temperature_edge << "\n"; + std::cout << "\t**.temperature_hotspot : " << std::dec + << gpu_metrics.temperature_hotspot << "\n"; + std::cout << "\t**.temperature_mem : " << std::dec + << gpu_metrics.temperature_mem << "\n"; + std::cout << "\t**.temperature_vrgfx : " << std::dec + << gpu_metrics.temperature_vrgfx << "\n"; + std::cout << "\t**.temperature_vrsoc : " << std::dec + << gpu_metrics.temperature_vrsoc << "\n"; + std::cout << "\t**.temperature_vrmem : " << std::dec + << gpu_metrics.temperature_vrmem << "\n"; + std::cout << "\t**.average_gfx_activity : " << 
std::dec + << gpu_metrics.average_gfx_activity << "\n"; + std::cout << "\t**.average_umc_activity : " << std::dec + << gpu_metrics.average_umc_activity << "\n"; + std::cout << "\t**.average_mm_activity : " << std::dec + << gpu_metrics.average_mm_activity << "\n"; + std::cout << "\t**.average_socket_power : " << std::dec + << gpu_metrics.average_socket_power << "\n"; + std::cout << "\t**.energy_accumulator : " << std::dec + << gpu_metrics.energy_accumulator << "\n"; + std::cout << "\t**.system_clock_counter : " << std::dec + << gpu_metrics.system_clock_counter << "\n"; + std::cout << "\t**.average_gfxclk_frequency : " << std::dec + << gpu_metrics.average_gfxclk_frequency << "\n"; + std::cout << "\t**.average_socclk_frequency : " << std::dec + << gpu_metrics.average_socclk_frequency << "\n"; + std::cout << "\t**.average_uclk_frequency : " << std::dec + << gpu_metrics.average_uclk_frequency << "\n"; + std::cout << "\t**.average_vclk0_frequency : " << std::dec + << gpu_metrics.average_vclk0_frequency<< "\n"; + std::cout << "\t**.average_dclk0_frequency : " << std::dec + << gpu_metrics.average_dclk0_frequency << "\n"; + std::cout << "\t**.average_vclk1_frequency : " << std::dec + << gpu_metrics.average_vclk1_frequency << "\n"; + std::cout << "\t**.average_dclk1_frequency : " << std::dec + << gpu_metrics.average_dclk1_frequency << "\n"; + std::cout << "\t**.current_gfxclk : " << std::dec + << gpu_metrics.current_gfxclk << "\n"; + std::cout << "\t**.current_socclk : " << std::dec + << gpu_metrics.current_socclk << "\n"; + std::cout << "\t**.current_uclk : " << std::dec + << gpu_metrics.current_uclk << "\n"; + std::cout << "\t**.current_vclk0 : " << std::dec + << gpu_metrics.current_vclk0 << "\n"; + std::cout << "\t**.current_dclk0 : " << std::dec + << gpu_metrics.current_dclk0 << "\n"; + std::cout << "\t**.current_vclk1 : " << std::dec + << gpu_metrics.current_vclk1 << "\n"; + std::cout << "\t**.current_dclk1 : " << std::dec + << gpu_metrics.current_dclk1 << "\n"; + 
std::cout << "\t**.throttle_status : " << std::dec + << gpu_metrics.throttle_status << "\n"; + std::cout << "\t**.current_fan_speed : " << std::dec + << gpu_metrics.current_fan_speed << "\n"; + std::cout << "\t**.pcie_link_width : " << std::dec + << gpu_metrics.pcie_link_width << "\n"; + std::cout << "\t**.pcie_link_speed : " << std::dec + << gpu_metrics.pcie_link_speed << "\n"; + std::cout << "\t**.gfx_activity_acc : " << std::dec + << gpu_metrics.gfx_activity_acc << "\n"; + std::cout << "\t**.mem_activity_acc : " << std::dec + << gpu_metrics.mem_activity_acc << "\n"; + std::cout << "\t**.firmware_timestamp : " << std::dec + << gpu_metrics.firmware_timestamp << "\n"; + std::cout << "\t**.voltage_soc : " << std::dec + << gpu_metrics.voltage_soc << "\n"; + std::cout << "\t**.voltage_gfx : " << std::dec + << gpu_metrics.voltage_gfx << "\n"; + std::cout << "\t**.voltage_mem : " << std::dec + << gpu_metrics.voltage_mem << "\n"; + std::cout << "\t**.indep_throttle_status : " << std::dec + << gpu_metrics.indep_throttle_status << "\n"; + std::cout << "\t**.current_socket_power : " << std::dec + << gpu_metrics.current_socket_power << "\n"; + std::cout << "\t**.gfxclk_lock_status : " << std::dec + << gpu_metrics.gfxclk_lock_status << "\n"; + std::cout << "\t**.xgmi_link_width : " << std::dec + << gpu_metrics.xgmi_link_width << "\n"; + std::cout << "\t**.xgmi_link_speed : " << std::dec + << gpu_metrics.xgmi_link_speed << "\n"; + std::cout << "\t**.pcie_bandwidth_acc : " << std::dec + << gpu_metrics.pcie_bandwidth_acc << "\n"; + std::cout << "\t**.pcie_bandwidth_inst : " << std::dec + << gpu_metrics.pcie_bandwidth_inst << "\n"; + std::cout << "\t**.pcie_l0_to_recov_count_acc : " << std::dec + << gpu_metrics.pcie_l0_to_recov_count_acc << "\n"; + std::cout << "\t**.pcie_replay_count_acc : " << std::dec + << gpu_metrics.pcie_replay_count_acc << "\n"; + std::cout << "\t**.pcie_replay_rover_count_acc : " << std::dec + << gpu_metrics.pcie_replay_rover_count_acc << "\n"; + + 
std::cout << "\t**.temperature_hbm[] : " << std::dec << "\n"; + for (const auto& temp : gpu_metrics.temperature_hbm) { + std::cout << "\t -> " << std::dec << temp << "\n"; + } + + std::cout << "\t**.vcn_activity[] : " << std::dec << "\n"; + for (const auto& vcn : gpu_metrics.vcn_activity) { + std::cout << "\t -> " << std::dec << vcn << "\n"; + } + + std::cout << "\t**.xgmi_read_data_acc[] : " << std::dec << "\n"; + for (const auto& read_data : gpu_metrics.xgmi_read_data_acc) { + std::cout << "\t -> " << std::dec << read_data << "\n"; + } + + std::cout << "\t**.xgmi_write_data_acc[] : " << std::dec << "\n"; + for (const auto& write_data : gpu_metrics.xgmi_write_data_acc) { + std::cout << "\t -> " << std::dec << write_data << "\n"; + } + + std::cout << "\t**.current_gfxclks[] : " << std::dec << "\n"; + for (const auto& gfxclk : gpu_metrics.current_gfxclks) { + std::cout << "\t -> " << std::dec << gfxclk << "\n"; + } + + std::cout << "\t**.current_socclks[] : " << std::dec << "\n"; + for (const auto& socclk : gpu_metrics.current_socclks) { + std::cout << "\t -> " << std::dec << socclk << "\n"; + } + + std::cout << "\t**.current_vclk0s[] : " << std::dec << "\n"; + for (const auto& vclk : gpu_metrics.current_vclk0s) { + std::cout << "\t -> " << std::dec << vclk << "\n"; + } + + std::cout << "\t**.current_dclk0s[] : " << std::dec << "\n"; + for (const auto& dclk : gpu_metrics.current_dclk0s) { + std::cout << "\t -> " << std::dec << dclk << "\n"; + } + + std::cout << "\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + amdsmi_gpu_metrics_t gpu_metrics_check; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_metrics_info(processor_handles[j], &gpu_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= 
kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_metrics_info(processor_handles[j], &gpu_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n"; + } + std::cout << "\n"; + + std::cout << "\n"; + std::cout << "\t ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n"; + std::cout << "\n"; + std::cout << "+=======+==================+============+==============" + << "+=============+=============+=============+============+\n"; } } diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index bb6af21350..bea7d5df00 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -1619,7 +1619,7 @@ typedef struct __attribute__((__packed__)){ /** * @brief Initialize the AMD SMI library * - * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} + * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} * @platform{guest_mvf} @platform{guest_windows} * * @details This function initializes the library and the internal data structures, @@ -1642,7 +1642,7 @@ amdsmi_status_t amdsmi_init(uint64_t init_flags); /** * @brief Shutdown the AMD SMI library * - * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} + * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} * @platform{guest_mvf} @platform{guest_windows} * * @details This function shuts down the library and internal data structures and @@ -1663,7 +1663,7 @@ amdsmi_status_t amdsmi_shut_down(void); /** * @brief Get the list of socket handles in the system. 
* - * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} + * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} * @platform{guest_mvf} @platform{guest_windows} * * @details Depends on what flag is passed to ::amdsmi_init. AMDSMI_INIT_AMD_GPUS @@ -1726,7 +1726,7 @@ amdsmi_status_t amdsmi_get_cpusocket_handles(uint32_t *socket_count, /** * @brief Get information about the given socket * - * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} * @platform{guest_mvf} @platform{guest_windows} * * @details This function retrieves socket information. The @p socket_handle must @@ -1818,7 +1818,7 @@ amdsmi_status_t amdsmi_get_processor_handles_by_type(amdsmi_socket_handle socket /** * @brief Get the list of the processor handles associated to a socket. * - * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} * @platform{guest_mvf} @platform{guest_windows} * * @details This function retrieves the processor handles of a socket. The @@ -1886,7 +1886,7 @@ amdsmi_status_t amdsmi_get_cpucore_handles(amdsmi_cpusocket_handle socket_handle /** * @brief Get the processor type of the processor_handle * - * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} + * @platform{gpu_bm_linux} @platform{host} @platform{cpu_bm} @platform{guest_1vf} * @platform{guest_mvf} @platform{guest_windows} * * @details This function retrieves the processor type. A processor_handle must be provided @@ -1906,7 +1906,7 @@ amdsmi_status_t amdsmi_get_processor_type(amdsmi_processor_handle processor_hand /** * @brief Get processor handle with the matching bdf. 
* - * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} * @platform{guest_mvf} @platform{guest_windows} * * @details Given bdf info @p bdf, this function will get @@ -2462,7 +2462,7 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t * @brief Returns RAS features info. * * @platform{gpu_bm_linux} @platform{host} - * + * * @param[in] processor_handle Device handle which to query * * @param[out] ras_feature RAS features that are currently enabled and supported on @@ -2635,7 +2635,7 @@ amdsmi_status_t amdsmi_get_gpu_fan_speed_max(amdsmi_processor_handle processor_h * specified temperature sensor on the specified device. It is not supported on * virtual machine guest * - * @platform{gpu_bm_linux} @platform{host} + * @platform{gpu_bm_linux} @platform{host} * * @details Given a processor handle @p processor_handle, a sensor type @p sensor_type, a * ::amdsmi_temperature_metric_t @p metric and a pointer to an int64_t @p @@ -2666,7 +2666,7 @@ amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle, /** * @brief Returns gpu cache info. 
* - * @platform{gpu_bm_linux} @platform{host} + * @platform{gpu_bm_linux} @platform{host} * * @param[in] processor_handle PF of a processor for which to query * @@ -2935,6 +2935,27 @@ amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle); amdsmi_status_t amdsmi_get_gpu_od_volt_info(amdsmi_processor_handle processor_handle, amdsmi_od_volt_freq_data_t *odv); +/** + * @brief Get the 'metrics_header_info' from the GPU metrics associated with the device + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle and a pointer to an amd_metrics_table_header_t in which + * the 'metrics_header_info' will be stored + * + * @param[in] processor_handle Device which to query + * + * @param[inout] header_value a pointer to amd_metrics_table_header_t in which the device gpu + * metrics header will be stored + * + * @retval ::AMDSMI_STATUS_SUCCESS is returned upon successful call. + * ::AMDSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit + * does not exist for the given device + * + */ +amdsmi_status_t +amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle, amd_metrics_table_header_t* header_value); + /** * @brief This function retrieves the gpu metrics information. 
It is not supported * on virtual machine guest @@ -4375,7 +4396,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( /** * @brief Returns the board part number and board information for the requested device * - * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} + * @platform{gpu_bm_linux} @platform{host} @platform{guest_1vf} @platform{guest_mvf} * * @param[in] processor_handle Device which to query * diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 88d58e5303..5587576fac 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -746,19 +746,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_lanes', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - class struct_pcie_metric_(Structure): pass @@ -777,6 +764,19 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 13), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_lanes', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -1993,6 +1993,9 @@ amdsmi_reset_gpu.argtypes = [amdsmi_processor_handle] amdsmi_get_gpu_od_volt_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_od_volt_info 
amdsmi_get_gpu_od_volt_info.restype = amdsmi_status_t amdsmi_get_gpu_od_volt_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_od_volt_freq_data_t)] +amdsmi_get_gpu_metrics_header_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_metrics_header_info +amdsmi_get_gpu_metrics_header_info.restype = amdsmi_status_t +amdsmi_get_gpu_metrics_header_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amd_metrics_table_header_t)] amdsmi_get_gpu_metrics_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_metrics_info amdsmi_get_gpu_metrics_info.restype = amdsmi_status_t amdsmi_get_gpu_metrics_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_gpu_metrics_t)] @@ -2527,6 +2530,7 @@ __all__ = \ 'amdsmi_get_gpu_id', 'amdsmi_get_gpu_memory_partition', 'amdsmi_get_gpu_memory_reserved_pages', 'amdsmi_get_gpu_memory_total', 'amdsmi_get_gpu_memory_usage', + 'amdsmi_get_gpu_metrics_header_info', 'amdsmi_get_gpu_metrics_info', 'amdsmi_get_gpu_od_volt_curve_regions', 'amdsmi_get_gpu_od_volt_info', 'amdsmi_get_gpu_overdrive_level', diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc index 1618e328a9..37b6577625 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -2685,47 +2685,42 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data() // Check if/when metrics table needs to be refreshed. 
auto now_ts = actual_timestamp_in_secs(); - if ((!m_gpu_metrics_header.m_structure_size) || - (!m_gpu_metrics_header.m_format_revision) || - (!m_gpu_metrics_header.m_content_revision)) { - auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, - sizeof(AMDGpuMetricsHeader_v1_t), - &m_gpu_metrics_header); - if ((status_code = ErrnoToRsmiStatus(op_result)) != - rsmi_status_t::RSMI_STATUS_SUCCESS) { - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Cause: readDevInfo(kDevGpuMetrics)" - << " | Returning = " - << getRSMIStatusString(status_code) - << " Could not read Metrics Header: " - << print_unsigned_int(m_gpu_metrics_header.m_structure_size) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } - if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) == - rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Cause: gpu metric file version is not supported: " - << " | Returning = " - << getRSMIStatusString(status_code) - << " Could not read Metrics Header: " - << print_unsigned_int(m_gpu_metrics_header.m_structure_size) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } - - m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); + auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, + sizeof(AMDGpuMetricsHeader_v1_t), + &m_gpu_metrics_header); + if ((status_code = ErrnoToRsmiStatus(op_result)) != + rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: readDevInfo(kDevGpuMetrics)" + << " | 
Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } + if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) == + rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: gpu metric file version is not supported: " + << " | Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -2847,23 +2842,21 @@ rsmi_status_t Device::setup_gpu_metrics_reading() } // - // if/in case setup_gpu_metrics_reading() was called already use the same pointer + m_gpu_metrics_ptr.reset(); + m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); if (!m_gpu_metrics_ptr) { - m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); - if (!m_gpu_metrics_ptr) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << 
stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } // @@ -2943,23 +2936,21 @@ rsmi_status_t Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics) // meaning, we didn't run any queries, and just want to // print all the gpu metrics content, we need to setup // the environment first. - if (!m_gpu_metrics_ptr) { - status_code = setup_gpu_metrics_reading(); - if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { - // At this point we should have a valid gpu_metrics pointer. - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } + status_code = setup_gpu_metrics_reading(); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { + // At this point we should have a valid gpu_metrics pointer. 
+ status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } // Header info @@ -3105,22 +3096,20 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ostrstream); - if (!m_gpu_metrics_ptr) { - status_code = setup_gpu_metrics_reading(); - if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { - status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; - LOG_ERROR(ostrstream); - return status_code; - } + status_code = setup_gpu_metrics_reading(); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; } // Lookup the dynamic table diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 52d069a04c..2a992742ff 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ 
b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -1108,6 +1108,17 @@ amdsmi_status_t amdsmi_get_gpu_ecc_status(amdsmi_processor_handle processor_han reinterpret_cast(state)); } +amdsmi_status_t +amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle, + amd_metrics_table_header_t *header_value) +{ + AMDSMI_CHECK_INIT(); + // nullptr api supported + + return rsmi_wrapper(rsmi_dev_metrics_header_info_get, processor_handle, + reinterpret_cast(header_value)); +} + amdsmi_status_t amdsmi_get_gpu_metrics_info( amdsmi_processor_handle processor_handle, amdsmi_gpu_metrics_t *pgpu_metrics) { diff --git a/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc index 031897da9f..ea86c2982f 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc @@ -349,6 +349,23 @@ void TestGpuMetricsRead::Run(void) { << static_cast(smu.pcie_nak_rcvd_count_acc) << "\n"; std::cout << "pcie_replay_rover_count_acc= " << std::dec << static_cast(smu.pcie_replay_rover_count_acc) << "\n"; + + // Check for constant changes/refresh metrics + std::cout << "\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + amdsmi_gpu_metrics_t gpu_metrics_check; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_metrics_info(processor_handles_[i], &gpu_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_metrics_info(processor_handles_[i], &gpu_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n"; + } + std::cout << "\n"; } }