diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 0fb959ad07..c5bb82ce3a 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -73,6 +73,7 @@ endif() ## Compiler flags set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -m64 -msse -msse2 ") + # Security options set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion -Wcast-align ") @@ -148,6 +149,7 @@ set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_exception.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_counters.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_kfd.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_io_link.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_gpu_metrics.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_logger.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_properties.h") diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 7817acd6ea..900dcee0f2 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -929,12 +929,10 @@ struct metrics_table_header_t { * @brief The GPU metrics version 3 */ #define RSMI_GPU_METRICS_API_CONTENT_VER_3 3 - /** * @brief This should match NUM_HBM_INSTANCES */ #define RSMI_NUM_HBM_INSTANCES 4 - /** * @brief Unit conversion factor for HBM temperatures */ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index 7a28e1327c..0880350a3e 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -60,6 +60,7 @@ #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_counters.h" #include 
"rocm_smi/rocm_smi_properties.h" +#include "rocm_smi/rocm_smi_gpu_metrics.h" #include "shared_mutex.h" //NOLINT namespace amd { @@ -228,6 +229,18 @@ class Device { template std::string readBootPartitionState(uint32_t dv_ind); rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type); + void dev_set_gpu_metric(GpuMetricsBasePtr gpu_metrics_ptr) { m_gpu_metrics_ptr = gpu_metrics_ptr; }; + GpuMetricsBasePtr& dev_get_gpu_metric() { return m_gpu_metrics_ptr; }; + const AMDGpuMetricsHeader_v1_t& dev_get_metrics_header() {return m_gpu_metrics_header; } + rsmi_status_t setup_gpu_metrics_reading(); + rsmi_status_t dev_read_gpu_metrics_header_data(); + rsmi_status_t dev_read_gpu_metrics_all_data(); + rsmi_status_t dev_log_gpu_metrics(); + rsmi_status_t run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, AMDGpuDynamicMetricTblValues_t& values); + + template + rsmi_status_t dev_run_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, T& metric_value); + private: std::shared_ptr monitor_; @@ -249,6 +262,8 @@ class Device { void *p_binary_data); int writeDevInfoStr(DevInfoTypes type, std::string valStr); rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query); + + uint64_t bdfid_; uint64_t kfd_gpu_id_; std::unordered_set, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef ROCM_SMI_ROCM_SMI_GPU_METRICS_H_ +#define ROCM_SMI_ROCM_SMI_GPU_METRICS_H_ + +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" + +#include +#include +#include +#include +#include + + +/** + * All 1.4 and newer GPU metrics are now defined in this header. + * + */ +namespace amd::smi +{ + +constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MAJOR_VER_1 = 1; +constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_1 = 1; +constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_2 = 2; +constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_3 = 3; +constexpr uint32_t kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4 = 4; +// Newest major/minor content revision this header describes. +constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MAJOR_VER = kRSMI_GPU_METRICS_API_CONTENT_MAJOR_VER_1; +constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINOR_VER = kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_4; +// Misspelled legacy name ("MINON"); kept as an alias so existing references keep compiling. +constexpr uint32_t kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINON_VER = kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINOR_VER; + + +// Note: As gpu metrics are updating +constexpr uint32_t kRSMI_GPU_METRICS_EXPIRATION_SECS = 5; + +// Note: This *must* match NUM_HBM_INSTANCES +constexpr uint32_t kRSMI_MAX_NUM_HBM_INSTANCES = 4; + +// Note: This *must* match NUM_XGMI_LINKS +constexpr uint32_t kRSMI_MAX_NUM_XGMI_LINKS = 8; + +// Note: This *must* match MAX_GFX_CLKS +constexpr uint32_t kRSMI_MAX_NUM_GFX_CLKS = 8; + +// Note: This *must* match MAX_CLKS +constexpr uint32_t kRSMI_MAX_NUM_CLKS = 4; + +// Note: This *must* match NUM_VCN +constexpr uint32_t kRSMI_MAX_NUM_VCN = 4; + + +struct AMDGpuMetricsHeader_v1_t +{ + uint16_t m_structure_size; + uint8_t m_format_revision; + uint8_t m_content_revision; +}; + + +struct AMDGpuMetricsBase_t; +using AMDGpuMetricsBaseRef = AMDGpuMetricsBase_t&; +struct AMDGpuMetricsBase_t +{ + virtual 
~AMDGpuMetricsBase_t() = default; +}; + +struct AMDGpuMetrics_v11_t : AMDGpuMetricsBase_t +{ + ~AMDGpuMetrics_v11_t() = default; + + struct AMDGpuMetricsHeader_v1_t m_common_header; + + // Temperature + uint16_t m_temperature_edge; + uint16_t m_temperature_hotspot; + uint16_t m_temperature_mem; + uint16_t m_temperature_vrgfx; + uint16_t m_temperature_vrsoc; + uint16_t m_temperature_vrmem; + + // Utilization + uint16_t m_average_gfx_activity; + uint16_t m_average_umc_activity; // memory controller + uint16_t m_average_mm_activity; // UVD or VCN + + // Power/Energy + uint16_t m_average_socket_power; + uint64_t m_energy_accumulator; + + // Driver attached timestamp (in ns) + uint64_t m_system_clock_counter; + + // Average clocks + uint16_t m_average_gfxclk_frequency; + uint16_t m_average_socclk_frequency; + uint16_t m_average_uclk_frequency; + uint16_t m_average_vclk0_frequency; + uint16_t m_average_dclk0_frequency; + uint16_t m_average_vclk1_frequency; + uint16_t m_average_dclk1_frequency; + + // Current clocks + uint16_t m_current_gfxclk; + uint16_t m_current_socclk; + uint16_t m_current_uclk; + uint16_t m_current_vclk0; + uint16_t m_current_dclk0; + uint16_t m_current_vclk1; + uint16_t m_current_dclk1; + + // Throttle status + uint32_t m_throttle_status; + + // Fans + uint16_t m_current_fan_speed; + + // Link width/speed + uint16_t m_pcie_link_width; + uint16_t m_pcie_link_speed; // in 0.1 GT/s + + uint16_t m_padding; + + uint32_t m_gfx_activity_acc; + uint32_t m_mem_activity_acc; + + uint16_t m_temperature_hbm[kRSMI_MAX_NUM_HBM_INSTANCES]; +}; + +struct AMDGpuMetrics_v12_t : AMDGpuMetricsBase_t +{ + ~AMDGpuMetrics_v12_t() = default; + + struct AMDGpuMetricsHeader_v1_t m_common_header; + + // Temperature + uint16_t m_temperature_edge; + uint16_t m_temperature_hotspot; + uint16_t m_temperature_mem; + uint16_t m_temperature_vrgfx; + uint16_t m_temperature_vrsoc; + uint16_t m_temperature_vrmem; + + // Utilization + uint16_t m_average_gfx_activity; + uint16_t 
m_average_umc_activity; // memory controller + uint16_t m_average_mm_activity; // UVD or VCN + + // Power/Energy + uint16_t m_average_socket_power; + uint64_t m_energy_accumulator; // v1 mod. (32->64) + + // Driver attached timestamp (in ns) + uint64_t m_system_clock_counter; // v1 mod. (moved from top of struct) + + // Average clocks + uint16_t m_average_gfxclk_frequency; + uint16_t m_average_socclk_frequency; + uint16_t m_average_uclk_frequency; + uint16_t m_average_vclk0_frequency; + uint16_t m_average_dclk0_frequency; + uint16_t m_average_vclk1_frequency; + uint16_t m_average_dclk1_frequency; + + // Current clocks + uint16_t m_current_gfxclk; + uint16_t m_current_socclk; + uint16_t m_current_uclk; + uint16_t m_current_vclk0; + uint16_t m_current_dclk0; + uint16_t m_current_vclk1; + uint16_t m_current_dclk1; + + // Throttle status + uint32_t m_throttle_status; + + // Fans + uint16_t m_current_fan_speed; + + // Link width/speed + uint16_t m_pcie_link_width; // v1 mod.(8->16) + uint16_t m_pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16) + + uint16_t m_padding; // new in v1 + + uint32_t m_gfx_activity_acc; // new in v1 + uint32_t m_mem_activity_acc; // new in v1 + uint16_t m_temperature_hbm[kRSMI_MAX_NUM_HBM_INSTANCES]; // new in v1 + + // PMFW attached timestamp (10ns resolution) + uint64_t m_firmware_timestamp; +}; + +struct AMDGpuMetrics_v13_t : AMDGpuMetricsBase_t +{ + ~AMDGpuMetrics_v13_t() = default; + + struct AMDGpuMetricsHeader_v1_t m_common_header; + + // Temperature + uint16_t m_temperature_edge; + uint16_t m_temperature_hotspot; + uint16_t m_temperature_mem; + uint16_t m_temperature_vrgfx; + uint16_t m_temperature_vrsoc; + uint16_t m_temperature_vrmem; + + // Utilization + uint16_t m_average_gfx_activity; + uint16_t m_average_umc_activity; // memory controller + uint16_t m_average_mm_activity; // UVD or VCN + + // Power/Energy + uint16_t m_average_socket_power; + uint64_t m_energy_accumulator; // v1 mod. 
(32->64) + + // Driver attached timestamp (in ns) + uint64_t m_system_clock_counter; // v1 mod. (moved from top of struct) + + // Average clocks + uint16_t m_average_gfxclk_frequency; + uint16_t m_average_socclk_frequency; + uint16_t m_average_uclk_frequency; + uint16_t m_average_vclk0_frequency; + uint16_t m_average_dclk0_frequency; + uint16_t m_average_vclk1_frequency; + uint16_t m_average_dclk1_frequency; + + // Current clocks + uint16_t m_current_gfxclk; + uint16_t m_current_socclk; + uint16_t m_current_uclk; + uint16_t m_current_vclk0; + uint16_t m_current_dclk0; + uint16_t m_current_vclk1; + uint16_t m_current_dclk1; + + // Throttle status + uint32_t m_throttle_status; + + // Fans + uint16_t m_current_fan_speed; + + // Link width/speed + uint16_t m_pcie_link_width; // v1 mod.(8->16) + uint16_t m_pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16) + + uint16_t m_padding; // new in v1 + + uint32_t m_gfx_activity_acc; // new in v1 + uint32_t m_mem_activity_acc; // new in v1 + uint16_t m_temperature_hbm[kRSMI_MAX_NUM_HBM_INSTANCES]; // new in v1 + + // PMFW attached timestamp (10ns resolution) + uint64_t m_firmware_timestamp; + + // Voltage (mV) + uint16_t m_voltage_soc; + uint16_t m_voltage_gfx; + uint16_t m_voltage_mem; + + uint16_t m_padding1; + + // Throttle status + uint64_t m_indep_throttle_status; +}; + +struct AMDGpuMetrics_v14_t : AMDGpuMetricsBase_t +{ + ~AMDGpuMetrics_v14_t() = default; + + struct AMDGpuMetricsHeader_v1_t m_common_header; + + // Temperature (Celsius). It will be zero (0) if unsupported. 
+ uint16_t m_temperature_hotspot; + uint16_t m_temperature_mem; + uint16_t m_temperature_vrsoc; + + // Power (Watts) + uint16_t m_curr_socket_power; + + // Utilization (%) + uint16_t m_average_gfx_activity; + uint16_t m_average_umc_activity; // memory controller + uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode) + + // Energy (15.259uJ (2^-16) units) + uint64_t m_energy_accumulator; + + // Driver attached timestamp (in ns) + uint64_t m_system_clock_counter; + + // Throttle status + uint32_t m_throttle_status; + + // Clock Lock Status. Each bit corresponds to clock instance + uint32_t m_gfxclk_lock_status; + + // Link width (number of lanes) and speed (in 0.1 GT/s) + uint16_t m_pcie_link_width; + uint16_t m_pcie_link_speed; // in 0.1 GT/s + + // XGMI bus width and bitrate (in Gbps) + uint16_t m_xgmi_link_width; + uint16_t m_xgmi_link_speed; + + // Utilization Accumulated (%) + uint32_t m_gfx_activity_acc; + uint32_t m_mem_activity_acc; + + // PCIE accumulated bandwidth (GB/sec) + uint64_t m_pcie_bandwidth_acc; + + // PCIE instantaneous bandwidth (GB/sec) + uint64_t m_pcie_bandwidth_inst; + + // XGMI accumulated data transfer size(KiloBytes) + uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + + // PMFW attached timestamp (10ns resolution) + uint64_t m_firmware_timestamp; + + // Current clocks (Mhz) + uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS]; + uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_uclk; + + uint16_t m_padding; +}; +using AMGpuMetricsLatest_t = AMDGpuMetrics_v14_t; + + +using GPUMetricTempHbm_t = decltype(AMDGpuMetrics_v13_t::m_temperature_hbm); +using GPUMetricTempHbmTbl_t = std::array; + +using GPUMetricVcnActivity_t = decltype(AMDGpuMetrics_v14_t::m_vcn_activity); +using GPUMetricVcnActivityTbl_t = std::array; + +using 
GPUMetricXgmiReadDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_read_data_acc); +using GPUMetricXgmiWriteDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_write_data_acc); +using GPUMetricXgmiAccTbl_t = std::array; + +using GPUMetricCurrGfxClk_t = decltype(AMDGpuMetrics_v14_t::m_current_gfxclk); +using GPUMetricCurrGfxClkTbl_t = std::array; + +using GPUMetricCurrSocClk_t = decltype(AMDGpuMetrics_v14_t::m_current_socclk); +using GPUMetricCurrSocClkTbl_t = std::array; + +using GPUMetricCurrVClk0_t = decltype(AMDGpuMetrics_v14_t::m_current_vclk0); +using GPUMetricCurrVClkTbl_t = std::array; + +using GPUMetricCurrDClk0_t = decltype(AMDGpuMetrics_v14_t::m_current_dclk0); +using GPUMetricCurrDClkTbl_t = std::array; + + +/* + * When a new metric table is released, we have to update: * + 1. Constants related to the new metrics added; + (ie: kRSMI_MAX_NUM_XGMI_LINKS) + 2. Constants related to new version: + (ie: kRSMI_GPU_METRICS_API_CONTENT_MAJOR_VER_1) + (ie: kRSMI_GPU_METRICS_API_CONTENT_MINOR_VER_x) + (ie: kRSMI_LATEST_GPU_METRICS_API_CONTENT_MAJOR_VER) + (ie: kRSMI_LATEST_GPU_METRICS_API_CONTENT_MINOR_VER) + 3. Check if still use the same existing header or if a new one is needed: + (ie: AMDGpuMetricsHeader_v1_t) + 4. Create a new struct representing the new table format + (ie: AMDGpuMetrics_v13_t -> AMDGpuMetrics_v14_t) + 5. AMGpuMetricsLatest_t -> Newest AMDGpuMetrics_v1x_t + 6. 
AMDGpuMetricVersionFlags_t + (ie: AMDGpuMetricVersionFlags_t::kGpuMetricV14) +*/ + +using AMDGpuMetricTypeId_t = uint32_t; +using AMDGpuMetricTypeIdSeq_t = uint32_t; +using AMDGpuMetricVersionFlagId_t = uint32_t; + +enum class AMDGpuMetricsClassId_t : AMDGpuMetricTypeId_t +{ + kGpuMetricHeader = 0, + kGpuMetricTemperature, + kGpuMetricUtilization, + kGpuMetricPowerEnergy, + kGpuMetricSystemClockCounter, + kGpuMetricAverageClock, + kGpuMetricCurrentClock, + kGpuMetricThrottleStatus, + kGpuMetricGfxClkLockStatus, + kGpuMetricCurrentFanSpeed, + kGpuMetricLinkWidthSpeed, + kGpuMetricVoltage, + kGpuMetricTimestamp, +}; +using AMDGpuMetricsClassIdTranslationTbl_t = std::map; + +enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t +{ + // kGpuMetricTemperature counters + kMetricTempEdge, + kMetricTempHotspot, + kMetricTempMem, + kMetricTempVrGfx, + kMetricTempVrSoc, + kMetricTempVrMem, + kMetricTempHbm, + + // kGpuMetricUtilization counters + kMetricAvgGfxActivity, + kMetricAvgUmcActivity, + kMetricAvgMmActivity, + kMetricGfxActivityAccumulator, + kMetricMemActivityAccumulator, + kMetricVcnActivity, + + // kGpuMetricAverageClock counters + kMetricAvgGfxClockFrequency, + kMetricAvgSocClockFrequency, + kMetricAvgUClockFrequency, + kMetricAvgVClock0Frequency, + kMetricAvgDClock0Frequency, + kMetricAvgVClock1Frequency, + kMetricAvgDClock1Frequency, + + // kGpuMetricCurrentClock counters + kMetricCurrGfxClock, + kMetricCurrSocClock, + kMetricCurrUClock, + kMetricCurrVClock0, + kMetricCurrDClock0, + kMetricCurrVClock1, + kMetricCurrDClock1, + + // kGpuMetricThrottleStatus counters + kMetricThrottleStatus, + kMetricIndepThrottleStatus, + + // kGpuMetricGfxClkLockStatus counters + kMetricGfxClkLockStatus, + + // kGpuMetricCurrentFanSpeed counters + kMetricCurrFanSpeed, + + // kGpuMetricLinkWidthSpeed counters + kMetricPcieLinkWidth, + kMetricPcieLinkSpeed, + kMetricPcieBandwidthAccumulator, + kMetricPcieBandwidthInst, + kMetricXgmiLinkWidth, + kMetricXgmiLinkSpeed, + 
kMetricXgmiReadDataAccumulator, + kMetricXgmiWriteDataAccumulator, + + // kGpuMetricPowerEnergy counters + kMetricAvgSocketPower, + kMetricCurrSocketPower, + kMetricEnergyAccumulator, + + // kGpuMetricVoltage counters + kMetricVoltageSoc, + kMetricVoltageGfx, + kMetricVoltageMem, + + // kGpuMetricTimestamp counters + kMetricTSClockCounter, + kMetricTSFirmware, +}; +using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map; + +// Width tag recorded next to each dynamic metric value (see m_original_type below). +using AMDGpuMetricsDataTypeId_t = uint8_t; +enum class AMDGpuMetricsDataType_t : AMDGpuMetricsDataTypeId_t +{ + kUInt8, + kUInt16, + kUInt32, + kUInt64, +}; + +// One entry of the dynamic metrics table: the value held as uint64_t, a text label, and its width tag. +struct AMDGpuDynamicMetricsValue_t +{ + uint64_t m_value; + std::string m_info; + AMDGpuMetricsDataType_t m_original_type; +}; +using AMDGpuDynamicMetricTblValues_t = std::vector; +using AMDGpuDynamicMetricsTbl_t = std::map>; + +// Note: All supported metric versions are listed here +// If not here, they are not supported +enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t +{ + kGpuMetricNone = 0x0, + kGpuMetricV10 = (0x1 << 0), + kGpuMetricV11 = (0x1 << 1), + kGpuMetricV12 = (0x1 << 2), + kGpuMetricV13 = (0x1 << 3), + kGpuMetricV14 = (0x1 << 4), +}; +using AMDGpuMetricVersionTranslationTbl_t = std::map; + + +class GpuMetricsBase_t; +using GpuMetricsBasePtr = std::shared_ptr; + +// Abstract interface implemented once per supported metrics table version. +class GpuMetricsBase_t +{ + public: + virtual ~GpuMetricsBase_t() = default; + // Size in bytes of the concrete metrics struct held by the implementation. + virtual size_t sizeof_metric_table() = 0; + // Reference to the implementation's metrics struct. + virtual AMDGpuMetricsBaseRef get_metrics_table() = 0; + // Version flag identifying which table layout the implementation handles. + virtual AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() = 0; + // Fills m_metrics_dynamic_tbl; defined out of line by each implementation. + virtual rsmi_status_t populate_metrics_dynamic_tbl() = 0; + + // Returns a copy of the dynamic metrics table. + virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() { + return m_metrics_dynamic_tbl; + } + + protected: + AMDGpuDynamicMetricsTbl_t m_metrics_dynamic_tbl; + // Zero-initialized so the member is never read while indeterminate. + uint64_t m_metrics_timestamp{0}; + +}; +using AMDGpuMetricFactories_t = std::map; + + +class GpuMetricsBase_v11_t final : public GpuMetricsBase_t +{ + public: + ~GpuMetricsBase_v11_t() = default; + + size_t sizeof_metric_table() 
override { + return sizeof(AMDGpuMetrics_v11_t); + } + + AMDGpuMetricsBaseRef get_metrics_table() override + { + return m_gpu_metrics_tbl; + } + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override + { + return AMDGpuMetricVersionFlags_t::kGpuMetricV11; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + + + private: + AMDGpuMetrics_v11_t m_gpu_metrics_tbl; + +}; + +class GpuMetricsBase_v12_t final : public GpuMetricsBase_t +{ + public: + ~GpuMetricsBase_v12_t() = default; + + size_t sizeof_metric_table() override { + return sizeof(AMDGpuMetrics_v12_t); + } + + AMDGpuMetricsBaseRef get_metrics_table() override + { + return m_gpu_metrics_tbl; + } + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override + { + return AMDGpuMetricVersionFlags_t::kGpuMetricV12; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + + + private: + AMDGpuMetrics_v12_t m_gpu_metrics_tbl; + +}; + +class GpuMetricsBase_v13_t final : public GpuMetricsBase_t +{ + public: + ~GpuMetricsBase_v13_t() = default; + + size_t sizeof_metric_table() override { + return sizeof(AMDGpuMetrics_v13_t); + } + + AMDGpuMetricsBaseRef get_metrics_table() override + { + return m_gpu_metrics_tbl; + } + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override + { + return AMDGpuMetricVersionFlags_t::kGpuMetricV13; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + + + private: + AMDGpuMetrics_v13_t m_gpu_metrics_tbl; + +}; + +class GpuMetricsBase_v14_t final : public GpuMetricsBase_t +{ + public: + ~GpuMetricsBase_v14_t() = default; + + size_t sizeof_metric_table() override { + return sizeof(AMDGpuMetrics_v14_t); + } + + AMDGpuMetricsBaseRef get_metrics_table() override + { + return m_gpu_metrics_tbl; + } + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override + { + return AMDGpuMetricVersionFlags_t::kGpuMetricV14; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + + + private: + AMDGpuMetrics_v14_t 
m_gpu_metrics_tbl; + +}; + +template +rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value); + +} // namespace amd::smi + +#endif // ROCM_SMI_ROCM_SMI_GPU_METRICS_H_ + diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 2108a52289..3f09b83683 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -3517,7 +3517,6 @@ rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity) { } auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); - auto avg_mm_activity(uint16_t(0)); rsmi_activity_metric_counter_t activity_metric_counter; status_code = rsmi_dev_activity_metric_get(dv_ind, rsmi_activity_metric_t::RSMI_ACTIVITY_MM, &activity_metric_counter); avg_activity = &activity_metric_counter.average_mm_activity; @@ -3535,7 +3534,6 @@ rsmi_dev_activity_avg_mm_get(uint32_t dv_ind, uint16_t* avg_activity) { CATCH } - rsmi_status_t rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) { TRY @@ -5405,6 +5403,616 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { CATCH } +// +// NOTE: APIs related to new 'GPU Metrics' related work are added here +// so they can be used/tested. 
+// + +// Queries GPU metric kMetricTempHotspot for device dv_ind and stores it in *hotspot_value. +rsmi_status_t +rsmi_dev_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (hotspot_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHotspot); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *hotspot_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricTempMem for device dv_ind and stores it in *mem_value. +rsmi_status_t +rsmi_dev_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (mem_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempMem); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricTempVrSoc for device dv_ind and stores it in *vrsoc_value. +rsmi_status_t +rsmi_dev_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (vrsoc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrSoc); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrsoc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " 
<< status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricCurrSocketPower for device dv_ind and stores it in *socket_power_value. +rsmi_status_t +rsmi_dev_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (socket_power_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *socket_power_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricAvgGfxActivity for device dv_ind and stores it in *gfx_activity_value. +rsmi_status_t +rsmi_dev_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (gfx_activity_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricAvgUmcActivity for device dv_ind and stores it in *umc_activity_value. +rsmi_status_t +rsmi_dev_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (umc_activity_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, 
*umc_activity_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricEnergyAccumulator for device dv_ind and stores it in *energy_acc_value. +rsmi_status_t +rsmi_dev_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (energy_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *energy_acc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricTSClockCounter for device dv_ind and stores it in *system_clock_counter_value. +rsmi_status_t +rsmi_dev_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (system_clock_counter_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSClockCounter); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *system_clock_counter_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricTSFirmware for device dv_ind and stores it in *firmware_timestamp_value. +rsmi_status_t +rsmi_dev_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << 
__PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (firmware_timestamp_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSFirmware); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *firmware_timestamp_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricThrottleStatus for device dv_ind and stores it in *throttle_status_value. +rsmi_status_t +rsmi_dev_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (throttle_status_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricThrottleStatus); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *throttle_status_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricPcieLinkWidth for device dv_ind and stores it in *pcie_link_width_value. +rsmi_status_t +rsmi_dev_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (pcie_link_width_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_width_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + 
LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricPcieLinkSpeed for device dv_ind and stores it in *pcie_link_speed_value. +rsmi_status_t +rsmi_dev_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (pcie_link_speed_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_speed_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricXgmiLinkWidth for device dv_ind and stores it in *xgmi_link_width_value. +rsmi_status_t +rsmi_dev_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (xgmi_link_width_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_width_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricXgmiLinkSpeed for device dv_ind and stores it in *xgmi_link_speed_value. +rsmi_status_t +rsmi_dev_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (xgmi_link_speed_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_speed_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end 
======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricGfxClkLockStatus for device dv_ind and stores it in *gfxclk_lock_status_value. +rsmi_status_t +rsmi_dev_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (gfxclk_lock_status_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfxclk_lock_status_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries GPU metric kMetricGfxActivityAccumulator for device dv_ind and stores it in *gfx_activity_acc_value. +rsmi_status_t +rsmi_dev_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (gfx_activity_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_acc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries a GPU metric for device dv_ind and stores it in *mem_activity_acc_value. +rsmi_status_t +rsmi_dev_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // Reject a null output pointer before it is dereferenced. + if (mem_activity_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + 
+ const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_activity_acc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries kMetricPcieBandwidthAccumulator for device dv_ind and stores the result through pcie_bandwidth_acc_value. +rsmi_status_t +rsmi_dev_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (pcie_bandwidth_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_acc_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Queries kMetricPcieBandwidthInst for device dv_ind and stores the result through pcie_bandwidth_inst_value. +rsmi_status_t +rsmi_dev_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (pcie_bandwidth_inst_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_inst_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + 
return status_code; + CATCH +} + +// Queries kMetricCurrUClock for device dv_ind and stores the result through uclk_value. +rsmi_status_t +rsmi_dev_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (uclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrUClock); + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *uclk_value); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + + return status_code; + CATCH +} + +// Placeholder for kMetricVcnActivity: no metrics query is issued yet; the output is value-initialized and success is reported. +rsmi_status_t +rsmi_dev_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivityTbl_t* vcn_activity_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (vcn_activity_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); + // Value-initialize so the caller never receives indeterminate contents. + GPUMetricVcnActivityTbl_t tmp_vcn_activity_value{}; + *vcn_activity_value = tmp_vcn_activity_value; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; + CATCH +} + +// Placeholder for kMetricXgmiReadDataAccumulator: no metrics query is issued yet; the output is value-initialized and success is reported. +rsmi_status_t +rsmi_dev_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiAccTbl_t* xgmi_read_data_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (xgmi_read_data_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + // Log the metric this entry point stands for (was kMetricVcnActivity, a copy-paste leftover). + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator); + // Value-initialize so the caller never receives indeterminate contents. + GPUMetricXgmiAccTbl_t tmp_xgmi_read_data_acc_value{}; + *xgmi_read_data_acc_value = tmp_xgmi_read_data_acc_value; + auto 
status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; + CATCH +} + +// Placeholder for kMetricXgmiWriteDataAccumulator: no metrics query is issued yet; the output is value-initialized and success is reported. +rsmi_status_t +rsmi_dev_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiAccTbl_t* xgmi_write_data_acc_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (xgmi_write_data_acc_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + // Log the metric this entry point stands for (was kMetricVcnActivity, a copy-paste leftover). + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator); + // Value-initialize so the caller never receives indeterminate contents. + GPUMetricXgmiAccTbl_t tmp_xgmi_write_data_acc_value{}; + *xgmi_write_data_acc_value = tmp_xgmi_write_data_acc_value; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; + CATCH +} + +// Placeholder for kMetricCurrGfxClock: no metrics query is issued yet; the output is value-initialized and success is reported. +rsmi_status_t +rsmi_dev_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClkTbl_t* current_gfxclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (current_gfxclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + // Log the metric this entry point stands for (was kMetricVcnActivity, a copy-paste leftover). + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock); + // Value-initialize so the caller never receives indeterminate contents. + GPUMetricCurrGfxClkTbl_t tmp_current_gfxclk_value{}; + *current_gfxclk_value = tmp_current_gfxclk_value; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; + CATCH +} + 
+// Placeholder for kMetricCurrSocClock: no metrics query is issued yet; the output is value-initialized and success is reported. +rsmi_status_t +rsmi_dev_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClkTbl_t* current_socclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (current_socclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + // Log the metric this entry point stands for (was kMetricVcnActivity, a copy-paste leftover). + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock); + // Value-initialize so the caller never receives indeterminate contents. + GPUMetricCurrSocClkTbl_t tmp_current_socclk_value{}; + *current_socclk_value = tmp_current_socclk_value; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; + CATCH +} + +// Placeholder for kMetricCurrVClock0: no metrics query is issued yet; the output is value-initialized and success is reported. +rsmi_status_t +rsmi_dev_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClkTbl_t* current_vclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (current_vclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + // Log the metric this entry point stands for (was kMetricVcnActivity, a copy-paste leftover). + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0); + // Value-initialize so the caller never receives indeterminate contents. + GPUMetricCurrVClkTbl_t tmp_current_vclk_value{}; + *current_vclk_value = tmp_current_vclk_value; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; + CATCH +} + +// Placeholder for kMetricCurrDClock0: no metrics query is issued yet; the output is value-initialized and success is reported. +rsmi_status_t +rsmi_dev_curr_vdlk0_get(uint32_t dv_ind, GPUMetricCurrDClkTbl_t* current_dclk_value) +{ + TRY + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ostrstream); + + // The output pointer is dereferenced below, so a null pointer is rejected first. + if (current_dclk_value == nullptr) { + return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + } + + // Log the metric this entry point stands for (was kMetricVcnActivity, a copy-paste leftover). + const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0); + // Value-initialize so the caller never receives indeterminate contents. + GPUMetricCurrDClkTbl_t tmp_current_dclk_value{}; + *current_dclk_value = tmp_current_dclk_value; 
+ auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | End Result " + << " | Device #: " << dv_ind + << " | Metric Type: " << static_cast(gpu_metric_unit) + << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + return status_code; + CATCH +} + +// +// End of: new GPU Metrics related work. +// + + // UNDOCUMENTED FUNCTIONS // This functions are not declared in rocm_smi.h. They are either not fully // supported, or to be used for test purposes. @@ -5439,3 +6047,4 @@ rsmi_test_refcount(uint64_t refcnt_type) { return static_cast(smi.ref_count()); } + diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index 16d629895d..6123eae147 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -41,26 +41,34 @@ * */ +#include "rocm_smi/rocm_smi_gpu_metrics.h" +#include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_device.h" +#include "rocm_smi/rocm_smi_logger.h" + #include #include #include #include +#include #include #include #include #include #include +#include #include // NOLINT #include +#include +#include +#include #include -#include "rocm_smi/rocm_smi_common.h" // Should go before rocm_smi.h -#include "rocm_smi/rocm_smi_main.h" -#include "rocm_smi/rocm_smi_utils.h" -#include "rocm_smi/rocm_smi_exception.h" -#include "rocm_smi/rocm_smi_logger.h" - using namespace amd::smi; #define TRY try { @@ -526,3 +534,1097 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { return ret; CATCH } + + +namespace amd::smi +{ + +constexpr uint16_t join_metrics_version(uint8_t format_rev, uint8_t content_rev) +{ + 
return (format_rev << 8 | content_rev); +} + +constexpr uint16_t join_metrics_version(const AMDGpuMetricsHeader_v1_t& metrics_header) +{ + return join_metrics_version(metrics_header.m_format_revision, metrics_header.m_content_revision); +} + +AMDGpuMetricsHeader_v1_t disjoin_metrics_version(uint16_t version) +{ + AMDGpuMetricsHeader_v1_t metrics_header; + + metrics_header.m_format_revision = static_cast((version & 0xFF00) >> 8); + metrics_header.m_content_revision = static_cast(version & 0x00FF); + + return metrics_header; +} + +uint64_t actual_timestamp_in_secs() +{ + using namespace std::chrono; + return duration_cast(system_clock::now().time_since_epoch()).count(); +} + +std::string stringfy_metrics_header(const AMDGpuMetricsHeader_v1_t& metrics_header) +{ + std::stringstream metrics_header_info; + metrics_header_info + << "Format: " << print_unsigned_hex_and_int(metrics_header.m_format_revision) + << "." << print_unsigned_hex_and_int(metrics_header.m_content_revision) + << " Size: " << print_unsigned_hex_and_int(metrics_header.m_structure_size); + + return metrics_header_info.str(); +} + +// +// version 1,0: 256 +// version 1,1: 257 +// version 1,2: 258 +// version 1,3: 259 +// version 1,4: 260 +// version 1,5: 261 +// +const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table +{ + {join_metrics_version(1, 1), AMDGpuMetricVersionFlags_t::kGpuMetricV11}, + {join_metrics_version(1, 2), AMDGpuMetricVersionFlags_t::kGpuMetricV12}, + {join_metrics_version(1, 3), AMDGpuMetricVersionFlags_t::kGpuMetricV13}, + {join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14}, +}; + +/** + * +*/ +const AMDGpuMetricsClassIdTranslationTbl_t amdgpu_metrics_class_id_translation_table +{ + {AMDGpuMetricsClassId_t::kGpuMetricHeader, "Header"}, + {AMDGpuMetricsClassId_t::kGpuMetricTemperature, "Temperature"}, + {AMDGpuMetricsClassId_t::kGpuMetricUtilization, "Utilization"}, + {AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy, "Power/Energy"}, + 
{AMDGpuMetricsClassId_t::kGpuMetricSystemClockCounter, "System Clock"}, + {AMDGpuMetricsClassId_t::kGpuMetricAverageClock, "Average Clock"}, + {AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, "Current Clock"}, + {AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus, "Throttle"}, + {AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus, "Gfx Clock Lock"}, + {AMDGpuMetricsClassId_t::kGpuMetricCurrentFanSpeed, "Current Fan Speed"}, + {AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, "Link/Bandwidth/Speed"}, + {AMDGpuMetricsClassId_t::kGpuMetricVoltage, "Voltage"}, + {AMDGpuMetricsClassId_t::kGpuMetricTimestamp, "Timestamp"}, +}; + +const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation_table +{ + // kGpuMetricTemperature counters + {AMDGpuMetricsUnitType_t::kMetricTempEdge, "TempEdge"}, + {AMDGpuMetricsUnitType_t::kMetricTempHotspot, "TempHotspot"}, + {AMDGpuMetricsUnitType_t::kMetricTempMem, "TempMem"}, + {AMDGpuMetricsUnitType_t::kMetricTempVrGfx, "TempVrGfx"}, + {AMDGpuMetricsUnitType_t::kMetricTempVrSoc, "TempVrSoc"}, + {AMDGpuMetricsUnitType_t::kMetricTempVrMem, "TempVrMem"}, + {AMDGpuMetricsUnitType_t::kMetricTempHbm, "TempHbm"}, + + // kGpuMetricUtilization counters + {AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, "AvgGfxActivity"}, + {AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, "AvgUmcActivity"}, + {AMDGpuMetricsUnitType_t::kMetricAvgMmActivity, "AvgMmActivity"}, + {AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, "GfxActivityAcc"}, + {AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, "MemActivityAcc"}, + {AMDGpuMetricsUnitType_t::kMetricVcnActivity, "VcnActivity"}, + + // kGpuMetricAverageClock counters + {AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency, "AvgGfxClockFrequency"}, + {AMDGpuMetricsUnitType_t::kMetricAvgSocClockFrequency, "AvgSocClockFrequency"}, + {AMDGpuMetricsUnitType_t::kMetricAvgUClockFrequency, "AvgUClockFrequency"}, + {AMDGpuMetricsUnitType_t::kMetricAvgVClock0Frequency, "AvgVClock0Frequency"}, + 
{AMDGpuMetricsUnitType_t::kMetricAvgDClock0Frequency, "AvgDClock0Frequency"}, + {AMDGpuMetricsUnitType_t::kMetricAvgVClock1Frequency, "AvgVClock1Frequency"}, + {AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency, "AvgDClock1Frequency"}, + + // kGpuMetricCurrentClock counters + {AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, "CurrGfxClock"}, + {AMDGpuMetricsUnitType_t::kMetricCurrSocClock, "CurrSocClock"}, + {AMDGpuMetricsUnitType_t::kMetricCurrUClock, "CurrUClock"}, + {AMDGpuMetricsUnitType_t::kMetricCurrVClock0, "CurrVClock0"}, + {AMDGpuMetricsUnitType_t::kMetricCurrDClock0, "CurrDClock0"}, + {AMDGpuMetricsUnitType_t::kMetricCurrVClock1, "CurrVClock1"}, + {AMDGpuMetricsUnitType_t::kMetricCurrDClock1, "CurrDClock1"}, + + // kGpuMetricThrottleStatus counters + {AMDGpuMetricsUnitType_t::kMetricThrottleStatus, "ThrottleStatus"}, + {AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus, "IndepThrottleStatus"}, + + // kGpuMetricGfxClkLockStatus counters + {AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, "GfxClkLockStatus"}, + + // kGpuMetricCurrentFanSpeed counters + {AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed, "CurrFanSpeed"}, + + // kGpuMetricLinkWidthSpeed counters + {AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, "PcieLinkWidth"}, + {AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, "PcieLinkSpeed"}, + {AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, "PcieBandwidthAcc"}, + {AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, "PcieBandwidthInst"}, + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, "XgmiLinkWidth"}, + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"}, + {AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"}, + {AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, "XgmiWriteDataAcc"}, + + // kGpuMetricPowerEnergy counters + {AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, "AvgSocketPower"}, + {AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, "CurrSocketPower"}, + 
{AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, "EnergyAcc"}, + + // kGpuMetricVoltage counters + {AMDGpuMetricsUnitType_t::kMetricVoltageSoc, "VoltageSoc"}, + {AMDGpuMetricsUnitType_t::kMetricVoltageGfx, "VoltageGfx"}, + {AMDGpuMetricsUnitType_t::kMetricVoltageMem, "VoltageMem"}, + + // kGpuMetricTimestamp counters + {AMDGpuMetricsUnitType_t::kMetricTSClockCounter, "TSClockCounter"}, + {AMDGpuMetricsUnitType_t::kMetricTSFirmware, "TSFirmware"}, +}; + + +AMDGpuMetricVersionFlags_t translate_header_to_flag_version(const AMDGpuMetricsHeader_v1_t& metrics_header) +{ + const auto flag_version = join_metrics_version(metrics_header); + if (amdgpu_metric_version_translation_table.find(flag_version) != amdgpu_metric_version_translation_table.end()) { + return amdgpu_metric_version_translation_table.at(flag_version); + } + + return AMDGpuMetricVersionFlags_t::kGpuMetricNone; +} + +rsmi_status_t is_gpu_metrics_version_supported(const AMDGpuMetricsHeader_v1_t& metrics_header) +{ + const auto flag_version = join_metrics_version(metrics_header); + return (amdgpu_metric_version_translation_table.find(flag_version) != + amdgpu_metric_version_translation_table.end()) + ? 
rsmi_status_t::RSMI_STATUS_SUCCESS : rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; +} + +struct AMDGpuMetricsLogInfo_t +{ + rsmi_status_t m_status_code; + std::string m_title; + std::string m_pretty_function; +}; + +class AMDGpuMetricsLogger_t +{ + public: + enum class LogInfoType_t + { + kLogError, + kLogAlarm, + kLogInfo, + kLogBuffer, + kLogTrace, + kLogDebug, + }; + + void operator()(const AMDGpuMetricsLogInfo_t& log_info) + { + m_ostrstream << log_info.m_pretty_function << log_info.m_title; + LOG_TRACE(m_ostrstream); + + m_ostrstream << log_info.m_pretty_function + << " | ======= end ======= " + << " | Fail " + << " | Device #: " + << " | Metric Version: " + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(log_info.m_status_code) + << " |"; + + LOG_ERROR(m_ostrstream); + } + + private: + std::ostringstream m_ostrstream; + +}; + + +AMDGpuMetricFactories_t amd_gpu_metrics_factory_table +{ + {AMDGpuMetricVersionFlags_t::kGpuMetricV11, std::make_unique(GpuMetricsBase_v11_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV12, std::make_unique(GpuMetricsBase_v12_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV13, std::make_unique(GpuMetricsBase_v13_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_unique(GpuMetricsBase_v14_t{})}, +}; + +// Looks up the metrics object registered for gpu_metric_version; returns nullptr when the version has no table entry. +GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) +{ + // Return a copy of the table entry. Moving it out would leave a null pointer in the + // table, so the next request for the same version would get nullptr. + const auto factory_itr = amd_gpu_metrics_factory_table.find(gpu_metric_version); + if (factory_itr != amd_gpu_metrics_factory_table.end()) { + return factory_itr->second; + } + + return nullptr; +} + +template +AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::string& value_title) +{ + auto multi_values = AMDGpuDynamicMetricTblValues_t{}; + + auto get_data_type_info = [&]() { + auto data_type(AMDGpuMetricsDataType_t::kUInt64); + if constexpr 
(std::is_array_v) { + const uint8_t check_uint8[]={1}; + const uint16_t check_uint16[]={2}; + const uint32_t check_uint32[]={3}; + const uint64_t check_uint64[]={4}; + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt8; + } + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt16; + } + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt32; + } + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt64; + } + return std::make_tuple(data_type, static_cast(std::end(metric) - std::begin(metric))); + } + + const uint16_t kSingleValue(1); + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt8; + } + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt16; + } + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt32; + } + if constexpr (std::is_same_v) { + data_type = AMDGpuMetricsDataType_t::kUInt64; + } + return std::make_tuple(data_type, kSingleValue); + }; + + const auto [data_type, num_values] = get_data_type_info(); + for (auto idx = uint16_t(0); idx < num_values; ++idx) { + auto value = uint64_t(0); + if constexpr (std::is_array_v) { + value = (metric[idx]); + } + else { + value = (metric); + } + + auto amdgpu_dynamic_metric_value = [&]() { + AMDGpuDynamicMetricsValue_t amdgpu_dynamic_metric_value_init{}; + amdgpu_dynamic_metric_value_init.m_value = value; + amdgpu_dynamic_metric_value_init.m_info = (value_title + " : " + std::to_string(idx)); + amdgpu_dynamic_metric_value_init.m_original_type = data_type; + return amdgpu_dynamic_metric_value_init; + }(); + + multi_values.emplace_back(amdgpu_dynamic_metric_value); + } + + return multi_values; +} + +rsmi_status_t GpuMetricsBase_v14_t::populate_metrics_dynamic_tbl() +{ + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + 
format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot")) + ); + // (a second, identical kMetricTempHotspot insert used to follow here; it was a copy-paste duplicate) + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc")) + ); + + // Power/Energy Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_curr_socket_power, + "curr_socket_power")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc")) + ); + + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnActivity, + format_metric_row(m_gpu_metrics_tbl.m_vcn_activity, + "[average_vcn_activity]")) + ); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc")) + ); + + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, + "firmware_timestamp")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter")) + ); + + // Throttle Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricThrottleStatus, + format_metric_row(m_gpu_metrics_tbl.m_throttle_status, + "throttle_status")) + ); + + // GfxLock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, + format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status, + "gfxclk_lock_status")) + ); + + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed")) + ); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width, + "xgmi_link_width")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed, + "xgmi_link_speed")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, + "pcie_bandwidth_acc")) + ); + // The instantaneous PCIe bandwidth row reads the instantaneous field, not the accumulator. + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, + "pcie_bandwidth_inst")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, + "[xgmi_read_data_acc]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc, + "[xgmi_write_data_acc]")) + ); + + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "[current_gfxclk]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "[current_socclk]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + "[current_vclk0]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "[current_dclk0]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk")) + ); + + return rsmi_status_t::RSMI_STATUS_SUCCESS; +} + +rsmi_status_t GpuMetricsBase_v13_t::populate_metrics_dynamic_tbl() +{ + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempEdge, + format_metric_row(m_gpu_metrics_tbl.m_temperature_edge, + "temperature_edge")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrGfx, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrgfx, + "temperature_vrgfx")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrmem, + "temperature_vrmem")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHbm, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hbm, + "[temperature_hbm]")) + ); + + // Power/Energy Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_average_socket_power, + "average_socket_power")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc")) + ); + + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgMmActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_mm_activity, + "average_mm_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + 
format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc")) + ); + + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, + "firmware_timestamp")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter")) + ); + + // Fan Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentFanSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed, + format_metric_row(m_gpu_metrics_tbl.m_current_fan_speed, + "current_fan_speed")) + ); + + // Throttle Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricThrottleStatus, + format_metric_row(m_gpu_metrics_tbl.m_throttle_status, + "throttle_status")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus, + format_metric_row(m_gpu_metrics_tbl.m_indep_throttle_status, + "indep_throttle_status")) + ); + + // Average Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency, + format_metric_row(m_gpu_metrics_tbl.m_average_gfxclk_frequency, + "average_gfxclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgSocClockFrequency, + format_metric_row(m_gpu_metrics_tbl.m_average_socclk_frequency, + "average_socclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUClockFrequency, + 
format_metric_row(m_gpu_metrics_tbl.m_average_uclk_frequency, + "average_uclk_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgVClock0Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_vclk0_frequency, + "average_vclk0_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgDClock0Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_dclk0_frequency, + "average_dclk0_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgVClock1Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_vclk1_frequency, + "average_vclk1_frequency")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricAverageClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency, + format_metric_row(m_gpu_metrics_tbl.m_average_dclk1_frequency, + "average_dclk1_frequency")) + ); + + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "current_gfxclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "current_socclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + 
"current_vclk0")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "current_dclk0")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock1, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk1, + "current_vclk1")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock1, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk1, + "current_dclk1")) + ); + + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed")) + ); + + // Voltage Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricVoltage] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVoltageSoc, + format_metric_row(m_gpu_metrics_tbl.m_voltage_soc, + "voltage_soc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricVoltage] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVoltageGfx, + format_metric_row(m_gpu_metrics_tbl.m_voltage_gfx, + "voltage_gfx")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricVoltage] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVoltageMem, + format_metric_row(m_gpu_metrics_tbl.m_voltage_mem, + "voltage_mem")) + ); + + return rsmi_status_t::RSMI_STATUS_SUCCESS; +} + +rsmi_status_t GpuMetricsBase_v12_t::populate_metrics_dynamic_tbl() +{ + // TODO: Implement these; + return 
rsmi_status_t::RSMI_STATUS_NOT_YET_IMPLEMENTED; +} + +rsmi_status_t GpuMetricsBase_v11_t::populate_metrics_dynamic_tbl() +{ + // TODO: Implement these; + return rsmi_status_t::RSMI_STATUS_NOT_YET_IMPLEMENTED; +} + + +rsmi_status_t Device::dev_read_gpu_metrics_header_data() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // Check if/when metrics table needs to be refreshed. + auto now_ts = actual_timestamp_in_secs(); + if (((!m_gpu_metrics_header.m_structure_size) || + (!m_gpu_metrics_header.m_format_revision) || + (!m_gpu_metrics_header.m_content_revision)) || + ((now_ts - m_gpu_metrics_updated_timestamp) >= + kRSMI_GPU_METRICS_EXPIRATION_SECS)) { + auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, + sizeof(AMDGpuMetricsHeader_v1_t), + &m_gpu_metrics_header); + if ((status_code = ErrnoToRsmiStatus(op_result)) != + rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: readDevInfo(kDevGpuMetrics)" + << " | Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) == + rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: gpu metric file version is not supported: " + << " | Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << 
print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); + } + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + return status_code; +} + +rsmi_status_t Device::dev_read_gpu_metrics_all_data() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // At this point we should have a valid gpu_metrics pointer. + if (!m_gpu_metrics_ptr) { + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, + m_gpu_metrics_ptr->sizeof_metric_table(), + &m_gpu_metrics_ptr->get_metrics_table()); + if ((status_code = ErrnoToRsmiStatus(op_result)) != + rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Cause: readDevInfo(kDevGpuMetrics)" + << " | Returning = " + << getRSMIStatusString(status_code) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + 
LOG_ERROR(ostrstream); + return status_code; + } + + m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + return status_code; +} + +rsmi_status_t Device::setup_gpu_metrics_reading() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + status_code = dev_read_gpu_metrics_header_data(); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + return status_code; + } + + const auto gpu_metrics_flag_version = translate_header_to_flag_version(dev_get_metrics_header()); + if (gpu_metrics_flag_version == AMDGpuMetricVersionFlags_t::kGpuMetricNone) { + status_code = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | [Translates to: " << join_metrics_version(dev_get_metrics_header()) + << " ] " + << " | Cause: Metric version found is not supported!" 
+ << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + auto gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); + if (!gpu_metrics_ptr) { + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + /// gpu_metrics_ptr has the pointer to the proper object type/version. + dev_set_gpu_metric(gpu_metrics_ptr); + status_code = dev_read_gpu_metrics_all_data(); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: dev_read_gpu_metrics_all_data() couldn't read gpu metric data!" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + return status_code; +} + +rsmi_status_t Device::dev_log_gpu_metrics() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // At this point we should have a valid gpu_metrics pointer. 
+ if (!m_gpu_metrics_ptr) { + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + // Header info + auto header_output = [&]() { + const auto gpu_metrics_header = dev_get_metrics_header(); + ostrstream << "GPU Metrics Header: \n"; + ostrstream << "Timestamp: " << m_gpu_metrics_updated_timestamp << "\n"; + ostrstream << "Based on: " << static_cast(m_gpu_metrics_ptr->get_gpu_metrics_version_used()) << "\n"; + ostrstream << print_unsigned_hex_and_int(gpu_metrics_header.m_structure_size, " ->structure_size "); + ostrstream << print_unsigned_hex_and_int(gpu_metrics_header.m_format_revision, " ->format_revision "); + ostrstream << print_unsigned_hex_and_int(gpu_metrics_header.m_content_revision, " ->content_revision "); + LOG_DEBUG(ostrstream); + return; + }; + + auto table_content_output = [&]() { + const auto gpu_metrics_tbl = m_gpu_metrics_ptr->get_metrics_dynamic_tbl(); + + ostrstream << "GPU Metrics Data: \n"; + for (const auto& [metric_class, metric_data] : gpu_metrics_tbl) { + ostrstream << amdgpu_metrics_class_id_translation_table.at(metric_class) << "\n"; + + for (const auto& [metric_unit, metric_values] : metric_data) { + for (const auto& metric_value : metric_values) { + ostrstream << print_unsigned_hex_and_int(metric_value.m_value, metric_value.m_info); + } + } + } + return; + }; + + // + header_output(); + table_content_output(); + + return status_code; +} + +rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, AMDGpuDynamicMetricTblValues_t& values) +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_NOT_FOUND); + 
ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + status_code = setup_gpu_metrics_reading(); + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + return status_code; + } + + if (!m_gpu_metrics_ptr) { + status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + // Lookup the dynamic table + const auto gpu_metrics_tbl = m_gpu_metrics_ptr->get_metrics_dynamic_tbl(); + for (const auto& [metric_class, metric_data] : gpu_metrics_tbl) { + for (const auto& [metric_unit, metric_values] : metric_data) { + if (metric_unit == metric_counter) { + values = metric_values; + return rsmi_status_t::RSMI_STATUS_SUCCESS; + } + } + } + + return status_code; +} + + +template struct is_vector : std::false_type {}; +template struct is_vector> : std::true_type {}; +template inline constexpr bool is_vector_v = is_vector::value; + +template struct is_array : std::false_type {}; +template struct is_array> : std::true_type {}; + +template struct is_bounded_uint8_array : std::false_type {}; +template struct is_bounded_uint16_array : std::false_type {}; +template struct is_bounded_uint32_array : std::false_type {}; +template struct is_bounded_uint64_array : std::false_type {}; + +template +struct is_bounded_uint8_array : std::true_type {}; + +template +struct is_bounded_uint16_array : std::true_type {}; + +template +struct is_bounded_uint32_array : std::true_type {}; + +template +struct is_bounded_uint64_array : std::true_type {}; + +template struct is_bounded_array : std::false_type {}; + +template +struct is_bounded_array : std::true_type {}; + + +template +rsmi_status_t 
rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value) +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + AMDGpuDynamicMetricTblValues_t tmp_values{}; + GET_DEV_FROM_INDX + status_code = dev->run_internal_gpu_metrics_query(metric_counter, tmp_values); + if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || tmp_values.empty()) { + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Version: " << stringfy_metrics_header(dev->dev_get_metrics_header()) + << " | Cause: Couldn't find metric/counter requested" + << " | Metric Type: " << static_cast(metric_counter) + << " " << amdgpu_metrics_unit_type_translation_table.at(metric_counter) + << " | Values: " << tmp_values.size() + << " | Returning = " + << getRSMIStatusString(status_code) + << " |"; + LOG_ERROR(ostrstream); + return status_code; + } + + const auto num_stored_elems = (std::end(tmp_values) - std::begin(tmp_values)); + if constexpr (std::is_array_v) { + std::variant tmp_value; + auto idx = uint16_t(0); + for (const auto& value : tmp_values) { + tmp_value = value.m_value; + idx++; + switch (value.m_original_type) { + case AMDGpuMetricsDataType_t::kUInt8: + metric_value[idx] = std::get(tmp_value); + break; + + case AMDGpuMetricsDataType_t::kUInt16: + metric_value[idx] = std::get(tmp_value); + break; + + case AMDGpuMetricsDataType_t::kUInt32: + metric_value[idx] = std::get(tmp_value); + break; + + case AMDGpuMetricsDataType_t::kUInt64: + metric_value[idx] = std::get(tmp_value); + break; + + default: + break; + } + + metric_value[idx++] = tmp_value; + } + } + if constexpr ((std::is_same_v) || (std::is_same_v) || + (std::is_same_v) || (std::is_same_v)) { + T tmp_value(0); + tmp_value = static_cast(tmp_values[0].m_value); + metric_value = tmp_value; + } 
+ + return status_code; +} + + +template +rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint8_t& metric_value); + +template +rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint16_t& metric_value); + +template +rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint32_t& metric_value); + +template +rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, uint64_t& metric_value); + + +} //namespace amd::smi