// include/rocm_smi/rocm_smi.h

/**
 * @brief Header that starts a gpu metrics table.
 *
 * Carries a size field and two revision fields for the table that follows.
 * NOTE(review): how consumers are expected to interpret the revision numbers
 * is not stated in this header -- confirm against the driver documentation.
 */
struct metrics_table_header {
  uint16_t structure_size;
  uint8_t format_revision;
  uint8_t content_revision;
};

/**
 * @brief This structure holds the gpu metrics values for a device.
 *
 * It is the output type of rsmi_dev_gpu_metrics_info_get().
 *
 * Layout note: the members rely on the compiler's natural alignment. The
 * header is 4 bytes and is followed by a uint64_t, so on ABIs where uint64_t
 * is 8-byte aligned there are 4 unnamed padding bytes between
 * @p common_header and @p system_clock_counter. Do not reorder, insert or
 * resize members if this structure is filled from a raw byte image.
 *
 * Units are only given where a member comment states them.
 */
typedef struct {
  struct metrics_table_header common_header;

  /* Driver attached timestamp (in ns) */
  uint64_t system_clock_counter;

  /* Temperature */
  uint16_t temperature_edge;
  uint16_t temperature_hotspot;
  uint16_t temperature_mem;
  uint16_t temperature_vrgfx;
  uint16_t temperature_vrsoc;
  uint16_t temperature_vrmem;

  /* Utilization */
  uint16_t average_gfx_activity;
  uint16_t average_umc_activity;  /* memory controller */
  uint16_t average_mm_activity;   /* UVD or VCN */

  /* Power/Energy */
  uint16_t average_socket_power;
  uint32_t energy_accumulator;

  /* Average clocks */
  uint16_t average_gfxclk_frequency;
  uint16_t average_socclk_frequency;
  uint16_t average_uclk_frequency;
  uint16_t average_vclk0_frequency;
  uint16_t average_dclk0_frequency;
  uint16_t average_vclk1_frequency;
  uint16_t average_dclk1_frequency;

  /* Current clocks */
  uint16_t current_gfxclk;
  uint16_t current_socclk;
  uint16_t current_uclk;
  uint16_t current_vclk0;
  uint16_t current_dclk0;
  uint16_t current_vclk1;
  uint16_t current_dclk1;

  /* Throttle status */
  uint32_t throttle_status;

  /* Fans */
  uint16_t current_fan_speed;

  /* Link width/speed */
  uint8_t pcie_link_width;
  uint8_t pcie_link_speed;  /* in 0.1 GT/s */
} rsmi_gpu_metrics_t;
/// \cond Ignore in docs.
typedef rsmi_gpu_metrics_t rsmi_gpu_metrics;
/// \endcond
*/ @@ -2014,6 +2080,29 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv); +/** + * @brief This function retrieves the gpu metrics information + * + * @details Given a device index @p dv_ind and a pointer to a + * ::rsmi_gpu_metrics_t structure @p pgpu_metrics, this function will populate + * @p pgpu_metrics. See ::rsmi_gpu_metrics_t for more details. + * + * @param[in] dv_ind a device index + * + * @param[inout] pgpu_metrics a pointer to an ::rsmi_gpu_metrics_t structure + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, + * arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the + * provided arguments. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + */ +rsmi_status_t rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, + rsmi_gpu_metrics_t *pgpu_metrics); + /** * @brief This function sets the clock frequency information * diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index 2bc6738951..105e0cb44a 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -152,7 +152,8 @@ enum DevInfoTypes { kDevFwVersionVcn, kDevSerialNumber, kDevMemPageBad, - kDevNumaNode + kDevNumaNode, + kDevGpuMetrics }; typedef struct { @@ -175,6 +176,7 @@ class Device { int readDevInfoLine(DevInfoTypes type, std::string *line); int readDevInfo(DevInfoTypes type, std::string *val); int readDevInfo(DevInfoTypes type, std::vector *retVec); + int readDevInfo(DevInfoTypes type, std::vector *retVec); int writeDevInfo(DevInfoTypes type, uint64_t val); int writeDevInfo(DevInfoTypes type, std::string val); @@ -214,6 
+216,8 @@ class Device { int readDevInfoStr(DevInfoTypes type, std::string *retStr); int readDevInfoMultiLineStr(DevInfoTypes type, std::vector *retVec); + int readDevInfoBinary(DevInfoTypes type, + std::vector *retVec); int writeDevInfoStr(DevInfoTypes type, std::string valStr); uint64_t bdfid_; uint64_t kfd_gpu_id_; diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index 89c3ac4b7d..c67bf41981 100755 --- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -80,6 +80,9 @@ rsmi_status_t handleException(); rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type, uint32_t dv_ind, std::vector *val_vec); +rsmi_status_t +GetDevBinaryVec(amd::smi::DevInfoTypes type, + uint32_t dv_ind, std::vector *val_vec); rsmi_status_t ErrnoToRsmiStatus(uint32_t err); struct pthread_wrap { diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 2d97cd23fe..df0c941310 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -408,6 +408,7 @@ int main() { rsmi_dev_perf_level_t pfl; rsmi_frequencies_t f; uint32_t num_monitor_devs = 0; + rsmi_gpu_metrics_t p; rsmi_num_monitor_devices(&num_monitor_devs); for (uint32_t i = 0; i< num_monitor_devs; ++i) { @@ -415,6 +416,10 @@ int main() { CHK_RSMI_RET(ret) std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl; + ret = rsmi_dev_gpu_metrics_info_get(i, &p); + CHK_RSMI_RET(ret) + std::cout << "\t**GPU METRICS" << std::endl; + ret = rsmi_dev_perf_level_get(i, &pfl); CHK_RSMI_RET(ret) std::cout << "\t**Performance Level:" << diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 60451f0865..acbaf6e3e9 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -1033,7 +1033,6 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, CATCH } - rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, uint64_t clkvalue, rsmi_clk_type_t clkType) { @@ -2158,6 +2157,29 @@ 
rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { CATCH } +rsmi_status_t +rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { + TRY + DEVICE_MUTEX + CHK_SUPPORT_NAME_ONLY(smu) + + std::vector val_vec; + rsmi_status_t ret = GetDevBinaryVec(amd::smi::kDevGpuMetrics, dv_ind, + &val_vec); + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + if (val_vec.size() == 0) { + return RSMI_STATUS_NOT_YET_IMPLEMENTED; + } + + smu = reinterpret_cast(val_vec.data()); + + return ret; + CATCH +} + rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) { TRY diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index b0daea3edb..c8a430ee06 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -57,6 +57,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" @@ -106,6 +107,7 @@ static const char *kDevMemBusyPercentFName = "mem_busy_percent"; static const char *kDevXGMIErrorFName = "xgmi_error"; static const char *kDevSerialNumberFName = "serial_number"; static const char *kDevNumaNodeFName = "numa_node"; +static const char *kDevGpuMetricsFName = "gpu_metrics"; // Firmware version files static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version"; @@ -265,6 +267,7 @@ static const std::map kDevAttribNameMap = { {kDevSerialNumber, kDevSerialNumberFName}, {kDevMemPageBad, kDevMemPageBadFName}, {kDevNumaNode, kDevNumaNodeFName}, + {kDevGpuMetrics, kDevGpuMetricsFName}, }; static const std::map kDevPerfLvlMap = { @@ -375,6 +378,7 @@ static const std::map kDevFuncDependsMap = { {"rsmi_dev_xgmi_error_reset", {{kDevXGMIErrorFName}, {}}}, {"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}}, {"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}}, + {"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}}, // These functions with variants, but no sensors/units. 
(May or may not // have mandatory dependencies.) @@ -624,6 +628,24 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) { return 0; } +int Device::readDevInfoBinary(DevInfoTypes type, + std::vector *retVec) { + auto sysfs_path = path_; + + sysfs_path += "/device/"; + sysfs_path += kDevAttribNameMap.at(type); + + std::ifstream fs(sysfs_path, std::ios::binary); + if (!fs.is_open()) { + return errno; + } + // copies all data into buffer + retVec->insert(retVec->begin(), + std::istreambuf_iterator(fs),{}); + + return 0; +} + int Device::readDevInfoMultiLineStr(DevInfoTypes type, std::vector *retVec) { std::string line; @@ -754,6 +776,21 @@ int Device::readDevInfo(DevInfoTypes type, std::vector *val) { return 0; } +int Device::readDevInfo(DevInfoTypes type, std::vector *val) { + assert(val != nullptr); + + switch (type) { + case kDevGpuMetrics: + return readDevInfoBinary(type, val); + break; + + default: + return EINVAL; + } + + return 0; +} + int Device::readDevInfo(DevInfoTypes type, std::string *val) { assert(val != nullptr); diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 91dadb6c33..273e547417 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -204,6 +204,18 @@ rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type, return ErrnoToRsmiStatus(ret); } +rsmi_status_t GetDevBinaryVec(amd::smi::DevInfoTypes type, + uint32_t dv_ind, std::vector *val_vec) { + assert(val_vec != nullptr); + if (val_vec == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + GET_DEV_FROM_INDX + + int ret = dev->readDevInfo(type, val_vec); + return ErrnoToRsmiStatus(ret); +} + rsmi_status_t ErrnoToRsmiStatus(uint32_t err) { switch (err) { case 0: return RSMI_STATUS_SUCCESS;