Adding functionality that will parse gpu_metrics sysfs file

Signed-off-by: Divya Shikre <DivyaUday.Shikre@amd.com>
Change-Id: I3a84870b83eb4cd0ed46f10bb19169c91f99fd8e


[ROCm/amdsmi commit: 8b48564ce3]
Этот коммит содержится в:
Divya Shikre
2020-09-24 16:06:13 -04:00
коммит произвёл Divya Uday Shikre
родитель e775527ffc
Коммит 5cddbccec6
7 изменённых файлов: 174 добавлений и 2 удалений
+89
Просмотреть файл
@@ -757,6 +757,72 @@ typedef struct {
typedef rsmi_od_volt_freq_data_t rsmi_od_volt_freq_data;
/// \endcond
/**
* @brief The following structures hold the gpu metrics values for a device.
*/
// Header placed at the very start of rsmi_gpu_metrics_t (as its
// common_header member). It carries the size and revision information of
// the metrics table that follows it.
struct metrics_table_header {
// NOTE(review): presumably the size in bytes of the whole metrics table,
// header included -- confirm against the producer of the data.
uint16_t structure_size;
// NOTE(review): presumably identifies the table layout version -- confirm.
uint8_t format_revision;
// NOTE(review): presumably identifies the content version within a given
// format -- confirm.
uint8_t content_revision;
};
/**
* @brief GPU metrics values for a device, as returned through
* rsmi_dev_gpu_metrics_info_get().
*
* The raw bytes read for the gpu_metrics device attribute are interpreted
* directly as this structure, so the order, types and sizes of the members
* must not be changed.
*
* NOTE(review): units are only stated where the original comments give them;
* the remaining units should be confirmed against the driver that produces
* the data.
*/
typedef struct {
/* Size and revision information describing this table */
struct metrics_table_header common_header;
/* Driver attached timestamp (in ns) */
uint64_t system_clock_counter;
/* Temperature */
uint16_t temperature_edge;
uint16_t temperature_hotspot;
uint16_t temperature_mem;
uint16_t temperature_vrgfx;
uint16_t temperature_vrsoc;
uint16_t temperature_vrmem;
/* Utilization */
uint16_t average_gfx_activity;
uint16_t average_umc_activity; // memory controller
uint16_t average_mm_activity; // UVD or VCN
/* Power/Energy */
uint16_t average_socket_power;
uint32_t energy_accumulator;
/* Average clocks */
uint16_t average_gfxclk_frequency;
uint16_t average_socclk_frequency;
uint16_t average_uclk_frequency;
uint16_t average_vclk0_frequency;
uint16_t average_dclk0_frequency;
uint16_t average_vclk1_frequency;
uint16_t average_dclk1_frequency;
/* Current clocks */
uint16_t current_gfxclk;
uint16_t current_socclk;
uint16_t current_uclk;
uint16_t current_vclk0;
uint16_t current_dclk0;
uint16_t current_vclk1;
uint16_t current_dclk1;
/* Throttle status */
// NOTE(review): presumably a bit mask of active throttlers -- confirm.
uint32_t throttle_status;
/* Fans */
uint16_t current_fan_speed;
/* Link width/speed */
uint8_t pcie_link_width;
uint8_t pcie_link_speed; // in 0.1 GT/s
}rsmi_gpu_metrics_t;
/// \cond Ignore in docs.
// Alias of rsmi_gpu_metrics_t without the _t suffix, hidden from the
// generated documentation by the surrounding \cond/\endcond pair.
typedef rsmi_gpu_metrics_t rsmi_gpu_metrics;
/// \endcond
/**
* @brief This structure holds error counts.
*/
@@ -2014,6 +2080,29 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind,
rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind,
rsmi_od_volt_freq_data_t *odv);
/**
* @brief This function retrieves the gpu metrics information
*
* @details Given a device index @p dv_ind and a pointer to a
* ::rsmi_gpu_metrics_t structure @p pgpu_metrics, this function will populate
* @p pgpu_metrics. See ::rsmi_gpu_metrics_t for more details.
*
* @param[in] dv_ind a device index
*
* @param[inout] pgpu_metrics a pointer to an ::rsmi_gpu_metrics_t structure.
* If this parameter is nullptr, this function will return
* ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided
* arguments, and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the
* provided arguments.
*
* @retval ::RSMI_STATUS_SUCCESS call was successful
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function with the given arguments
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*/
rsmi_status_t rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind,
rsmi_gpu_metrics_t *pgpu_metrics);
/**
* @brief This function sets the clock frequency information
*
+5 -1
Просмотреть файл
@@ -152,7 +152,8 @@ enum DevInfoTypes {
kDevFwVersionVcn,
kDevSerialNumber,
kDevMemPageBad,
kDevNumaNode
kDevNumaNode,
kDevGpuMetrics
};
typedef struct {
@@ -175,6 +176,7 @@ class Device {
int readDevInfoLine(DevInfoTypes type, std::string *line);
int readDevInfo(DevInfoTypes type, std::string *val);
int readDevInfo(DevInfoTypes type, std::vector<std::string> *retVec);
int readDevInfo(DevInfoTypes type, std::vector<unsigned char> *retVec);
int writeDevInfo(DevInfoTypes type, uint64_t val);
int writeDevInfo(DevInfoTypes type, std::string val);
@@ -214,6 +216,8 @@ class Device {
int readDevInfoStr(DevInfoTypes type, std::string *retStr);
int readDevInfoMultiLineStr(DevInfoTypes type,
std::vector<std::string> *retVec);
int readDevInfoBinary(DevInfoTypes type,
std::vector<unsigned char> *retVec);
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
uint64_t bdfid_;
uint64_t kfd_gpu_id_;
+3
Просмотреть файл
@@ -80,6 +80,9 @@ rsmi_status_t handleException();
rsmi_status_t
GetDevValueVec(amd::smi::DevInfoTypes type,
uint32_t dv_ind, std::vector<std::string> *val_vec);
// Reads the device attribute identified by `type` for the device at index
// `dv_ind` as raw bytes into `val_vec`. The errno-style result of the read
// is translated to an rsmi_status_t; a null `val_vec` yields
// RSMI_STATUS_INVALID_ARGS.
rsmi_status_t
GetDevBinaryVec(amd::smi::DevInfoTypes type,
uint32_t dv_ind, std::vector<unsigned char> *val_vec);
rsmi_status_t ErrnoToRsmiStatus(uint32_t err);
struct pthread_wrap {
+5
Просмотреть файл
@@ -408,6 +408,7 @@ int main() {
rsmi_dev_perf_level_t pfl;
rsmi_frequencies_t f;
uint32_t num_monitor_devs = 0;
rsmi_gpu_metrics_t p;
rsmi_num_monitor_devices(&num_monitor_devs);
for (uint32_t i = 0; i< num_monitor_devs; ++i) {
@@ -415,6 +416,10 @@ int main() {
CHK_RSMI_RET(ret)
std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl;
ret = rsmi_dev_gpu_metrics_info_get(i, &p);
CHK_RSMI_RET(ret)
std::cout << "\t**GPU METRICS" << std::endl;
ret = rsmi_dev_perf_level_get(i, &pfl);
CHK_RSMI_RET(ret)
std::cout << "\t**Performance Level:" <<
+23 -1
Просмотреть файл
@@ -1033,7 +1033,6 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
CATCH
}
rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level,
uint64_t clkvalue,
rsmi_clk_type_t clkType) {
@@ -2158,6 +2157,29 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) {
CATCH
}
// Retrieves the gpu metrics of the device at index dv_ind.
//
// The raw bytes of the gpu_metrics device attribute are read and copied into
// the caller-provided structure pointed to by smu.
//
// Parameters:
//   dv_ind  index of the device to query
//   smu     destination structure; a null pointer is expected to be rejected
//           by CHK_SUPPORT_NAME_ONLY (NOTE(review): confirm against the
//           macro definition)
//
// Returns:
//   RSMI_STATUS_SUCCESS              the metrics were copied into *smu
//   RSMI_STATUS_NOT_YET_IMPLEMENTED  the attribute was read but was empty
//   any other status                 as reported by GetDevBinaryVec()
rsmi_status_t
rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
  TRY
  DEVICE_MUTEX
  CHK_SUPPORT_NAME_ONLY(smu)

  std::vector<unsigned char> val_vec;
  rsmi_status_t ret = GetDevBinaryVec(amd::smi::kDevGpuMetrics, dv_ind,
                                      &val_vec);
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }
  if (val_vec.empty()) {
    return RSMI_STATUS_NOT_YET_IMPLEMENTED;
  }

  // Copy the bytes into the caller's structure. Assigning to `smu` itself
  // would only change the local copy of the pointer (and leave it pointing
  // into val_vec, which is destroyed on return), so the caller would never
  // see the data.
  //
  // The copy is bounded by the size of the destination: surplus bytes are
  // ignored, and if fewer bytes were read the remainder is zero-filled so
  // that no indeterminate values are handed back.
  unsigned char *dst = reinterpret_cast<unsigned char *>(smu);
  for (size_t i = 0; i < sizeof(rsmi_gpu_metrics_t); ++i) {
    dst[i] = (i < val_vec.size()) ? val_vec[i] : 0;
  }

  return ret;
  CATCH
}
rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind,
uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) {
TRY
+37
Просмотреть файл
@@ -57,6 +57,7 @@
#include <vector>
#include <memory>
#include <algorithm>
#include <iterator>
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
@@ -106,6 +107,7 @@ static const char *kDevMemBusyPercentFName = "mem_busy_percent";
static const char *kDevXGMIErrorFName = "xgmi_error";
static const char *kDevSerialNumberFName = "serial_number";
static const char *kDevNumaNodeFName = "numa_node";
static const char *kDevGpuMetricsFName = "gpu_metrics";
// Firmware version files
static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version";
@@ -265,6 +267,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevSerialNumber, kDevSerialNumberFName},
{kDevMemPageBad, kDevMemPageBadFName},
{kDevNumaNode, kDevNumaNodeFName},
{kDevGpuMetrics, kDevGpuMetricsFName},
};
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
@@ -375,6 +378,7 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
{"rsmi_dev_xgmi_error_reset", {{kDevXGMIErrorFName}, {}}},
{"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}},
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
// These functions with variants, but no sensors/units. (May or may not
// have mandatory dependencies.)
@@ -624,6 +628,24 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) {
return 0;
}
// Reads the whole content of the device attribute file for `type` as raw
// bytes and inserts them at the front of *retVec.
//
// The file is looked up under "<path_>/device/" using the name registered
// for `type` in kDevAttribNameMap.
//
// Returns 0 on success, or the current errno value if the file could not be
// opened.
int Device::readDevInfoBinary(DevInfoTypes type,
                              std::vector<unsigned char> *retVec) {
  auto attr_file = path_;
  attr_file += "/device/";
  attr_file += kDevAttribNameMap.at(type);

  std::ifstream in(attr_file, std::ios::binary);
  if (in.is_open()) {
    // Stream every byte of the file into the buffer; a default-constructed
    // istreambuf_iterator marks the end of the stream.
    std::istreambuf_iterator<char> first(in);
    std::istreambuf_iterator<char> last;
    retVec->insert(retVec->begin(), first, last);
    return 0;
  }

  return errno;
}
int Device::readDevInfoMultiLineStr(DevInfoTypes type,
std::vector<std::string> *retVec) {
std::string line;
@@ -754,6 +776,21 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
return 0;
}
// Reads a device attribute that is exposed as raw binary data into *val.
//
// kDevGpuMetrics is the only type handled by this overload; it is forwarded
// to readDevInfoBinary() and that call's result is returned. Every other
// type yields EINVAL.
int Device::readDevInfo(DevInfoTypes type, std::vector<unsigned char> *val) {
  assert(val != nullptr);

  if (type == kDevGpuMetrics) {
    return readDevInfoBinary(type, val);
  }

  return EINVAL;
}
int Device::readDevInfo(DevInfoTypes type, std::string *val) {
assert(val != nullptr);
+12
Просмотреть файл
@@ -204,6 +204,18 @@ rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type,
return ErrnoToRsmiStatus(ret);
}
// Reads the binary device attribute `type` of the device at index `dv_ind`
// into *val_vec and converts the errno-style result to an rsmi_status_t.
//
// A null val_vec trips the assert in debug builds and otherwise returns
// RSMI_STATUS_INVALID_ARGS.
rsmi_status_t GetDevBinaryVec(amd::smi::DevInfoTypes type,
                uint32_t dv_ind, std::vector<unsigned char> *val_vec) {
  assert(val_vec != nullptr);
  if (!val_vec) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  // NOTE(review): this macro is expected to provide `dev` for dv_ind --
  // confirm against its definition.
  GET_DEV_FROM_INDX

  const int read_result = dev->readDevInfo(type, val_vec);
  return ErrnoToRsmiStatus(read_result);
}
rsmi_status_t ErrnoToRsmiStatus(uint32_t err) {
switch (err) {
case 0: return RSMI_STATUS_SUCCESS;