Adding functionality that will parse gpu_metrics sysfs file
Signed-off-by: Divya Shikre <DivyaUday.Shikre@amd.com> Change-Id: I3a84870b83eb4cd0ed46f10bb19169c91f99fd8e
Этот коммит содержится в:
коммит произвёл
Divya Uday Shikre
родитель
3522e94ed0
Коммит
8b48564ce3
@@ -757,6 +757,72 @@ typedef struct {
|
||||
typedef rsmi_od_volt_freq_data_t rsmi_od_volt_freq_data;
|
||||
/// \endcond
|
||||
|
||||
|
||||
/**
|
||||
* @brief The following structures hold the gpu metrics values for a device.
|
||||
*/
|
||||
|
||||
/// Header found at the start of the gpu metrics table. It carries the
/// table's size and its format/content revision numbers.
struct metrics_table_header {
  uint16_t structure_size;
  uint8_t format_revision;
  uint8_t content_revision;
};

/// GPU metrics values for a device, populated by
/// rsmi_dev_gpu_metrics_info_get(). The member order and types define the
/// record layout, so they must not be rearranged.
typedef struct {
  /// Size and revision information for this table.
  struct metrics_table_header common_header;

  /// Driver attached timestamp (in ns).
  uint64_t system_clock_counter;

  /// Temperature readings.
  uint16_t temperature_edge;
  uint16_t temperature_hotspot;
  uint16_t temperature_mem;
  uint16_t temperature_vrgfx;
  uint16_t temperature_vrsoc;
  uint16_t temperature_vrmem;

  /// Utilization: gfx, umc (memory controller) and mm (UVD or VCN).
  uint16_t average_gfx_activity;
  uint16_t average_umc_activity;
  uint16_t average_mm_activity;

  /// Power and energy.
  uint16_t average_socket_power;
  uint32_t energy_accumulator;

  /// Average clock frequencies.
  uint16_t average_gfxclk_frequency;
  uint16_t average_socclk_frequency;
  uint16_t average_uclk_frequency;
  uint16_t average_vclk0_frequency;
  uint16_t average_dclk0_frequency;
  uint16_t average_vclk1_frequency;
  uint16_t average_dclk1_frequency;

  /// Current clocks.
  uint16_t current_gfxclk;
  uint16_t current_socclk;
  uint16_t current_uclk;
  uint16_t current_vclk0;
  uint16_t current_dclk0;
  uint16_t current_vclk1;
  uint16_t current_dclk1;

  /// Throttle status.
  uint32_t throttle_status;

  /// Fan speed.
  uint16_t current_fan_speed;

  /// PCIe link width, and link speed expressed in units of 0.1 GT/s.
  uint8_t pcie_link_width;
  uint8_t pcie_link_speed;
} rsmi_gpu_metrics_t;
|
||||
/// \cond Ignore in docs.
|
||||
typedef rsmi_gpu_metrics_t rsmi_gpu_metrics;
|
||||
/// \endcond
|
||||
|
||||
/**
|
||||
* @brief This structure holds error counts.
|
||||
*/
|
||||
@@ -2014,6 +2080,29 @@ rsmi_status_t rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind,
|
||||
rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind,
|
||||
rsmi_od_volt_freq_data_t *odv);
|
||||
|
||||
/**
|
||||
* @brief This function retrieves the gpu metrics information
|
||||
*
|
||||
* @details Given a device index @p dv_ind and a pointer to a
|
||||
* ::rsmi_gpu_metrics_t structure @p pgpu_metrics, this function will populate
|
||||
* @p pgpu_metrics. See ::rsmi_gpu_metrics_t for more details.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
* @param[inout] pgpu_metrics a pointer to an ::rsmi_gpu_metrics_t structure
|
||||
* If this parameter is nullptr, this function will return
|
||||
* ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided
|
||||
* arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the
|
||||
* provided arguments.
|
||||
*
|
||||
* @retval ::RSMI_STATUS_SUCCESS call was successful
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function with the given arguments
|
||||
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind,
|
||||
rsmi_gpu_metrics_t *pgpu_metrics);
|
||||
|
||||
/**
|
||||
* @brief This function sets the clock frequency information
|
||||
*
|
||||
|
||||
@@ -152,7 +152,8 @@ enum DevInfoTypes {
|
||||
kDevFwVersionVcn,
|
||||
kDevSerialNumber,
|
||||
kDevMemPageBad,
|
||||
kDevNumaNode
|
||||
kDevNumaNode,
|
||||
kDevGpuMetrics
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
@@ -175,6 +176,7 @@ class Device {
|
||||
int readDevInfoLine(DevInfoTypes type, std::string *line);
|
||||
int readDevInfo(DevInfoTypes type, std::string *val);
|
||||
int readDevInfo(DevInfoTypes type, std::vector<std::string> *retVec);
|
||||
int readDevInfo(DevInfoTypes type, std::vector<unsigned char> *retVec);
|
||||
int writeDevInfo(DevInfoTypes type, uint64_t val);
|
||||
int writeDevInfo(DevInfoTypes type, std::string val);
|
||||
|
||||
@@ -214,6 +216,8 @@ class Device {
|
||||
int readDevInfoStr(DevInfoTypes type, std::string *retStr);
|
||||
int readDevInfoMultiLineStr(DevInfoTypes type,
|
||||
std::vector<std::string> *retVec);
|
||||
int readDevInfoBinary(DevInfoTypes type,
|
||||
std::vector<unsigned char> *retVec);
|
||||
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
|
||||
uint64_t bdfid_;
|
||||
uint64_t kfd_gpu_id_;
|
||||
|
||||
@@ -80,6 +80,9 @@ rsmi_status_t handleException();
|
||||
rsmi_status_t
|
||||
GetDevValueVec(amd::smi::DevInfoTypes type,
|
||||
uint32_t dv_ind, std::vector<std::string> *val_vec);
|
||||
rsmi_status_t
|
||||
GetDevBinaryVec(amd::smi::DevInfoTypes type,
|
||||
uint32_t dv_ind, std::vector<unsigned char> *val_vec);
|
||||
rsmi_status_t ErrnoToRsmiStatus(uint32_t err);
|
||||
|
||||
struct pthread_wrap {
|
||||
|
||||
@@ -408,6 +408,7 @@ int main() {
|
||||
rsmi_dev_perf_level_t pfl;
|
||||
rsmi_frequencies_t f;
|
||||
uint32_t num_monitor_devs = 0;
|
||||
rsmi_gpu_metrics_t p;
|
||||
|
||||
rsmi_num_monitor_devices(&num_monitor_devs);
|
||||
for (uint32_t i = 0; i< num_monitor_devs; ++i) {
|
||||
@@ -415,6 +416,10 @@ int main() {
|
||||
CHK_RSMI_RET(ret)
|
||||
std::cout << "\t**Device ID: 0x" << std::hex << val_ui64 << std::endl;
|
||||
|
||||
ret = rsmi_dev_gpu_metrics_info_get(i, &p);
|
||||
CHK_RSMI_RET(ret)
|
||||
std::cout << "\t**GPU METRICS" << std::endl;
|
||||
|
||||
ret = rsmi_dev_perf_level_get(i, &pfl);
|
||||
CHK_RSMI_RET(ret)
|
||||
std::cout << "\t**Performance Level:" <<
|
||||
|
||||
+23
-1
@@ -1033,7 +1033,6 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
|
||||
CATCH
|
||||
}
|
||||
|
||||
|
||||
rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level,
|
||||
uint64_t clkvalue,
|
||||
rsmi_clk_type_t clkType) {
|
||||
@@ -2158,6 +2157,29 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) {
|
||||
CATCH
|
||||
}
|
||||
|
||||
/**
 * Reads the binary gpu_metrics data for device @p dv_ind and copies it into
 * the caller-provided structure @p smu.
 *
 * The bytes are fetched through GetDevBinaryVec() with
 * amd::smi::kDevGpuMetrics and then copied into @p smu. If fewer bytes than
 * sizeof(rsmi_gpu_metrics_t) were read, the remainder of @p smu is
 * zero-filled; any bytes beyond sizeof(rsmi_gpu_metrics_t) are ignored.
 *
 * @param[in] dv_ind a device index
 * @param[inout] smu structure that receives the metrics
 *
 * @return RSMI_STATUS_SUCCESS when data was copied into @p smu; the status
 * reported by GetDevBinaryVec() when the read failed;
 * RSMI_STATUS_NOT_YET_IMPLEMENTED when the read succeeded but produced no
 * data; RSMI_STATUS_INVALID_ARGS when @p smu is nullptr.
 */
rsmi_status_t
rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
  TRY
  DEVICE_MUTEX
  CHK_SUPPORT_NAME_ONLY(smu)

  // NOTE(review): CHK_SUPPORT_NAME_ONLY is expected to return early on a
  // null argument (see the API documentation); this guard keeps the copy
  // below safe even if it does not.
  if (smu == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  std::vector<unsigned char> val_vec;
  rsmi_status_t ret = GetDevBinaryVec(amd::smi::kDevGpuMetrics, dv_ind,
                                      &val_vec);
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

  if (val_vec.empty()) {
    return RSMI_STATUS_NOT_YET_IMPLEMENTED;
  }

  // Copy the raw bytes into the caller's structure. Assigning the cast
  // pointer to `smu` would only change the local parameter (and point it at
  // storage that dies with val_vec), leaving the caller's struct untouched.
  // A byte-wise copy also avoids alignment/aliasing problems with the
  // vector's buffer.
  unsigned char *dst = reinterpret_cast<unsigned char *>(smu);
  const size_t dst_len = sizeof(*smu);
  const size_t copy_len = val_vec.size() < dst_len ? val_vec.size() : dst_len;
  for (size_t i = 0; i < dst_len; ++i) {
    // Zero-fill whatever a short read did not provide.
    dst[i] = (i < copy_len) ? val_vec[i] : static_cast<unsigned char>(0);
  }

  return ret;
  CATCH
}
|
||||
|
||||
rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind,
|
||||
uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) {
|
||||
TRY
|
||||
|
||||
@@ -57,6 +57,7 @@
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
@@ -106,6 +107,7 @@ static const char *kDevMemBusyPercentFName = "mem_busy_percent";
|
||||
static const char *kDevXGMIErrorFName = "xgmi_error";
|
||||
static const char *kDevSerialNumberFName = "serial_number";
|
||||
static const char *kDevNumaNodeFName = "numa_node";
|
||||
static const char *kDevGpuMetricsFName = "gpu_metrics";
|
||||
|
||||
// Firmware version files
|
||||
static const char *kDevFwVersionAsdFName = "fw_version/asd_fw_version";
|
||||
@@ -265,6 +267,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
|
||||
{kDevSerialNumber, kDevSerialNumberFName},
|
||||
{kDevMemPageBad, kDevMemPageBadFName},
|
||||
{kDevNumaNode, kDevNumaNodeFName},
|
||||
{kDevGpuMetrics, kDevGpuMetricsFName},
|
||||
};
|
||||
|
||||
static const std::map<rsmi_dev_perf_level, const char *> kDevPerfLvlMap = {
|
||||
@@ -375,6 +378,7 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
|
||||
{"rsmi_dev_xgmi_error_reset", {{kDevXGMIErrorFName}, {}}},
|
||||
{"rsmi_dev_memory_reserved_pages_get", {{kDevMemPageBadFName}, {}}},
|
||||
{"rsmi_topo_numa_affinity_get", {{kDevNumaNodeFName}, {}}},
|
||||
{"rsmi_dev_gpu_metrics_info_get", {{kDevGpuMetricsFName}, {}}},
|
||||
|
||||
// These functions with variants, but no sensors/units. (May or may not
|
||||
// have mandatory dependencies.)
|
||||
@@ -624,6 +628,24 @@ int Device::readDevInfoLine(DevInfoTypes type, std::string *line) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Reads the whole content of the sysfs attribute file associated with
// `type` (looked up in kDevAttribNameMap, under "<path_>/device/") as raw
// bytes and inserts them at the front of *retVec.
//
// Returns 0 on success, or the current errno value if the file could not
// be opened. kDevAttribNameMap.at() throws if `type` has no entry.
int Device::readDevInfoBinary(DevInfoTypes type,
                              std::vector<unsigned char> *retVec) {
  auto attr_path = path_;
  attr_path += "/device/";
  attr_path += kDevAttribNameMap.at(type);

  std::ifstream attr_file(attr_path, std::ios::binary);
  if (!attr_file.is_open()) {
    return errno;
  }

  // Stream every byte of the file into the output buffer.
  std::istreambuf_iterator<char> first(attr_file);
  std::istreambuf_iterator<char> last;
  retVec->insert(retVec->begin(), first, last);

  return 0;
}
|
||||
|
||||
int Device::readDevInfoMultiLineStr(DevInfoTypes type,
|
||||
std::vector<std::string> *retVec) {
|
||||
std::string line;
|
||||
@@ -754,6 +776,21 @@ int Device::readDevInfo(DevInfoTypes type, std::vector<std::string> *val) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Binary overload of readDevInfo: fills *val with the raw bytes of the
// sysfs attribute identified by `type`.
//
// Only kDevGpuMetrics is handled here; it is forwarded to
// readDevInfoBinary() and that call's result is returned. Any other type
// yields EINVAL. `val` must not be null (checked with assert).
int Device::readDevInfo(DevInfoTypes type, std::vector<unsigned char> *val) {
  assert(val != nullptr);

  if (type == kDevGpuMetrics) {
    return readDevInfoBinary(type, val);
  }

  return EINVAL;
}
|
||||
|
||||
int Device::readDevInfo(DevInfoTypes type, std::string *val) {
|
||||
assert(val != nullptr);
|
||||
|
||||
|
||||
@@ -204,6 +204,18 @@ rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type,
|
||||
return ErrnoToRsmiStatus(ret);
|
||||
}
|
||||
|
||||
// Reads the binary sysfs attribute `type` of the device at index `dv_ind`
// into *val_vec and converts the errno-style result of
// Device::readDevInfo() to an rsmi_status_t via ErrnoToRsmiStatus().
//
// Returns RSMI_STATUS_INVALID_ARGS if val_vec is null (the assert catches
// this earlier in builds where asserts are enabled).
rsmi_status_t GetDevBinaryVec(amd::smi::DevInfoTypes type,
                uint32_t dv_ind, std::vector<unsigned char> *val_vec) {
  assert(val_vec != nullptr);
  if (!val_vec) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  // NOTE(review): this macro presumably resolves dv_ind and declares `dev`,
  // which is used below -- confirm against the macro definition.
  GET_DEV_FROM_INDX

  const int read_err = dev->readDevInfo(type, val_vec);
  return ErrnoToRsmiStatus(read_err);
}
|
||||
|
||||
rsmi_status_t ErrnoToRsmiStatus(uint32_t err) {
|
||||
switch (err) {
|
||||
case 0: return RSMI_STATUS_SUCCESS;
|
||||
|
||||
Ссылка в новой задаче
Block a user