Handle different gpu_metrics content versions for format v1
Change-Id: I344d1815da683befc8f8b5caf921803b267ae29f
Этот коммит содержится в:
@@ -802,14 +802,25 @@ struct metrics_table_header_t {
|
||||
/**
|
||||
* @brief The following structure holds the gpu metrics values for a device.
|
||||
*/
|
||||
// Below is the assumed version of gpu_metric data on the device. If the device
|
||||
// is using this version, we can read data directly into rsmi_gpu_metrics_t.
|
||||
// If the device is using an older format, a conversion of formats will be
|
||||
// required.
|
||||
// DGPU targets have a format version of 1. APU targets have a format version of
|
||||
// 2. Currently, only version 1 (DGPU) gpu_metrics is supported.
|
||||
#define RSMI_GPU_METRICS_API_FORMAT_VER 1
|
||||
// The content version increments when gpu_metrics is extended with new fields
|
||||
// and/or when the sizes of existing fields are changed.
|
||||
#define RSMI_GPU_METRICS_API_CONTENT_VER 1
|
||||
|
||||
// This should match NUM_HBM_INSTANCES
|
||||
#define RSMI_NUM_HBM_INSTANCES 4
|
||||
|
||||
typedef struct {
|
||||
// TODO(amd) Doxygen documents
|
||||
/// \cond Ignore in docs.
|
||||
struct metrics_table_header_t common_header;
|
||||
|
||||
/* Driver attached timestamp (in ns) */
|
||||
uint64_t system_clock_counter;
|
||||
|
||||
/* Temperature */
|
||||
uint16_t temperature_edge;
|
||||
uint16_t temperature_hotspot;
|
||||
@@ -825,7 +836,10 @@ typedef struct {
|
||||
|
||||
/* Power/Energy */
|
||||
uint16_t average_socket_power;
|
||||
uint32_t energy_accumulator;
|
||||
uint64_t energy_accumulator; // v1 mod. (32->64)
|
||||
|
||||
/* Driver attached timestamp (in ns) */
|
||||
uint64_t system_clock_counter; // v1 mod. (moved from top of struct)
|
||||
|
||||
/* Average clocks */
|
||||
uint16_t average_gfxclk_frequency;
|
||||
@@ -852,8 +866,14 @@ typedef struct {
|
||||
uint16_t current_fan_speed;
|
||||
|
||||
/* Link width/speed */
|
||||
uint8_t pcie_link_width;
|
||||
uint8_t pcie_link_speed; // in 0.1 GT/s
|
||||
uint16_t pcie_link_width; // v1 mod.(8->16)
|
||||
uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16)
|
||||
|
||||
uint16_t padding; // new in v1
|
||||
|
||||
uint32_t gfx_activity_acc; // new in v1
|
||||
uint32_t mem_actvity_acc; // new in v1
|
||||
uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1
|
||||
/// \endcond
|
||||
} rsmi_gpu_metrics_t;
|
||||
|
||||
|
||||
@@ -204,6 +204,7 @@ class Device {
|
||||
// Stores fd in evt_notif_anon_fd_, converting the unsigned argument to the
// signed int type of the member.
void set_evt_notif_anon_fd(uint32_t fd) {
  const int converted_fd = static_cast<int>(fd);
  evt_notif_anon_fd_ = converted_fd;
}
|
||||
// Returns the current value of evt_notif_anon_fd_.
int evt_notif_anon_fd(void) const {
  return evt_notif_anon_fd_;
}
|
||||
// Returns a mutable reference to gpu_metrics_ver_, so callers can both read
// the stored header and write into it in place.
metrics_table_header_t & gpu_metrics_ver(void) {
  return gpu_metrics_ver_;
}
|
||||
void fillSupportedFuncs(void);
|
||||
void DumpSupportedFunctions(void);
|
||||
bool DeviceAPISupported(std::string name, uint64_t variant,
|
||||
@@ -237,6 +238,7 @@ class Device {
|
||||
|
||||
int evt_notif_anon_fd_;
|
||||
FILE *evt_notif_anon_file_ptr_;
|
||||
struct metrics_table_header_t gpu_metrics_ver_;
|
||||
};
|
||||
|
||||
} // namespace smi
|
||||
|
||||
+160
-5
@@ -2217,19 +2217,174 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) {
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
// Put definitions of old gpu_metrics formats here
|
||||
typedef struct {
|
||||
struct metrics_table_header_t common_header;
|
||||
|
||||
/* Driver attached timestamp (in ns) */
|
||||
uint64_t system_clock_counter;
|
||||
|
||||
/* Temperature */
|
||||
uint16_t temperature_edge;
|
||||
uint16_t temperature_hotspot;
|
||||
uint16_t temperature_mem;
|
||||
uint16_t temperature_vrgfx;
|
||||
uint16_t temperature_vrsoc;
|
||||
uint16_t temperature_vrmem;
|
||||
|
||||
/* Utilization */
|
||||
uint16_t average_gfx_activity;
|
||||
uint16_t average_umc_activity; // memory controller
|
||||
uint16_t average_mm_activity; // UVD or VCN
|
||||
|
||||
/* Power/Energy */
|
||||
uint16_t average_socket_power;
|
||||
uint32_t energy_accumulator;
|
||||
|
||||
/* Average clocks */
|
||||
uint16_t average_gfxclk_frequency;
|
||||
uint16_t average_socclk_frequency;
|
||||
uint16_t average_uclk_frequency;
|
||||
uint16_t average_vclk0_frequency;
|
||||
uint16_t average_dclk0_frequency;
|
||||
uint16_t average_vclk1_frequency;
|
||||
uint16_t average_dclk1_frequency;
|
||||
|
||||
/* Current clocks */
|
||||
uint16_t current_gfxclk;
|
||||
uint16_t current_socclk;
|
||||
uint16_t current_uclk;
|
||||
uint16_t current_vclk0;
|
||||
uint16_t current_dclk0;
|
||||
uint16_t current_vclk1;
|
||||
uint16_t current_dclk1;
|
||||
|
||||
/* Throttle status */
|
||||
uint32_t throttle_status;
|
||||
|
||||
/* Fans */
|
||||
uint16_t current_fan_speed;
|
||||
|
||||
/* Link width/speed */
|
||||
uint8_t pcie_link_width;
|
||||
uint8_t pcie_link_speed; // in 0.1 GT/s
|
||||
} rsmi_gpu_metrics_v_1_0_t;
|
||||
|
||||
|
||||
// Reads a format-1 gpu_metrics blob whose content revision differs from
// RSMI_GPU_METRICS_API_CONTENT_VER and converts it into the current
// rsmi_gpu_metrics_t layout.
//
// dv_ind     index of the device to query
// data       destination structure; filled in only on success
// content_v  content revision reported by the device
//
// Returns:
//   RSMI_STATUS_SUCCESS        the blob was read and converted
//   RSMI_STATUS_INVALID_ARGS   data is null, or content_v is already the
//                              current revision (no conversion is needed; the
//                              caller should read the blob directly)
//   RSMI_STATUS_NOT_SUPPORTED  no converter exists for content_v
//   any error returned by GetDevBinaryBlob()
static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
                              rsmi_gpu_metrics_t *data, uint8_t content_v) {
  assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER);
  if (content_v == RSMI_GPU_METRICS_API_CONTENT_VER) {
    // This function shouldn't be called if content version is
    // RSMI_GPU_METRICS_API_CONTENT_VER.
    return RSMI_STATUS_INVALID_ARGS;
  }
  if (data == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  // The content revision comes from the device at run time, so an unknown
  // value is an ordinary runtime condition rather than a programming error.
  // Report it instead of asserting; with NDEBUG an assert-only guard would
  // let execution continue with a null buffer and an uninitialized size.
  if (content_v != 0) {
    // Add handling of other conversions to the current layout here.
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  // Content revision 0: read the old layout, then translate it.
  rsmi_gpu_metrics_v_1_0_t metric_data_v_1_0;
  const rsmi_status_t ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
                        sizeof(rsmi_gpu_metrics_v_1_0_t), &metric_data_v_1_0);
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }
  const rsmi_gpu_metrics_v_1_0_t &src = metric_data_v_1_0;

  // Fields that have the same name and type in both layouts.
  data->common_header = src.common_header;
  data->temperature_edge = src.temperature_edge;
  data->temperature_hotspot = src.temperature_hotspot;
  data->temperature_mem = src.temperature_mem;
  data->temperature_vrgfx = src.temperature_vrgfx;
  data->temperature_vrsoc = src.temperature_vrsoc;
  data->temperature_vrmem = src.temperature_vrmem;
  data->average_gfx_activity = src.average_gfx_activity;
  data->average_umc_activity = src.average_umc_activity;
  data->average_mm_activity = src.average_mm_activity;
  data->average_socket_power = src.average_socket_power;
  data->system_clock_counter = src.system_clock_counter;
  data->average_gfxclk_frequency = src.average_gfxclk_frequency;
  data->average_socclk_frequency = src.average_socclk_frequency;
  data->average_uclk_frequency = src.average_uclk_frequency;
  data->average_vclk0_frequency = src.average_vclk0_frequency;
  data->average_dclk0_frequency = src.average_dclk0_frequency;
  data->average_vclk1_frequency = src.average_vclk1_frequency;
  data->average_dclk1_frequency = src.average_dclk1_frequency;
  data->current_gfxclk = src.current_gfxclk;
  data->current_socclk = src.current_socclk;
  data->current_uclk = src.current_uclk;
  data->current_vclk0 = src.current_vclk0;
  data->current_dclk0 = src.current_dclk0;
  data->current_vclk1 = src.current_vclk1;
  data->current_dclk1 = src.current_dclk1;
  data->throttle_status = src.throttle_status;
  data->current_fan_speed = src.current_fan_speed;

  // Fields that were widened in the current layout (lossless conversions).
  data->energy_accumulator = static_cast<uint64_t>(src.energy_accumulator);
  data->pcie_link_width = static_cast<uint16_t>(src.pcie_link_width);
  data->pcie_link_speed = static_cast<uint16_t>(src.pcie_link_speed);

  // Fields that do not exist in content revision 0: give them a defined
  // value instead of leaving whatever the caller's buffer contained.
  data->padding = 0;
  data->gfx_activity_acc = 0;
  data->mem_actvity_acc = 0;
  (void)memset(data->temperature_hbm, 0, sizeof(data->temperature_hbm));

  return RSMI_STATUS_SUCCESS;
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
|
||||
TRY
|
||||
DEVICE_MUTEX
|
||||
CHK_SUPPORT_NAME_ONLY(smu)
|
||||
rsmi_status_t ret;
|
||||
|
||||
rsmi_status_t ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
if (!dev->gpu_metrics_ver().structure_size) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver());
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
// only supports gpu_metrics_v1_x version
|
||||
if (dev->gpu_metrics_ver().format_revision != 1) {
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
} else { // format_revision == 1
|
||||
if (dev->gpu_metrics_ver().content_revision ==
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
sizeof(rsmi_gpu_metrics_t), smu);
|
||||
|
||||
// only supports gpu_metrics_v1_0 version
|
||||
if (smu->common_header.format_revision != 1) {
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
} else {
|
||||
ret = GetGPUMetricsFormat1(dv_ind, smu,
|
||||
dev->gpu_metrics_ver().content_revision);
|
||||
}
|
||||
}
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
|
||||
@@ -482,8 +482,9 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
|
||||
if (X) return X; \
|
||||
}
|
||||
|
||||
Device::Device(std::string p, RocmSMI_env_vars const *e) : monitor_(nullptr),
|
||||
path_(p), env_(e), evt_notif_anon_fd_(-1) {
|
||||
Device::Device(std::string p, RocmSMI_env_vars const *e) :
|
||||
monitor_(nullptr), path_(p), env_(e), evt_notif_anon_fd_(-1),
|
||||
gpu_metrics_ver_{0, 0, 0} {
|
||||
#ifdef NDEBUG
|
||||
env_ = nullptr;
|
||||
#endif
|
||||
|
||||
@@ -111,75 +111,84 @@ void TestGpuMetricsRead::Run(void) {
|
||||
"Not supported on this machine" << std::endl;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
CHK_ERR_ASRT(err);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << std::dec << "system_clock_counter="
|
||||
<< smu.system_clock_counter << '\n';
|
||||
std::cout << std::dec << "temperature_edge="
|
||||
<< smu.temperature_edge << '\n';
|
||||
std::cout << std::dec << "temperature_hotspot="
|
||||
<< smu.temperature_hotspot << '\n';
|
||||
std::cout << std::dec << "temperature_mem="
|
||||
<< smu.temperature_mem << '\n';
|
||||
std::cout << std::dec << "temperature_vrgfx="
|
||||
<< smu.temperature_vrgfx << '\n';
|
||||
std::cout << std::dec << "temperature_vrsoc="
|
||||
<< smu.temperature_vrsoc << '\n';
|
||||
std::cout << std::dec << "temperature_vrmem="
|
||||
<< smu.temperature_vrmem << '\n';
|
||||
std::cout << std::dec << "average_gfx_activity="
|
||||
<< smu.average_gfx_activity << '\n';
|
||||
std::cout << std::dec << "average_umc_activity="
|
||||
<< smu.average_umc_activity << '\n';
|
||||
std::cout << std::dec << "average_mm_activity="
|
||||
<< smu.average_mm_activity << '\n';
|
||||
std::cout << std::dec << "average_socket_power="
|
||||
<< smu.average_socket_power << '\n';
|
||||
std::cout << std::dec << "energy_accumulator="
|
||||
<< smu.energy_accumulator << '\n';
|
||||
std::cout << std::dec << "average_gfxclk_frequency="
|
||||
<< smu.average_gfxclk_frequency << '\n';
|
||||
std::cout << std::dec << "average_gfxclk_frequency="
|
||||
<< smu.average_gfxclk_frequency << '\n';
|
||||
std::cout << std::dec << "average_uclk_frequency="
|
||||
<< smu.average_uclk_frequency << '\n';
|
||||
std::cout << std::dec << "average_vclk0_frequency="
|
||||
<< smu.average_vclk0_frequency << '\n';
|
||||
std::cout << std::dec << "average_dclk0_frequency="
|
||||
<< smu.average_dclk0_frequency << '\n';
|
||||
std::cout << std::dec << "average_dclk0_frequency="
|
||||
<< smu.average_dclk0_frequency << '\n';
|
||||
std::cout << std::dec << "average_dclk1_frequency="
|
||||
<< smu.average_dclk1_frequency << '\n';
|
||||
std::cout << std::dec << "current_gfxclk="
|
||||
<< smu.current_gfxclk << '\n';
|
||||
std::cout << std::dec << "current_socclk="
|
||||
<< smu.current_socclk << '\n';
|
||||
std::cout << std::dec << "current_uclk="
|
||||
<< smu.current_uclk << '\n';
|
||||
std::cout << std::dec << "current_vclk0="
|
||||
<< smu.current_vclk0 << '\n';
|
||||
std::cout << std::dec << "current_dclk0="
|
||||
<< smu.current_dclk0 << '\n';
|
||||
std::cout << std::dec << "current_vclk1="
|
||||
<< smu.current_vclk1 << '\n';
|
||||
std::cout << std::dec << "current_dclk1="
|
||||
<< smu.current_dclk1 << '\n';
|
||||
std::cout << std::dec << "throttle_status="
|
||||
<< smu.throttle_status << '\n';
|
||||
std::cout << std::dec << "current_fan_speed="
|
||||
<< smu.current_fan_speed << '\n';
|
||||
std::cout << "pcie_link_width="
|
||||
<< std::to_string(smu.pcie_link_width) << '\n';
|
||||
std::cout << "pcie_link_width="
|
||||
<< std::to_string(smu.pcie_link_speed) << '\n';
|
||||
}
|
||||
} else {
|
||||
CHK_ERR_ASRT(err);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << std::dec << "system_clock_counter="
|
||||
<< smu.system_clock_counter << '\n';
|
||||
std::cout << std::dec << "temperature_edge="
|
||||
<< smu.temperature_edge << '\n';
|
||||
std::cout << std::dec << "temperature_hotspot="
|
||||
<< smu.temperature_hotspot << '\n';
|
||||
std::cout << std::dec << "temperature_mem="
|
||||
<< smu.temperature_mem << '\n';
|
||||
std::cout << std::dec << "temperature_vrgfx="
|
||||
<< smu.temperature_vrgfx << '\n';
|
||||
std::cout << std::dec << "temperature_vrsoc="
|
||||
<< smu.temperature_vrsoc << '\n';
|
||||
std::cout << std::dec << "temperature_vrmem="
|
||||
<< smu.temperature_vrmem << '\n';
|
||||
std::cout << std::dec << "average_gfx_activity="
|
||||
<< smu.average_gfx_activity << '\n';
|
||||
std::cout << std::dec << "average_umc_activity="
|
||||
<< smu.average_umc_activity << '\n';
|
||||
std::cout << std::dec << "average_mm_activity="
|
||||
<< smu.average_mm_activity << '\n';
|
||||
std::cout << std::dec << "average_socket_power="
|
||||
<< smu.average_socket_power << '\n';
|
||||
std::cout << std::dec << "energy_accumulator="
|
||||
<< smu.energy_accumulator << '\n';
|
||||
std::cout << std::dec << "average_gfxclk_frequency="
|
||||
<< smu.average_gfxclk_frequency << '\n';
|
||||
std::cout << std::dec << "average_gfxclk_frequency="
|
||||
<< smu.average_gfxclk_frequency << '\n';
|
||||
std::cout << std::dec << "average_uclk_frequency="
|
||||
<< smu.average_uclk_frequency << '\n';
|
||||
std::cout << std::dec << "average_vclk0_frequency="
|
||||
<< smu.average_vclk0_frequency << '\n';
|
||||
std::cout << std::dec << "average_dclk0_frequency="
|
||||
<< smu.average_dclk0_frequency << '\n';
|
||||
std::cout << std::dec << "average_vclk1_frequency="
|
||||
<< smu.average_vclk1_frequency << '\n';
|
||||
std::cout << std::dec << "average_dclk1_frequency="
|
||||
<< smu.average_dclk1_frequency << '\n';
|
||||
std::cout << std::dec << "current_gfxclk="
|
||||
<< smu.current_gfxclk << '\n';
|
||||
std::cout << std::dec << "current_socclk="
|
||||
<< smu.current_socclk << '\n';
|
||||
std::cout << std::dec << "current_uclk="
|
||||
<< smu.current_uclk << '\n';
|
||||
std::cout << std::dec << "current_vclk0="
|
||||
<< smu.current_vclk0 << '\n';
|
||||
std::cout << std::dec << "current_dclk0="
|
||||
<< smu.current_dclk0 << '\n';
|
||||
std::cout << std::dec << "current_vclk1="
|
||||
<< smu.current_vclk1 << '\n';
|
||||
std::cout << std::dec << "current_dclk1="
|
||||
<< smu.current_dclk1 << '\n';
|
||||
std::cout << std::dec << "throttle_status="
|
||||
<< smu.throttle_status << '\n';
|
||||
std::cout << std::dec << "current_fan_speed="
|
||||
<< smu.current_fan_speed << '\n';
|
||||
std::cout << "pcie_link_width="
|
||||
<< std::to_string(smu.pcie_link_width) << '\n';
|
||||
std::cout << "pcie_link_width="
|
||||
<< std::to_string(smu.pcie_link_speed) << '\n';
|
||||
std::cout << "gfx_activity_acc="
|
||||
<< std::dec << smu.gfx_activity_acc << '\n';
|
||||
std::cout << "mem_actvity_acc="
|
||||
<< std::dec << smu.mem_actvity_acc << '\n';
|
||||
|
||||
for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) {
|
||||
std::cout << "temperature_hbm[" << i << "]=" << std::dec <<
|
||||
smu.temperature_hbm[i] << '\n';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_gpu_metrics_info_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user