Handle different gpu_metrics content versions for format v1

Change-Id: I344d1815da683befc8f8b5caf921803b267ae29f
Этот коммит содержится в:
Chris Freehill
2021-03-06 11:53:30 -06:00
родитель ce475b009c
Коммит 5e2a4f3a15
5 изменённых файлов: 264 добавлений и 77 удалений
+26 -6
Просмотреть файл
@@ -802,14 +802,25 @@ struct metrics_table_header_t {
/**
* @brief The following structure holds the gpu metrics values for a device.
*/
// Below is the assumed version of gpu_metric data on the device. If the device
// is using this version, we can read data directly into rsmi_gpu_metrics_t.
// If the device is using an older format, a conversion of formats will be
// required.
// DGPU targets have a format version of 1. APU targets have a format version of
// 2. Currently, only version 1 (DGPU) gpu_metrics is supported.
#define RSMI_GPU_METRICS_API_FORMAT_VER 1
// The content version increments when gpu_metrics is extended with new fields
// and/or the sizes of existing fields are changed.
#define RSMI_GPU_METRICS_API_CONTENT_VER 1
// This should match NUM_HBM_INSTANCES
#define RSMI_NUM_HBM_INSTANCES 4
typedef struct {
// TODO(amd) Doxygen documents
/// \cond Ignore in docs.
struct metrics_table_header_t common_header;
/* Driver attached timestamp (in ns) */
uint64_t system_clock_counter;
/* Temperature */
uint16_t temperature_edge;
uint16_t temperature_hotspot;
@@ -825,7 +836,10 @@ typedef struct {
/* Power/Energy */
uint16_t average_socket_power;
uint32_t energy_accumulator;
uint64_t energy_accumulator; // v1 mod. (32->64)
/* Driver attached timestamp (in ns) */
uint64_t system_clock_counter; // v1 mod. (moved from top of struct)
/* Average clocks */
uint16_t average_gfxclk_frequency;
@@ -852,8 +866,14 @@ typedef struct {
uint16_t current_fan_speed;
/* Link width/speed */
uint8_t pcie_link_width;
uint8_t pcie_link_speed; // in 0.1 GT/s
uint16_t pcie_link_width; // v1 mod.(8->16)
uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16)
uint16_t padding; // new in v1
uint32_t gfx_activity_acc; // new in v1
uint32_t mem_actvity_acc; // new in v1
uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1
/// \endcond
} rsmi_gpu_metrics_t;
+2
Просмотреть файл
@@ -204,6 +204,7 @@ class Device {
void set_evt_notif_anon_fd(uint32_t fd) {
evt_notif_anon_fd_ = static_cast<int>(fd);}
int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;}
metrics_table_header_t & gpu_metrics_ver(void) {return gpu_metrics_ver_;}
void fillSupportedFuncs(void);
void DumpSupportedFunctions(void);
bool DeviceAPISupported(std::string name, uint64_t variant,
@@ -237,6 +238,7 @@ class Device {
int evt_notif_anon_fd_;
FILE *evt_notif_anon_file_ptr_;
struct metrics_table_header_t gpu_metrics_ver_;
};
} // namespace smi
+160 -5
Просмотреть файл
@@ -2217,19 +2217,174 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) {
return ret;
CATCH
}
// Put definitions of old gpu_metrics formats here
// Legacy gpu_metrics layout used when a device reports format revision 1 with
// content revision 0 (see GetGPUMetricsFormat1(), which reads the device blob
// into this struct and converts it to rsmi_gpu_metrics_t).
// Compared to the current rsmi_gpu_metrics_t, this layout:
//   - places system_clock_counter directly after the header,
//   - has a 32-bit energy_accumulator (64-bit in the current layout),
//   - has 8-bit pcie_link_width/pcie_link_speed (16-bit in the current layout),
//   - has no padding, gfx_activity_acc, mem_actvity_acc or temperature_hbm.
// NOTE(review): field order and widths presumably must match the blob the
// driver exposes byte-for-byte; do not reorder or resize fields - confirm
// against the driver's gpu_metrics definition before changing anything here.
typedef struct {
struct metrics_table_header_t common_header;
/* Driver attached timestamp (in ns) */
uint64_t system_clock_counter;
/* Temperature */
uint16_t temperature_edge;
uint16_t temperature_hotspot;
uint16_t temperature_mem;
uint16_t temperature_vrgfx;
uint16_t temperature_vrsoc;
uint16_t temperature_vrmem;
/* Utilization */
uint16_t average_gfx_activity;
uint16_t average_umc_activity; // memory controller
uint16_t average_mm_activity; // UVD or VCN
/* Power/Energy */
uint16_t average_socket_power;
// 32 bits wide here; widened to uint64_t on conversion
uint32_t energy_accumulator;
/* Average clocks */
uint16_t average_gfxclk_frequency;
uint16_t average_socclk_frequency;
uint16_t average_uclk_frequency;
uint16_t average_vclk0_frequency;
uint16_t average_dclk0_frequency;
uint16_t average_vclk1_frequency;
uint16_t average_dclk1_frequency;
/* Current clocks */
uint16_t current_gfxclk;
uint16_t current_socclk;
uint16_t current_uclk;
uint16_t current_vclk0;
uint16_t current_dclk0;
uint16_t current_vclk1;
uint16_t current_dclk1;
/* Throttle status */
uint32_t throttle_status;
/* Fans */
uint16_t current_fan_speed;
/* Link width/speed */
// Both link fields are 8 bits wide here; widened to uint16_t on conversion
uint8_t pcie_link_width;
uint8_t pcie_link_speed; // in 0.1 GT/s
} rsmi_gpu_metrics_v_1_0_t;
// Reads a format-1 gpu_metrics blob whose content version differs from
// RSMI_GPU_METRICS_API_CONTENT_VER and converts it into the current
// rsmi_gpu_metrics_t layout.
//
// Parameters:
//   dv_ind    - device index passed through to GetDevBinaryBlob().
//   data      - [out] destination in the current API layout.
//   content_v - content revision reported by the device.
//
// Returns:
//   RSMI_STATUS_SUCCESS        - blob was read and converted.
//   RSMI_STATUS_INVALID_ARGS   - data is null, or content_v is the current
//                                content version (no conversion is needed;
//                                the caller should read directly instead).
//   RSMI_STATUS_NOT_SUPPORTED  - no conversion exists for content_v.
//   Any error returned by GetDevBinaryBlob().
static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
                        rsmi_gpu_metrics_t *data, uint8_t content_v) {
  assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER);
  if (content_v == RSMI_GPU_METRICS_API_CONTENT_VER) {
    // This function shouldn't be called if content version is
    // RSMI_GPU_METRICS_API_CONTENT_VER.
    return RSMI_STATUS_INVALID_ARGS;
  }
  if (data == nullptr) {
    return RSMI_STATUS_INVALID_ARGS;
  }

  void *metric_data = nullptr;
  size_t data_size = 0;
  // Zero-initialized so no indeterminate bytes are ever converted.
  rsmi_gpu_metrics_v_1_0_t metric_data_v_1_0 = {};

  if (content_v == 0) {
    metric_data = &metric_data_v_1_0;
    data_size = sizeof(rsmi_gpu_metrics_v_1_0_t);
  }  // else { ... handle other conversions to v1

  // The content version comes from the device, so an unknown value is a
  // runtime condition rather than a programming error. It must be rejected
  // in release builds too (an assert would be compiled out under NDEBUG and
  // the read below would then use a null buffer and an unset size).
  if (metric_data == nullptr) {
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  const rsmi_status_t ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
                                             data_size, metric_data);
  if (ret != RSMI_STATUS_SUCCESS) {
    return ret;
  }

// Copies one identically named field from SRC into *data.
#define ASSIGN_DATA_FIELD(FIELD, SRC) \
  data->FIELD = SRC->FIELD;

// Copies every field whose name and width are shared by the legacy layout
// and the current layout.
#define ASSIGN_COMMON_FORMATS(SRC) \
  ASSIGN_DATA_FIELD(common_header, (SRC)) \
  ASSIGN_DATA_FIELD(temperature_edge, (SRC)) \
  ASSIGN_DATA_FIELD(temperature_hotspot, (SRC)) \
  ASSIGN_DATA_FIELD(temperature_mem, (SRC)) \
  ASSIGN_DATA_FIELD(temperature_vrgfx, (SRC)) \
  ASSIGN_DATA_FIELD(temperature_vrsoc, (SRC)) \
  ASSIGN_DATA_FIELD(temperature_vrmem, (SRC)) \
  ASSIGN_DATA_FIELD(average_gfx_activity, (SRC)) \
  ASSIGN_DATA_FIELD(average_umc_activity, (SRC)) \
  ASSIGN_DATA_FIELD(average_mm_activity, (SRC)) \
  ASSIGN_DATA_FIELD(average_socket_power, (SRC)) \
  ASSIGN_DATA_FIELD(system_clock_counter, (SRC)) \
  ASSIGN_DATA_FIELD(average_gfxclk_frequency, (SRC)) \
  ASSIGN_DATA_FIELD(average_socclk_frequency, (SRC)) \
  ASSIGN_DATA_FIELD(average_uclk_frequency, (SRC)) \
  ASSIGN_DATA_FIELD(average_vclk0_frequency, (SRC)) \
  ASSIGN_DATA_FIELD(average_dclk0_frequency, (SRC)) \
  ASSIGN_DATA_FIELD(average_vclk1_frequency, (SRC)) \
  ASSIGN_DATA_FIELD(average_dclk1_frequency, (SRC)) \
  ASSIGN_DATA_FIELD(current_gfxclk, (SRC)) \
  ASSIGN_DATA_FIELD(current_socclk, (SRC)) \
  ASSIGN_DATA_FIELD(current_uclk, (SRC)) \
  ASSIGN_DATA_FIELD(current_vclk0, (SRC)) \
  ASSIGN_DATA_FIELD(current_dclk0, (SRC)) \
  ASSIGN_DATA_FIELD(current_vclk1, (SRC)) \
  ASSIGN_DATA_FIELD(current_dclk1, (SRC)) \
  ASSIGN_DATA_FIELD(throttle_status, (SRC)) \
  ASSIGN_DATA_FIELD(current_fan_speed, (SRC))

  // Now handle differences from format 1
  if (content_v == 0) {
    const rsmi_gpu_metrics_v_1_0_t *src_v0 =
        static_cast<const rsmi_gpu_metrics_v_1_0_t *>(metric_data);

    // First handle all data that is common to Format1 and other formats.
    // Note that common_header is copied verbatim from the device blob.
    ASSIGN_COMMON_FORMATS(src_v0)

    // Then, the differences: fields that are narrower in content version 0.
    data->energy_accumulator =
        static_cast<uint64_t>(src_v0->energy_accumulator);
    data->pcie_link_width = static_cast<uint16_t>(src_v0->pcie_link_width);
    data->pcie_link_speed = static_cast<uint16_t>(src_v0->pcie_link_speed);

    // These fields didn't exist in v0
    data->padding = 0;
    data->gfx_activity_acc = 0;
    data->mem_actvity_acc = 0;
    (void)memset(data->temperature_hbm, 0,
                 RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t));
  }  // else handle other conversions to format 1

#undef ASSIGN_DATA_FIELD
#undef ASSIGN_COMMON_FORMATS

  return RSMI_STATUS_SUCCESS;
}
rsmi_status_t
rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
TRY
DEVICE_MUTEX
CHK_SUPPORT_NAME_ONLY(smu)
rsmi_status_t ret;
rsmi_status_t ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
if (!dev->gpu_metrics_ver().structure_size) {
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver());
if (ret != RSMI_STATUS_SUCCESS) {
return ret;
}
}
// only supports gpu_metrics_v1_x version
if (dev->gpu_metrics_ver().format_revision != 1) {
return RSMI_STATUS_NOT_SUPPORTED;
} else { // format_revision == 1
if (dev->gpu_metrics_ver().content_revision ==
RSMI_GPU_METRICS_API_CONTENT_VER) {
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
sizeof(rsmi_gpu_metrics_t), smu);
// only supports gpu_metrics_v1_0 version
if (smu->common_header.format_revision != 1) {
return RSMI_STATUS_NOT_SUPPORTED;
} else {
ret = GetGPUMetricsFormat1(dv_ind, smu,
dev->gpu_metrics_ver().content_revision);
}
}
if (ret != RSMI_STATUS_SUCCESS) {
+3 -2
Просмотреть файл
@@ -482,8 +482,9 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
if (X) return X; \
}
Device::Device(std::string p, RocmSMI_env_vars const *e) : monitor_(nullptr),
path_(p), env_(e), evt_notif_anon_fd_(-1) {
Device::Device(std::string p, RocmSMI_env_vars const *e) :
monitor_(nullptr), path_(p), env_(e), evt_notif_anon_fd_(-1),
gpu_metrics_ver_{0, 0, 0} {
#ifdef NDEBUG
env_ = nullptr;
#endif
+73 -64
Просмотреть файл
@@ -111,75 +111,84 @@ void TestGpuMetricsRead::Run(void) {
"Not supported on this machine" << std::endl;
return;
}
} else {
CHK_ERR_ASRT(err);
IF_VERB(STANDARD) {
std::cout << std::dec << "system_clock_counter="
<< smu.system_clock_counter << '\n';
std::cout << std::dec << "temperature_edge="
<< smu.temperature_edge << '\n';
std::cout << std::dec << "temperature_hotspot="
<< smu.temperature_hotspot << '\n';
std::cout << std::dec << "temperature_mem="
<< smu.temperature_mem << '\n';
std::cout << std::dec << "temperature_vrgfx="
<< smu.temperature_vrgfx << '\n';
std::cout << std::dec << "temperature_vrsoc="
<< smu.temperature_vrsoc << '\n';
std::cout << std::dec << "temperature_vrmem="
<< smu.temperature_vrmem << '\n';
std::cout << std::dec << "average_gfx_activity="
<< smu.average_gfx_activity << '\n';
std::cout << std::dec << "average_umc_activity="
<< smu.average_umc_activity << '\n';
std::cout << std::dec << "average_mm_activity="
<< smu.average_mm_activity << '\n';
std::cout << std::dec << "average_socket_power="
<< smu.average_socket_power << '\n';
std::cout << std::dec << "energy_accumulator="
<< smu.energy_accumulator << '\n';
std::cout << std::dec << "average_gfxclk_frequency="
<< smu.average_gfxclk_frequency << '\n';
std::cout << std::dec << "average_gfxclk_frequency="
<< smu.average_gfxclk_frequency << '\n';
std::cout << std::dec << "average_uclk_frequency="
<< smu.average_uclk_frequency << '\n';
std::cout << std::dec << "average_vclk0_frequency="
<< smu.average_vclk0_frequency << '\n';
std::cout << std::dec << "average_dclk0_frequency="
<< smu.average_dclk0_frequency << '\n';
std::cout << std::dec << "average_dclk0_frequency="
<< smu.average_dclk0_frequency << '\n';
std::cout << std::dec << "average_dclk1_frequency="
<< smu.average_dclk1_frequency << '\n';
std::cout << std::dec << "current_gfxclk="
<< smu.current_gfxclk << '\n';
std::cout << std::dec << "current_socclk="
<< smu.current_socclk << '\n';
std::cout << std::dec << "current_uclk="
<< smu.current_uclk << '\n';
std::cout << std::dec << "current_vclk0="
<< smu.current_vclk0 << '\n';
std::cout << std::dec << "current_dclk0="
<< smu.current_dclk0 << '\n';
std::cout << std::dec << "current_vclk1="
<< smu.current_vclk1 << '\n';
std::cout << std::dec << "current_dclk1="
<< smu.current_dclk1 << '\n';
std::cout << std::dec << "throttle_status="
<< smu.throttle_status << '\n';
std::cout << std::dec << "current_fan_speed="
<< smu.current_fan_speed << '\n';
std::cout << "pcie_link_width="
<< std::to_string(smu.pcie_link_width) << '\n';
std::cout << "pcie_link_width="
<< std::to_string(smu.pcie_link_speed) << '\n';
}
} else {
CHK_ERR_ASRT(err);
IF_VERB(STANDARD) {
std::cout << std::dec << "system_clock_counter="
<< smu.system_clock_counter << '\n';
std::cout << std::dec << "temperature_edge="
<< smu.temperature_edge << '\n';
std::cout << std::dec << "temperature_hotspot="
<< smu.temperature_hotspot << '\n';
std::cout << std::dec << "temperature_mem="
<< smu.temperature_mem << '\n';
std::cout << std::dec << "temperature_vrgfx="
<< smu.temperature_vrgfx << '\n';
std::cout << std::dec << "temperature_vrsoc="
<< smu.temperature_vrsoc << '\n';
std::cout << std::dec << "temperature_vrmem="
<< smu.temperature_vrmem << '\n';
std::cout << std::dec << "average_gfx_activity="
<< smu.average_gfx_activity << '\n';
std::cout << std::dec << "average_umc_activity="
<< smu.average_umc_activity << '\n';
std::cout << std::dec << "average_mm_activity="
<< smu.average_mm_activity << '\n';
std::cout << std::dec << "average_socket_power="
<< smu.average_socket_power << '\n';
std::cout << std::dec << "energy_accumulator="
<< smu.energy_accumulator << '\n';
std::cout << std::dec << "average_gfxclk_frequency="
<< smu.average_gfxclk_frequency << '\n';
std::cout << std::dec << "average_gfxclk_frequency="
<< smu.average_gfxclk_frequency << '\n';
std::cout << std::dec << "average_uclk_frequency="
<< smu.average_uclk_frequency << '\n';
std::cout << std::dec << "average_vclk0_frequency="
<< smu.average_vclk0_frequency << '\n';
std::cout << std::dec << "average_dclk0_frequency="
<< smu.average_dclk0_frequency << '\n';
std::cout << std::dec << "average_vclk1_frequency="
<< smu.average_vclk1_frequency << '\n';
std::cout << std::dec << "average_dclk1_frequency="
<< smu.average_dclk1_frequency << '\n';
std::cout << std::dec << "current_gfxclk="
<< smu.current_gfxclk << '\n';
std::cout << std::dec << "current_socclk="
<< smu.current_socclk << '\n';
std::cout << std::dec << "current_uclk="
<< smu.current_uclk << '\n';
std::cout << std::dec << "current_vclk0="
<< smu.current_vclk0 << '\n';
std::cout << std::dec << "current_dclk0="
<< smu.current_dclk0 << '\n';
std::cout << std::dec << "current_vclk1="
<< smu.current_vclk1 << '\n';
std::cout << std::dec << "current_dclk1="
<< smu.current_dclk1 << '\n';
std::cout << std::dec << "throttle_status="
<< smu.throttle_status << '\n';
std::cout << std::dec << "current_fan_speed="
<< smu.current_fan_speed << '\n';
std::cout << "pcie_link_width="
<< std::to_string(smu.pcie_link_width) << '\n';
std::cout << "pcie_link_width="
<< std::to_string(smu.pcie_link_speed) << '\n';
std::cout << "gfx_activity_acc="
<< std::dec << smu.gfx_activity_acc << '\n';
std::cout << "mem_actvity_acc="
<< std::dec << smu.mem_actvity_acc << '\n';
for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) {
std::cout << "temperature_hbm[" << i << "]=" << std::dec <<
smu.temperature_hbm[i] << '\n';
}
}
}
// Verify api support checking functionality is working
err = rsmi_dev_gpu_metrics_info_get(i, nullptr);
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
}
}
}