From 5e2a4f3a157d3ee66efbe7c2a2fa8de0847d607e Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Sat, 6 Mar 2021 11:53:30 -0600 Subject: [PATCH] Handle different gpu_metrics content versions for format v1 Change-Id: I344d1815da683befc8f8b5caf921803b267ae29f --- include/rocm_smi/rocm_smi.h | 32 +++- include/rocm_smi/rocm_smi_device.h | 2 + src/rocm_smi.cc | 165 +++++++++++++++++- src/rocm_smi_device.cc | 5 +- .../functional/gpu_metrics_read.cc | 137 ++++++++------- 5 files changed, 264 insertions(+), 77 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 76165ddd24..894ba534f2 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -802,14 +802,25 @@ struct metrics_table_header_t { /** * @brief The following structure holds the gpu metrics values for a device. */ +// Below is the assumed version of gpu_metric data on the device. If the device +// is using this version, we can read data directly into rsmi_gpu_metrics_t. +// If the device is using an older format, a conversion of formats will be +// required. +// DGPU targets have a format version of 1. APU targets have a format version of +// 2. Currently, only version 1 (DGPU) gpu_metrics is supported. +#define RSMI_GPU_METRICS_API_FORMAT_VER 1 +// The content version increments when gpu_metrics is extended with new and/or +// existing field sizes are changed. +#define RSMI_GPU_METRICS_API_CONTENT_VER 1 + +// This should match NUM_HBM_INSTANCES +#define RSMI_NUM_HBM_INSTANCES 4 + typedef struct { // TODO(amd) Doxygen documents /// \cond Ignore in docs. struct metrics_table_header_t common_header; -/* Driver attached timestamp (in ns) */ - uint64_t system_clock_counter; - /* Temperature */ uint16_t temperature_edge; uint16_t temperature_hotspot; @@ -825,7 +836,10 @@ typedef struct { /* Power/Energy */ uint16_t average_socket_power; - uint32_t energy_accumulator; + uint64_t energy_accumulator; // v1 mod. 
(32->64) + +/* Driver attached timestamp (in ns) */ + uint64_t system_clock_counter; // v1 mod. (moved from top of struct) /* Average clocks */ uint16_t average_gfxclk_frequency; @@ -852,8 +866,14 @@ typedef struct { uint16_t current_fan_speed; /* Link width/speed */ - uint8_t pcie_link_width; - uint8_t pcie_link_speed; // in 0.1 GT/s + uint16_t pcie_link_width; // v1 mod.(8->16) + uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16) + + uint16_t padding; // new in v1 + + uint32_t gfx_activity_acc; // new in v1 + uint32_t mem_actvity_acc; // new in v1 + uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1 /// \endcond } rsmi_gpu_metrics_t; diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index 9f8553c233..ad837bec8e 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -204,6 +204,7 @@ class Device { void set_evt_notif_anon_fd(uint32_t fd) { evt_notif_anon_fd_ = static_cast(fd);} int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;} + metrics_table_header_t & gpu_metrics_ver(void) {return gpu_metrics_ver_;} void fillSupportedFuncs(void); void DumpSupportedFunctions(void); bool DeviceAPISupported(std::string name, uint64_t variant, @@ -237,6 +238,7 @@ class Device { int evt_notif_anon_fd_; FILE *evt_notif_anon_file_ptr_; + struct metrics_table_header_t gpu_metrics_ver_; }; } // namespace smi diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index d843127f7a..eebd59bc28 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -2217,19 +2217,174 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { return ret; CATCH } +// Put definitions of old gpu_metrics formats here +typedef struct { + struct metrics_table_header_t common_header; + + /* Driver attached timestamp (in ns) */ + uint64_t system_clock_counter; + +/* Temperature */ + uint16_t temperature_edge; + uint16_t temperature_hotspot; + uint16_t temperature_mem; + uint16_t temperature_vrgfx; + 
uint16_t temperature_vrsoc;
+  uint16_t temperature_vrmem;
+
+/* Utilization */
+  uint16_t average_gfx_activity;
+  uint16_t average_umc_activity;  // memory controller
+  uint16_t average_mm_activity;  // UVD or VCN
+
+/* Power/Energy */
+  uint16_t average_socket_power;
+  uint32_t energy_accumulator;
+
+/* Average clocks */
+  uint16_t average_gfxclk_frequency;
+  uint16_t average_socclk_frequency;
+  uint16_t average_uclk_frequency;
+  uint16_t average_vclk0_frequency;
+  uint16_t average_dclk0_frequency;
+  uint16_t average_vclk1_frequency;
+  uint16_t average_dclk1_frequency;
+
+/* Current clocks */
+  uint16_t current_gfxclk;
+  uint16_t current_socclk;
+  uint16_t current_uclk;
+  uint16_t current_vclk0;
+  uint16_t current_dclk0;
+  uint16_t current_vclk1;
+  uint16_t current_dclk1;
+
+/* Throttle status */
+  uint32_t throttle_status;
+
+/* Fans */
+  uint16_t current_fan_speed;
+
+/* Link width/speed */
+  uint8_t pcie_link_width;
+  uint8_t pcie_link_speed;  // in 0.1 GT/s
+} rsmi_gpu_metrics_v_1_0_t;
+
+// Read an older format-1 gpu_metrics blob (content_v) and widen it into *data.
+static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind,
+                              rsmi_gpu_metrics_t *data, uint8_t content_v) {
+  assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER);
+  if (content_v == RSMI_GPU_METRICS_API_CONTENT_VER) {
+    // This function shouldn't be called if content version is
+    // RSMI_GPU_METRICS_API_CONTENT_VER.
+    return RSMI_STATUS_INVALID_ARGS;
+  }
+  void *metric_data = nullptr;
+  size_t data_size = 0;
+  rsmi_gpu_metrics_v_1_0_t metric_data_v_1_0;
+
+  if (content_v == 0) {
+    metric_data = &metric_data_v_1_0;
+    data_size = sizeof(rsmi_gpu_metrics_v_1_0_t);
+  }  // else { ... handle other conversions to v1
+  assert(metric_data != nullptr && "Unexpected conversion attempted.");
+  if (metric_data == nullptr) {  // assert is compiled out under NDEBUG
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+  rsmi_status_t ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
+                                                 data_size, metric_data);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    return ret;
+  }
+
+#define ASSIGN_DATA_FIELD(FIELD, SRC) \
+  data->FIELD = SRC->FIELD;
+
+#define ASSIGN_COMMON_FORMATS(SRC) \
+  ASSIGN_DATA_FIELD(common_header, (SRC)) \
+  ASSIGN_DATA_FIELD(temperature_edge, (SRC)) \
+  ASSIGN_DATA_FIELD(temperature_hotspot, (SRC)) \
+  ASSIGN_DATA_FIELD(temperature_mem, (SRC)) \
+  ASSIGN_DATA_FIELD(temperature_vrgfx, (SRC)) \
+  ASSIGN_DATA_FIELD(temperature_vrsoc, (SRC)) \
+  ASSIGN_DATA_FIELD(temperature_vrmem, (SRC)) \
+  ASSIGN_DATA_FIELD(average_gfx_activity, (SRC)) \
+  ASSIGN_DATA_FIELD(average_umc_activity, (SRC)) \
+  ASSIGN_DATA_FIELD(average_mm_activity, (SRC)) \
+  ASSIGN_DATA_FIELD(average_socket_power, (SRC)) \
+  ASSIGN_DATA_FIELD(system_clock_counter, (SRC)) \
+  ASSIGN_DATA_FIELD(average_gfxclk_frequency, (SRC)) \
+  ASSIGN_DATA_FIELD(average_socclk_frequency, (SRC)) \
+  ASSIGN_DATA_FIELD(average_uclk_frequency, (SRC)) \
+  ASSIGN_DATA_FIELD(average_vclk0_frequency, (SRC)) \
+  ASSIGN_DATA_FIELD(average_dclk0_frequency, (SRC)) \
+  ASSIGN_DATA_FIELD(average_vclk1_frequency, (SRC)) \
+  ASSIGN_DATA_FIELD(average_dclk1_frequency, (SRC)) \
+  ASSIGN_DATA_FIELD(current_gfxclk, (SRC)) \
+  ASSIGN_DATA_FIELD(current_socclk, (SRC)) \
+  ASSIGN_DATA_FIELD(current_uclk, (SRC)) \
+  ASSIGN_DATA_FIELD(current_vclk0, (SRC)) \
+  ASSIGN_DATA_FIELD(current_dclk0, (SRC)) \
+  ASSIGN_DATA_FIELD(current_vclk1, (SRC)) \
+  ASSIGN_DATA_FIELD(current_dclk1, (SRC)) \
+  ASSIGN_DATA_FIELD(throttle_status, (SRC)) \
+  ASSIGN_DATA_FIELD(current_fan_speed, (SRC))
+
+  // Now handle differences from format 1
+  if (content_v == 0) {
+    // First handle all data that is common to Format1 and other formats
+    ASSIGN_COMMON_FORMATS(
+             reinterpret_cast<rsmi_gpu_metrics_v_1_0_t *>(metric_data))
+
+    // Then, the differences:
+    data->energy_accumulator = static_cast<uint64_t>(
+       reinterpret_cast<rsmi_gpu_metrics_v_1_0_t *>(
+                                             metric_data)->energy_accumulator);
+    data->pcie_link_width = static_cast<uint16_t>(
+       reinterpret_cast<rsmi_gpu_metrics_v_1_0_t *>(
+                                             metric_data)->pcie_link_width);
+    data->pcie_link_speed = static_cast<uint16_t>(
+       reinterpret_cast<rsmi_gpu_metrics_v_1_0_t *>(
+                                             metric_data)->pcie_link_speed);
+    // These fields didn't exist in v0
+    data->padding = 0;
+    data->gfx_activity_acc = 0;
+    data->mem_actvity_acc = 0;
+    (void)memset(data->temperature_hbm, 0,
+                                   RSMI_NUM_HBM_INSTANCES * sizeof(uint16_t));
+  }  // else handle other conversions to format 1
+#undef ASSIGN_DATA_FIELD
+#undef ASSIGN_COMMON_FORMATS
+  return RSMI_STATUS_SUCCESS;
+}
 
 rsmi_status_t
 rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
   TRY
   DEVICE_MUTEX
   CHK_SUPPORT_NAME_ONLY(smu)
+  rsmi_status_t ret;
 
-  rsmi_status_t ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
+  if (!dev->gpu_metrics_ver().structure_size) {
+    ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
+      sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver());
+
+    if (ret != RSMI_STATUS_SUCCESS) {
+      return ret;
+    }
+  }
+  // only supports gpu_metrics_v1_x version
+  if (dev->gpu_metrics_ver().format_revision != 1) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  } else {  // format_revision == 1
+    if (dev->gpu_metrics_ver().content_revision ==
+                                          RSMI_GPU_METRICS_API_CONTENT_VER) {
+      ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
                                               sizeof(rsmi_gpu_metrics_t), smu);
-
-  // only supports gpu_metrics_v1_0 version
-  if (smu->common_header.format_revision != 1) {
-    return RSMI_STATUS_NOT_SUPPORTED;
+    } else {
+      ret = GetGPUMetricsFormat1(dv_ind, smu,
+                                    dev->gpu_metrics_ver().content_revision);
+    }
   }
 
   if (ret != RSMI_STATUS_SUCCESS) {
diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc
index 6d81e39a0d..5207be3a64 100755
--- a/src/rocm_smi_device.cc
+++ b/src/rocm_smi_device.cc
@@ -482,8 +482,9 @@ static const std::map kDevFuncDependsMap = {
     if (X) return X; \
 }
 
-Device::Device(std::string p, RocmSMI_env_vars const *e) : monitor_(nullptr),
-                                                   path_(p), env_(e),
evt_notif_anon_fd_(-1) {
+Device::Device(std::string p, RocmSMI_env_vars const *e) :
+            monitor_(nullptr), path_(p), env_(e), evt_notif_anon_fd_(-1),
+            gpu_metrics_ver_{0, 0, 0} {
 #ifdef NDEBUG
   env_ = nullptr;
 #endif
diff --git a/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/tests/rocm_smi_test/functional/gpu_metrics_read.cc
index d47e74a9a1..a1b362fc31 100644
--- a/tests/rocm_smi_test/functional/gpu_metrics_read.cc
+++ b/tests/rocm_smi_test/functional/gpu_metrics_read.cc
@@ -111,75 +111,84 @@ void TestGpuMetricsRead::Run(void) {
                                "Not supported on this machine" << std::endl;
         return;
       }
-    } else {
-      CHK_ERR_ASRT(err);
-      IF_VERB(STANDARD) {
-        std::cout << std::dec << "system_clock_counter="
-        << smu.system_clock_counter << '\n';
-        std::cout << std::dec << "temperature_edge="
-        << smu.temperature_edge << '\n';
-        std::cout << std::dec << "temperature_hotspot="
-        << smu.temperature_hotspot << '\n';
-        std::cout << std::dec << "temperature_mem="
-        << smu.temperature_mem << '\n';
-        std::cout << std::dec << "temperature_vrgfx="
-        << smu.temperature_vrgfx << '\n';
-        std::cout << std::dec << "temperature_vrsoc="
-        << smu.temperature_vrsoc << '\n';
-        std::cout << std::dec << "temperature_vrmem="
-        << smu.temperature_vrmem << '\n';
-        std::cout << std::dec << "average_gfx_activity="
-        << smu.average_gfx_activity << '\n';
-        std::cout << std::dec << "average_umc_activity="
-        << smu.average_umc_activity << '\n';
-        std::cout << std::dec << "average_mm_activity="
-        << smu.average_mm_activity << '\n';
-        std::cout << std::dec << "average_socket_power="
-        << smu.average_socket_power << '\n';
-        std::cout << std::dec << "energy_accumulator="
-        << smu.energy_accumulator << '\n';
-        std::cout << std::dec << "average_gfxclk_frequency="
-        << smu.average_gfxclk_frequency << '\n';
-        std::cout << std::dec << "average_gfxclk_frequency="
-        << smu.average_gfxclk_frequency << '\n';
-        std::cout << std::dec << "average_uclk_frequency="
-        << smu.average_uclk_frequency << '\n';
-        std::cout << std::dec << "average_vclk0_frequency="
-        << smu.average_vclk0_frequency << '\n';
-        std::cout << std::dec << "average_dclk0_frequency="
-        << smu.average_dclk0_frequency << '\n';
-        std::cout << std::dec << "average_dclk0_frequency="
-        << smu.average_dclk0_frequency << '\n';
-        std::cout << std::dec << "average_dclk1_frequency="
-        << smu.average_dclk1_frequency << '\n';
-        std::cout << std::dec << "current_gfxclk="
-        << smu.current_gfxclk << '\n';
-        std::cout << std::dec << "current_socclk="
-        << smu.current_socclk << '\n';
-        std::cout << std::dec << "current_uclk="
-        << smu.current_uclk << '\n';
-        std::cout << std::dec << "current_vclk0="
-        << smu.current_vclk0 << '\n';
-        std::cout << std::dec << "current_dclk0="
-        << smu.current_dclk0 << '\n';
-        std::cout << std::dec << "current_vclk1="
-        << smu.current_vclk1 << '\n';
-        std::cout << std::dec << "current_dclk1="
-        << smu.current_dclk1 << '\n';
-        std::cout << std::dec << "throttle_status="
-        << smu.throttle_status << '\n';
-        std::cout << std::dec << "current_fan_speed="
-        << smu.current_fan_speed << '\n';
-        std::cout << "pcie_link_width="
-        << std::to_string(smu.pcie_link_width) << '\n';
-        std::cout << "pcie_link_width="
-        << std::to_string(smu.pcie_link_speed) << '\n';
+      }
+    } else {
+      CHK_ERR_ASRT(err);
+      IF_VERB(STANDARD) {
+        std::cout << std::dec << "system_clock_counter="
+        << smu.system_clock_counter << '\n';
+        std::cout << std::dec << "temperature_edge="
+        << smu.temperature_edge << '\n';
+        std::cout << std::dec << "temperature_hotspot="
+        << smu.temperature_hotspot << '\n';
+        std::cout << std::dec << "temperature_mem="
+        << smu.temperature_mem << '\n';
+        std::cout << std::dec << "temperature_vrgfx="
+        << smu.temperature_vrgfx << '\n';
+        std::cout << std::dec << "temperature_vrsoc="
+        << smu.temperature_vrsoc << '\n';
+        std::cout << std::dec << "temperature_vrmem="
+        << smu.temperature_vrmem << '\n';
+        std::cout << std::dec << "average_gfx_activity="
+        << smu.average_gfx_activity << '\n';
+        std::cout << std::dec << "average_umc_activity="
+        << smu.average_umc_activity << '\n';
+        std::cout << std::dec << "average_mm_activity="
+        << smu.average_mm_activity << '\n';
+        std::cout << std::dec << "average_socket_power="
+        << smu.average_socket_power << '\n';
+        std::cout << std::dec << "energy_accumulator="
+        << smu.energy_accumulator << '\n';
+        std::cout << std::dec << "average_gfxclk_frequency="
+        << smu.average_gfxclk_frequency << '\n';
+        std::cout << std::dec << "average_socclk_frequency="
+        << smu.average_socclk_frequency << '\n';
+        std::cout << std::dec << "average_uclk_frequency="
+        << smu.average_uclk_frequency << '\n';
+        std::cout << std::dec << "average_vclk0_frequency="
+        << smu.average_vclk0_frequency << '\n';
+        std::cout << std::dec << "average_dclk0_frequency="
+        << smu.average_dclk0_frequency << '\n';
+        std::cout << std::dec << "average_vclk1_frequency="
+        << smu.average_vclk1_frequency << '\n';
+        std::cout << std::dec << "average_dclk1_frequency="
+        << smu.average_dclk1_frequency << '\n';
+        std::cout << std::dec << "current_gfxclk="
+        << smu.current_gfxclk << '\n';
+        std::cout << std::dec << "current_socclk="
+        << smu.current_socclk << '\n';
+        std::cout << std::dec << "current_uclk="
+        << smu.current_uclk << '\n';
+        std::cout << std::dec << "current_vclk0="
+        << smu.current_vclk0 << '\n';
+        std::cout << std::dec << "current_dclk0="
+        << smu.current_dclk0 << '\n';
+        std::cout << std::dec << "current_vclk1="
+        << smu.current_vclk1 << '\n';
+        std::cout << std::dec << "current_dclk1="
+        << smu.current_dclk1 << '\n';
+        std::cout << std::dec << "throttle_status="
+        << smu.throttle_status << '\n';
+        std::cout << std::dec << "current_fan_speed="
+        << smu.current_fan_speed << '\n';
+        std::cout << "pcie_link_width="
+        << std::to_string(smu.pcie_link_width) << '\n';
+        std::cout << "pcie_link_speed="
+        << std::to_string(smu.pcie_link_speed) << '\n';
+        std::cout << "gfx_activity_acc="
+        << std::dec << smu.gfx_activity_acc << '\n';
+        std::cout << "mem_actvity_acc="
+        << std::dec << smu.mem_actvity_acc << '\n';
+        // Index named 'hbm' so it does not shadow the device index 'i'.
+        for (int hbm = 0; hbm < RSMI_NUM_HBM_INSTANCES; ++hbm) {
+          std::cout << "temperature_hbm[" << hbm << "]=" << std::dec <<
+                                         smu.temperature_hbm[hbm] << '\n';
         }
       }
+    }
 
     // Verify api support checking functionality is working
     err = rsmi_dev_gpu_metrics_info_get(i, nullptr);
     ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
-    }
   }
 }