diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index b7addfe521..fc09eda1db 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -835,6 +835,7 @@ struct metrics_table_header_t { // existing field sizes are changed. #define RSMI_GPU_METRICS_API_CONTENT_VER_1 1 #define RSMI_GPU_METRICS_API_CONTENT_VER_2 2 +#define RSMI_GPU_METRICS_API_CONTENT_VER_3 3 // This should match NUM_HBM_INSTANCES #define RSMI_NUM_HBM_INSTANCES 4 @@ -900,6 +901,19 @@ typedef struct { uint32_t gfx_activity_acc; // new in v1 uint32_t mem_actvity_acc; // new in v1 uint16_t temperature_hbm[RSMI_NUM_HBM_INSTANCES]; // new in v1 + + /* PMFW attached timestamp (10ns resolution) */ + uint64_t firmware_timestamp; // added in v1_2 + + /* Voltage (mV) */ + uint16_t voltage_soc; // added in v1_3 + uint16_t voltage_gfx; // added in v1_3 + uint16_t voltage_mem; // added in v1_3 + + uint16_t padding1; // explicit pad: 3 x uint16_t + this = 8 bytes + + /* Throttle status (ASIC independent) */ + uint64_t indep_throttle_status; // added in v1_3 /// \endcond } rsmi_gpu_metrics_t; @@ -2155,7 +2169,7 @@ rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent); * * @param[inout] utilization_counters Multiple utilization counters can be retreived with a single * call. The caller must allocate enough space to the utilization_counters array. The caller also - * needs to set valid RSMI_UTILIZATION_COUNTER_TYPE type for each element of the array. + * needs to set valid RSMI_UTILIZATION_COUNTER_TYPE type for each element of the array. * ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the provided arguments. 
* * If the function reutrns RSMI_STATUS_SUCCESS, the counter will be set in the value field of diff --git a/src/rocm_smi_gpu_metrics.cc b/src/rocm_smi_gpu_metrics.cc index f8e53b4ccb..9f961012f2 100755 --- a/src/rocm_smi_gpu_metrics.cc +++ b/src/rocm_smi_gpu_metrics.cc @@ -121,14 +121,32 @@ typedef struct { uint64_t firmware_timestamp; } rsmi_gpu_metrics_v_1_2; +typedef struct { + rsmi_gpu_metrics_t base; /* NOTE(review): this patch also appends the fields below to rsmi_gpu_metrics_t, so base already holds them - confirm this layout still matches the driver table */ + /* PMFW attached timestamp (10ns resolution) */ + uint64_t firmware_timestamp; + + /* Voltage (mV) */ + uint16_t voltage_soc; + uint16_t voltage_gfx; + uint16_t voltage_mem; + + /* Throttle status (ASIC independent) */ + uint64_t indep_throttle_status; + +} rsmi_gpu_metrics_v_1_3; + static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, rsmi_gpu_metrics_t *data, uint8_t content_v) { assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 && - content_v != RSMI_GPU_METRICS_API_CONTENT_VER_2 ); + content_v != RSMI_GPU_METRICS_API_CONTENT_VER_2 && + content_v != RSMI_GPU_METRICS_API_CONTENT_VER_3 ); if (content_v == RSMI_GPU_METRICS_API_CONTENT_VER_1 || - content_v == RSMI_GPU_METRICS_API_CONTENT_VER_2 ) { + content_v == RSMI_GPU_METRICS_API_CONTENT_VER_2 || + content_v == RSMI_GPU_METRICS_API_CONTENT_VER_3 ) { // This function shouldn't be called if content version is // RSMI_GPU_METRICS_API_CONTENT_VER_1 or RSMI_GPU_METRICS_API_CONTENT_VER_2 + // or RSMI_GPU_METRICS_API_CONTENT_VER_3 return RSMI_STATUS_INVALID_ARGS; } void *metric_data = nullptr; @@ -226,6 +244,17 @@ static void map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t( gpu_metrics_v_1_2->firmware_timestamp * 10; } +static void map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t( + const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, + rsmi_gpu_metrics_t *rsmi_gpu_metrics) +{ + memcpy(rsmi_gpu_metrics, &gpu_metrics_v_1_3->base, + sizeof(rsmi_gpu_metrics_t)); + // firmware_timestamp is at 10ns resolution, so scale by 10 to get ns + rsmi_gpu_metrics->system_clock_counter = + gpu_metrics_v_1_3->firmware_timestamp * 10; + +} rsmi_status_t 
rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { @@ -233,6 +262,7 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { DEVICE_MUTEX CHK_SUPPORT_NAME_ONLY(smu) rsmi_gpu_metrics_v_1_2 smu_v_1_2; + rsmi_gpu_metrics_v_1_3 smu_v_1_3 = {}; /* zeroed: it is mapped into *smu even if the blob read fails */ rsmi_status_t ret; if (!dev->gpu_metrics_ver().structure_size) { @@ -248,6 +278,10 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { return RSMI_STATUS_NOT_SUPPORTED; } + // Initialize the smu fields to zero as some of them are only valid in + // a specific version. + *smu = {}; + if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_1) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, @@ -257,6 +291,11 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2); map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu); + } else if (dev->gpu_metrics_ver().content_revision == + RSMI_GPU_METRICS_API_CONTENT_VER_3) { + ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, + sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3); + map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu); } else { ret = GetGPUMetricsFormat1(dv_ind, smu, dev->gpu_metrics_ver().content_revision);