The AMDSMI change needs to merge first and bump its version to at least 25.4.2

Change-Id: I30149bb78c79ebc3de0dabdc8e63fcef12b2f406
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: a5cb334f8b]
Этот коммит содержится в:
Galantsev, Dmitrii
2025-04-10 20:45:58 +00:00
коммит произвёл Galantsev, Dmitrii
родитель e15c5a15fa
Коммит 375ab5eace
6 изменённых файлов: 53 добавлений и 37 удалений
+1 -1
Просмотреть файл
@@ -165,7 +165,7 @@ if(BUILD_STANDALONE AND GRPC_ROOT STREQUAL GRPC_ROOT_DEFAULT)
Continuing without gRPC install")
endif()
find_package(amd_smi 25.4.0
find_package(amd_smi 25.4.2
NAMES amd_smi
HINTS ${ROCM_DIR}/lib/cmake
CONFIGURE REQUIRED)
+3 -1
Просмотреть файл
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// clang-format off
// Description Fields:
// Arg # Type Meaning
// -------------------------------------------------
@@ -54,7 +56,7 @@ FLD_DESC_ENT(RDC_FI_GPU_MM_DEC_UTIL, "Mutilmedia decoder busy percentage",
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_ACTIVITY, "Memory busy percentage", "GPU_MEM_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, "Memory max bandwidth", "GPU_MEM_MAX_BANDWIDTH", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_CUR_BANDWIDTH, "Memory current bandwidth", "GPU_MEM_CUR_BANDWIDTH", true)
FLD_DESC_ENT(RDC_FI_GPU_BUSY_PERCENT, "GPU busy percentage", "GPU_BUSY_PERCENT", true)
FLD_DESC_ENT(RDC_FI_GPU_PAGE_RETRIED, "Retried page of the GPU instance", "GPU_PAGE_RETRIED", true)
// ECC totals
+1
Просмотреть файл
@@ -207,6 +207,7 @@ typedef enum {
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, //<! The Memory max bandwidth at current memory clock in
// Mb/Second
RDC_FI_GPU_MEMORY_CUR_BANDWIDTH, //<! The Memory current bandwidth in Mb/Second
RDC_FI_GPU_BUSY_PERCENT, //<! The GPU busy percentage
/**
* @brief GPU page related fields
+1
Просмотреть файл
@@ -100,6 +100,7 @@ class rdc_field_t(c_int):
RDC_FI_GPU_MEMORY_ACTIVITY = 505
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH = 506
RDC_FI_GPU_MEMORY_CUR_BANDWIDTH = 507
RDC_FI_GPU_BUSY_PERCENT = 508
RDC_FI_GPU_PAGE_RETRIED = 550
RDC_FI_ECC_CORRECT_TOTAL = 600
RDC_FI_ECC_UNCORRECT_TOTAL = 601
+10
Просмотреть файл
@@ -1091,6 +1091,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
break;
}
case RDC_FI_GPU_BUSY_PERCENT: {
uint32_t gpu_busy_percent = 0;
ret = amdsmi_get_gpu_busy_percent(processor_handle, &gpu_busy_percent);
value->status = Smi2RdcError(ret);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(gpu_busy_percent);
}
}
default:
break;
}
+37 -35
Просмотреть файл
@@ -148,43 +148,45 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
}
// List of fields supported by amd_smi_lib
// clang-format off
const std::vector<uint32_t> fields{
RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME, RDC_FI_GPU_CLOCK,
RDC_FI_MEM_CLOCK, RDC_FI_MEMORY_TEMP, RDC_FI_GPU_TEMP,
RDC_FI_POWER_USAGE, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX,
RDC_FI_PCIE_BANDWIDTH, RDC_FI_GPU_UTIL, RDC_FI_GPU_MEMORY_USAGE,
RDC_FI_GPU_MEMORY_TOTAL, RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL,
RDC_FI_ECC_SDMA_CE, RDC_FI_ECC_SDMA_UE, RDC_FI_ECC_GFX_CE,
RDC_FI_ECC_GFX_UE, RDC_FI_ECC_MMHUB_CE, RDC_FI_ECC_MMHUB_UE,
RDC_FI_ECC_ATHUB_CE, RDC_FI_ECC_ATHUB_UE, RDC_FI_ECC_PCIE_BIF_CE,
RDC_FI_ECC_PCIE_BIF_UE, RDC_FI_ECC_HDP_CE, RDC_FI_ECC_HDP_UE,
RDC_FI_ECC_XGMI_WAFL_CE, RDC_FI_ECC_XGMI_WAFL_UE, RDC_FI_ECC_DF_CE,
RDC_FI_ECC_DF_UE, RDC_FI_ECC_SMN_CE, RDC_FI_ECC_SMN_UE,
RDC_FI_ECC_SEM_CE, RDC_FI_ECC_SEM_UE, RDC_FI_ECC_MP0_CE,
RDC_FI_ECC_MP0_UE, RDC_FI_ECC_MP1_CE, RDC_FI_ECC_MP1_UE,
RDC_FI_ECC_FUSE_CE, RDC_FI_ECC_FUSE_UE, RDC_FI_ECC_UMC_CE,
RDC_FI_ECC_UMC_UE, RDC_FI_ECC_MCA_CE, RDC_FI_ECC_MCA_UE,
RDC_FI_ECC_VCN_CE, RDC_FI_ECC_VCN_UE, RDC_FI_ECC_JPEG_CE,
RDC_FI_ECC_JPEG_UE, RDC_FI_ECC_IH_CE, RDC_FI_ECC_IH_UE,
RDC_FI_ECC_MPIO_CE, RDC_FI_ECC_MPIO_UE, RDC_FI_XGMI_0_READ_KB,
RDC_FI_XGMI_1_READ_KB, RDC_FI_XGMI_2_READ_KB, RDC_FI_XGMI_3_READ_KB,
RDC_FI_XGMI_4_READ_KB, RDC_FI_XGMI_5_READ_KB, RDC_FI_XGMI_6_READ_KB,
RDC_FI_XGMI_7_READ_KB, RDC_FI_XGMI_TOTAL_READ_KB, RDC_FI_XGMI_0_WRITE_KB,
RDC_FI_XGMI_1_WRITE_KB, RDC_FI_XGMI_2_WRITE_KB, RDC_FI_XGMI_3_WRITE_KB,
RDC_FI_XGMI_4_WRITE_KB, RDC_FI_XGMI_5_WRITE_KB, RDC_FI_XGMI_6_WRITE_KB,
RDC_FI_XGMI_7_WRITE_KB, RDC_FI_XGMI_TOTAL_WRITE_KB,
RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, RDC_EVNT_XGMI_0_RESP_TX,
RDC_EVNT_XGMI_0_BEATS_TX, RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX,
RDC_EVNT_XGMI_1_RESP_TX, RDC_EVNT_XGMI_1_BEATS_TX, RDC_EVNT_XGMI_0_THRPUT,
RDC_EVNT_XGMI_1_THRPUT, RDC_EVNT_XGMI_2_THRPUT, RDC_EVNT_XGMI_3_THRPUT,
RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID,
RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY,
RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM,
RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_EEPROM_CONFIG_VALID,
RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME, RDC_FI_GPU_CLOCK,
RDC_FI_MEM_CLOCK, RDC_FI_MEMORY_TEMP, RDC_FI_GPU_TEMP,
RDC_FI_POWER_USAGE, RDC_FI_PCIE_TX, RDC_FI_PCIE_RX,
RDC_FI_PCIE_BANDWIDTH, RDC_FI_GPU_UTIL, RDC_FI_GPU_MEMORY_USAGE,
RDC_FI_GPU_MEMORY_TOTAL, RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL,
RDC_FI_ECC_SDMA_CE, RDC_FI_ECC_SDMA_UE, RDC_FI_ECC_GFX_CE,
RDC_FI_ECC_GFX_UE, RDC_FI_ECC_MMHUB_CE, RDC_FI_ECC_MMHUB_UE,
RDC_FI_ECC_ATHUB_CE, RDC_FI_ECC_ATHUB_UE, RDC_FI_ECC_PCIE_BIF_CE,
RDC_FI_ECC_PCIE_BIF_UE, RDC_FI_ECC_HDP_CE, RDC_FI_ECC_HDP_UE,
RDC_FI_ECC_XGMI_WAFL_CE, RDC_FI_ECC_XGMI_WAFL_UE, RDC_FI_ECC_DF_CE,
RDC_FI_ECC_DF_UE, RDC_FI_ECC_SMN_CE, RDC_FI_ECC_SMN_UE,
RDC_FI_ECC_SEM_CE, RDC_FI_ECC_SEM_UE, RDC_FI_ECC_MP0_CE,
RDC_FI_ECC_MP0_UE, RDC_FI_ECC_MP1_CE, RDC_FI_ECC_MP1_UE,
RDC_FI_ECC_FUSE_CE, RDC_FI_ECC_FUSE_UE, RDC_FI_ECC_UMC_CE,
RDC_FI_ECC_UMC_UE, RDC_FI_ECC_MCA_CE, RDC_FI_ECC_MCA_UE,
RDC_FI_ECC_VCN_CE, RDC_FI_ECC_VCN_UE, RDC_FI_ECC_JPEG_CE,
RDC_FI_ECC_JPEG_UE, RDC_FI_ECC_IH_CE, RDC_FI_ECC_IH_UE,
RDC_FI_ECC_MPIO_CE, RDC_FI_ECC_MPIO_UE, RDC_FI_XGMI_0_READ_KB,
RDC_FI_XGMI_1_READ_KB, RDC_FI_XGMI_2_READ_KB, RDC_FI_XGMI_3_READ_KB,
RDC_FI_XGMI_4_READ_KB, RDC_FI_XGMI_5_READ_KB, RDC_FI_XGMI_6_READ_KB,
RDC_FI_XGMI_7_READ_KB, RDC_FI_XGMI_TOTAL_READ_KB, RDC_FI_XGMI_0_WRITE_KB,
RDC_FI_XGMI_1_WRITE_KB, RDC_FI_XGMI_2_WRITE_KB, RDC_FI_XGMI_3_WRITE_KB,
RDC_FI_XGMI_4_WRITE_KB, RDC_FI_XGMI_5_WRITE_KB, RDC_FI_XGMI_6_WRITE_KB,
RDC_FI_XGMI_7_WRITE_KB, RDC_FI_XGMI_TOTAL_WRITE_KB,
RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, RDC_EVNT_XGMI_0_RESP_TX,
RDC_EVNT_XGMI_0_BEATS_TX, RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX,
RDC_EVNT_XGMI_1_RESP_TX, RDC_EVNT_XGMI_1_BEATS_TX, RDC_EVNT_XGMI_0_THRPUT,
RDC_EVNT_XGMI_1_THRPUT, RDC_EVNT_XGMI_2_THRPUT, RDC_EVNT_XGMI_3_THRPUT,
RDC_EVNT_XGMI_4_THRPUT, RDC_EVNT_XGMI_5_THRPUT, RDC_FI_OAM_ID,
RDC_FI_GPU_MM_ENC_UTIL, RDC_FI_GPU_MM_DEC_UTIL, RDC_FI_GPU_MEMORY_ACTIVITY,
RDC_HEALTH_XGMI_ERROR, RDC_HEALTH_PCIE_REPLAY_COUNT, RDC_HEALTH_RETIRED_PAGE_NUM,
RDC_HEALTH_PENDING_PAGE_NUM, RDC_HEALTH_RETIRED_PAGE_LIMIT, RDC_HEALTH_EEPROM_CONFIG_VALID,
RDC_HEALTH_POWER_THROTTLE_TIME, RDC_HEALTH_THERMAL_THROTTLE_TIME,
RDC_FI_GPU_MEMORY_MAX_BANDWIDTH, RDC_FI_GPU_MEMORY_CUR_BANDWIDTH,
RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED
};
// clang-format on
std::copy(fields.begin(), fields.end(), field_ids);
*field_count = fields.size();