From 3daf9c10633f5b2b15d7d5fa3a80d27dcbb5e178 Mon Sep 17 00:00:00 2001 From: Dalibor Stanisavljevic Date: Mon, 5 Sep 2022 15:26:03 +0200 Subject: [PATCH] SWDEV-353742 - Port smilib function to amdsmi Change-Id: I99df249755a5c665a8dd1777fa82d046e139bd77 Signed-off-by: Dalibor Stanisavljevic --- CMakeLists.txt | 11 +- example/amd_smi_drm_example.cc | 535 ++++++++++++++++ example/amd_smi_example.cc | 146 ----- example/amd_smi_nodrm_example.cc | 335 ++++++++++ include/amd_smi/amd_smi.h | 396 ++++++------ include/amd_smi/impl/amd_smi_drm.h | 11 +- include/amd_smi/impl/amd_smi_gpu_device.h | 24 +- include/amd_smi/impl/amd_smi_socket.h | 1 + include/amd_smi/impl/amd_smi_utils.h | 44 ++ include/amd_smi/impl/fdinfo.h | 38 ++ src/amd_smi/amd_smi.cc | 590 +++++++++++++++--- src/amd_smi/amd_smi_drm.cc | 76 ++- src/amd_smi/amd_smi_gpu_device.cc | 61 +- src/amd_smi/amd_smi_socket.cc | 5 + src/amd_smi/amd_smi_system.cc | 105 +++- src/amd_smi/amd_smi_utils.cc | 382 ++++++++++++ src/amd_smi/fdinfo.cc | 264 ++++++++ tests/amd_smi_test/functional/err_cnt_read.cc | 2 +- 18 files changed, 2577 insertions(+), 449 deletions(-) create mode 100644 example/amd_smi_drm_example.cc delete mode 100644 example/amd_smi_example.cc create mode 100644 example/amd_smi_nodrm_example.cc create mode 100644 include/amd_smi/impl/amd_smi_utils.h create mode 100644 include/amd_smi/impl/fdinfo.h create mode 100644 src/amd_smi/amd_smi_utils.cc create mode 100644 src/amd_smi/fdinfo.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 31b2533202..ee43bee420 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,6 +141,8 @@ set(CMN_SRC_LIST ${CMN_SRC_LIST} "${AMDSMI_SRC_DIR}/amd_smi_socket.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${AMDSMI_SRC_DIR}/amd_smi_system.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${AMDSMI_SRC_DIR}/amd_smi_drm.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${AMDSMI_SRC_DIR}/amd_smi_lib_loader.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${AMDSMI_SRC_DIR}/amd_smi_utils.cc") +set(CMN_SRC_LIST 
${CMN_SRC_LIST} "${AMDSMI_SRC_DIR}/fdinfo.cc") set(CMN_INC_LIST "${ROCM_INC_DIR}/rocm_smi_device.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${ROCM_INC_DIR}/rocm_smi_main.h") @@ -166,11 +168,16 @@ set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_INC_DIR}/impl/amd_smi_lib_loader.h") add_subdirectory("rocm_smi") # Examples and docs -set(SMI_EXAMPLE_EXE "amd_smi_ex") -add_executable(${SMI_EXAMPLE_EXE} "example/amd_smi_example.cc") +set(SMI_EXAMPLE_EXE "amd_smi_drm_ex") +add_executable(${SMI_EXAMPLE_EXE} "example/amd_smi_drm_example.cc") target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) add_dependencies(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) +set(SMI_NODRM_EXAMPLE_EXE "amd_smi_nodrm_ex") +add_executable(${SMI_NODRM_EXAMPLE_EXE} "example/amd_smi_nodrm_example.cc") +target_link_libraries(${SMI_NODRM_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) +add_dependencies(${SMI_NODRM_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) + # Generate Doxygen documentation find_package(Doxygen) find_package(LATEX COMPONENTS PDFLATEX) diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc new file mode 100644 index 0000000000..c2c5779fa3 --- /dev/null +++ b/example/amd_smi_drm_example.cc @@ -0,0 +1,535 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. 
+ * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. 
+ * + */ +#include +#include +#include + +#include "amd_smi/amd_smi.h" +#include +#include +#include +#include +#include + +#define CHK_AMDSMI_RET(RET) \ + { \ + if (RET != AMDSMI_STATUS_SUCCESS) { \ + const char *err_str; \ + std::cout << "AMDSMI call returned " << RET << " at line " \ + << __LINE__ << std::endl; \ + amdsmi_status_string(RET, &err_str); \ + std::cout << err_str << std::endl; \ + return RET; \ + } \ + } +
+int main() {
+    amdsmi_status_t ret;
+    // Walks every socket and GPU and prints each query; a failing call returns its status via CHK_AMDSMI_RET.
+    // Init amdsmi for sockets and devices.
+    // Here we are only interested in AMD_GPUS.
+    ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
+    CHK_AMDSMI_RET(ret)
+
+    // Get all sockets
+    uint32_t socket_count = 0;
+    amdsmi_socket_handle *sockets = nullptr;
+    ret = amdsmi_get_socket_handles(&socket_count, &sockets);
+    CHK_AMDSMI_RET(ret)
+    std::cout << "Total Socket: " << socket_count << std::endl;
+
+    // For each socket, get identifier and devices
+    for (uint32_t i = 0; i < socket_count; i++) {
+        // Get Socket info
+        char socket_name[128];
+        ret = amdsmi_get_socket_info(sockets[i], socket_name, 128);
+        CHK_AMDSMI_RET(ret)
+        std::cout << "Socket " << socket_name << std::endl;
+
+        // Get all devices of the socket
+        uint32_t device_count = 0;
+        amdsmi_device_handle *device_handles = nullptr;
+        ret = amdsmi_get_device_handles(sockets[i], &device_count,
+                                        &device_handles);
+        CHK_AMDSMI_RET(ret)
+
+        // For each device of the socket, run and print every query below.
+        for (uint32_t j = 0; j < device_count; j++) {
+            // Get device type. Since the amdsmi is initialized with
+            // AMDSMI_INIT_AMD_GPUS, the device_type must be AMD_GPU.
+            device_type_t device_type = {};
+            ret = amdsmi_get_device_type(device_handles[j], &device_type);
+            CHK_AMDSMI_RET(ret)
+            if (device_type != AMD_GPU) {
+                std::cout << "Expect AMD_GPU device type!\n";
+                return AMDSMI_STATUS_NOT_SUPPORTED;
+            }
+
+            amdsmi_bdf_t bdf = {};
+            ret = amdsmi_get_device_bdf(device_handles[j], &bdf);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_device_bdf:\n");
+            printf("\tDevice[%u] BDF %04x:%02x:%02x.%d\n\n", j,  // device index, not socket index
+                   bdf.domain_number, bdf.bus_number, bdf.device_number,
+                   bdf.function_number);
+
+            amdsmi_asic_info_t asic_info = {};
+            ret = amdsmi_get_asic_info(device_handles[j], &asic_info);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_asic_info:\n");
+            printf("\tMarket Name: %s\n", asic_info.market_name);
+            printf("\tFamilyID: 0x%x\n", asic_info.family);
+            printf("\tDeviceID: 0x%lx\n", asic_info.device_id);
+            printf("\tVendorID: 0x%x\n", asic_info.vendor_id);
+            printf("\tRevisionID: 0x%x\n", asic_info.rev_id);
+            printf("\tAsic serial: 0x%s\n\n", asic_info.asic_serial);
+
+            // Get VBIOS info
+            amdsmi_vbios_info_t vbios_info = {};
+            ret = amdsmi_get_vbios_info(device_handles[j], &vbios_info);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_vbios_info:\n");
+            printf("\tVBios Name: %s\n", vbios_info.name);
+            printf("\tBuild Date: %s\n", vbios_info.build_date);
+            printf("\tPart Number: %s\n", vbios_info.part_number);
+            printf("\tVBios Version: %d\n", vbios_info.vbios_version);
+            printf("\tVBios Version String: %s\n\n",
+                   vbios_info.vbios_version_string);
+
+            // Get engine usage info
+            amdsmi_engine_usage_t engine_usage = {};
+            ret = amdsmi_get_gpu_activity(device_handles[j], &engine_usage);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_gpu_activity:\n");
+            printf("\tAverage GFX Activity: %d\n",
+                   engine_usage.average_gfx_activity);
+            printf("\tAverage MM Activity: %d\n",
+                   engine_usage.average_mm_activity[0]);
+            printf("\tAverage UMC Activity: %d\n\n",
+                   engine_usage.average_umc_activity);
+
+            // Get firmware info
+            amdsmi_fw_info_t fw_information = {};
+            ret = amdsmi_get_fw_info(device_handles[j], &fw_information);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_fw_info:\n");
+            printf("\tFirmware version: %d\n", fw_information.num_fw_info);
+            printf("\tSMU: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_SMU]
+                       .fw_version);
+            printf("\tSMC: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_SMC]
+                       .fw_version);
+            printf("\tVCN: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_VCN]
+                       .fw_version);
+            printf("\tCP_ME: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_ME]
+                       .fw_version);
+            printf("\tCP_PFP: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_PFP]
+                       .fw_version);
+            printf("\tCP_CE: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_CE]
+                       .fw_version);
+            printf("\tRLC: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_RLC]
+                       .fw_version);
+            printf("\tCP_MEC1: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_MEC1]
+                       .fw_version);
+            printf("\tCP_MEC2: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_MEC2]
+                       .fw_version);
+            printf("\tSDMA0: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_SDMA0]
+                       .fw_version);
+            printf("\tMC: %ld\n",
+                   fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_MC]
+                       .fw_version);
+            printf("\tRLC RESTORE LIST CNTL: %ld\n",
+                   fw_information
+                       .fw_info_list
+                           [amdsmi_fw_block_t::FW_ID_RLC_RESTORE_LIST_CNTL]
+                       .fw_version);
+            printf("\tRLC RESTORE LIST GPM MEM: %ld\n",
+                   fw_information
+                       .fw_info_list
+                           [amdsmi_fw_block_t::FW_ID_RLC_RESTORE_LIST_GPM_MEM]
+                       .fw_version);
+            printf("\tRLC RESTORE LIST SRM MEM: %ld\n",
+                   fw_information
+                       .fw_info_list
+                           [amdsmi_fw_block_t::FW_ID_RLC_RESTORE_LIST_SRM_MEM]
+                       .fw_version);
+            printf(
+                "\tPSP SOSDRV: %ld\n\n",
+                fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_PSP_SOSDRV]
+                    .fw_version);
+
+            // Get GPU power limit info
+            amdsmi_power_limit_t power_limit = {};
+            ret = amdsmi_get_power_limit(device_handles[j], &power_limit);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_power_limit:\n");
+            printf("\tGPU Power limit: %d\n\n", power_limit.limit);
+
+            // Get GFX clock measurements
+            amdsmi_clock_measure_t gfx_clk_values = {};
+            ret = amdsmi_get_clock_measure(device_handles[j], CLOCK_TYPE_GFX,
+                                           &gfx_clk_values);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_clock_measure:\n");
+            printf("\tGPU GFX Max Clock: %d\n", gfx_clk_values.max_clk);
+            printf("\tGPU GFX Average Clock: %d\n", gfx_clk_values.avg_clk);
+            printf("\tGPU GFX Current Clock: %d\n", gfx_clk_values.cur_clk);
+
+            // Get MEM clock measurements
+            amdsmi_clock_measure_t mem_clk_values = {};
+            ret = amdsmi_get_clock_measure(device_handles[j], CLOCK_TYPE_MEM,
+                                           &mem_clk_values);
+            CHK_AMDSMI_RET(ret)
+            printf("\tGPU MEM Max Clock: %d\n", mem_clk_values.max_clk);
+            printf("\tGPU MEM Average Clock: %d\n", mem_clk_values.avg_clk);
+            printf("\tGPU MEM Current Clock: %d\n\n", mem_clk_values.cur_clk);
+
+            // Get VRAM temperature limit
+            amdsmi_temperature_limit_t mem_temp_limit = {};
+            ret = amdsmi_get_temperature_limit(
+                device_handles[j], TEMPERATURE_TYPE_VRAM, &mem_temp_limit);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_temperature_limit:\n");
+            printf("\tGPU VRAM temp limit: %d\n", mem_temp_limit.limit);
+
+            // Get GFX temperature limit
+            amdsmi_temperature_limit_t gfx_temp_limit = {};
+            ret = amdsmi_get_temperature_limit(
+                device_handles[j], TEMPERATURE_TYPE_EDGE, &gfx_temp_limit);
+            CHK_AMDSMI_RET(ret)
+            printf("\tGPU GFX temp limit: %d\n\n", gfx_temp_limit.limit);
+
+            // Get temperature measurements
+            // NOTE(review): each result is stored at the enum's own value, which
+            // assumes the four types below are numbered 0..3 - TODO confirm.
+            amdsmi_temperature_t temp_measurements[4] = {};
+            amdsmi_temperature_type_t temp_types[4] = {
+                TEMPERATURE_TYPE_EDGE, TEMPERATURE_TYPE_JUNCTION,
+                TEMPERATURE_TYPE_VRAM, TEMPERATURE_TYPE_PLX};
+            for (const auto &temp_type : temp_types) {
+                ret = amdsmi_get_temperature_measure(
+                    device_handles[j], temp_type,
+                    &temp_measurements[(int)(temp_type)]);
+                CHK_AMDSMI_RET(ret)
+            }
+            printf(" Output of amdsmi_get_temperature_measure:\n");
+            printf("\tGPU Edge temp measurement: %d\n",
+                   temp_measurements[TEMPERATURE_TYPE_EDGE].cur_temp);
+            printf("\tGPU Junction temp measurement: %d\n",
+                   temp_measurements[TEMPERATURE_TYPE_JUNCTION].cur_temp);
+            printf("\tGPU VRAM temp measurement: %d\n",
+                   temp_measurements[TEMPERATURE_TYPE_VRAM].cur_temp);
+            printf("\tGPU PLX temp measurement: %d\n\n",
+                   temp_measurements[TEMPERATURE_TYPE_PLX].cur_temp);
+
+            // Get RAS features enabled
+            char block_names[14][10] = {"UMC", "SDMA", "GFX", "MMHUB",
+                                        "ATHUB", "PCIE_BIF", "HDP", "XGMI_WAFL",
+                                        "DF", "SMN", "SEM", "MP0",
+                                        "MP1", "FUSE"};
+            char status_names[7][10] = {"NONE", "DISABLED", "PARITY",
+                                        "SING_C", "MULT_UC", "POISON",
+                                        "ENABLED"};
+            amdsmi_ras_err_state_t state = {};
+            int index = 0;
+            printf(" Output of amdsmi_get_ras_features_enabled:\n");
+            for (auto block = AMDSMI_GPU_BLOCK_FIRST;
+                 block <= AMDSMI_GPU_BLOCK_LAST;
+                 block = (amdsmi_gpu_block_t)(block * 2)) {
+                ret = amdsmi_get_ras_features_enabled(device_handles[j], block,
+                                                      &state);
+                CHK_AMDSMI_RET(ret)
+                printf("\tBlock: %s\n", index < 14 ? block_names[index] : "UNKNOWN");  // never read past the table
+                printf("\tStatus: %s\n", static_cast<unsigned>(state) < 7u ? status_names[state] : "UNKNOWN");
+                index++;
+            }
+            printf("\n");
+
+            // Get bad pages
+            char bad_page_status_names[3][15] = {"RESERVED", "PENDING",
+                                                 "UNRESERVABLE"};
+            uint32_t num_pages = 0;
+            ret = amdsmi_get_bad_page_info(device_handles[j], &num_pages,
+                                           nullptr);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_bad_page_info:\n");
+            if (!num_pages) {
+                printf("\tNo bad pages found.\n");
+            } else {
+                amdsmi_retired_page_record_t bad_page_info[num_pages] = {};
+                ret = amdsmi_get_bad_page_info(device_handles[j], &num_pages,
+                                               bad_page_info);
+                CHK_AMDSMI_RET(ret)
+                for (uint32_t page_it = 0; page_it < num_pages; page_it += 1) {
+                    printf(" Page[%u]\n", page_it);
+                    printf("\tAddress: %lu\n",
+                           bad_page_info[page_it].page_address);
+                    printf("\tSize: %lu\n", bad_page_info[page_it].page_size);
+                    printf(
+                        "\tStatus: %s\n",
+                        bad_page_status_names[bad_page_info[page_it].status]);
+                }
+            }
+            printf("\n");
+
+            // Get ECC error counts
+            amdsmi_error_count_t err_cnt_info = {};
+            ret = amdsmi_get_ecc_error_count(device_handles[j], &err_cnt_info);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_ecc_error_count:\n");
+            printf("\tCorrectable errors: %lu\n", err_cnt_info.correctable_count);
+            printf("\tUncorrectable errors: %lu\n\n",
+                   err_cnt_info.uncorrectable_count);
+            // Helpers for the process table further down: qsort() order and usage sum
+            auto compare = [](const void *a, const void *b) -> int {
+                // qsort() needs a three-way result, including 0 for equal pids.
+                const auto lhs = static_cast<const amdsmi_proc_info_t *>(a)->pid;
+                const auto rhs = static_cast<const amdsmi_proc_info_t *>(b)->pid;
+                return (lhs > rhs) - (lhs < rhs);
+            };
+
+            auto sum_item = [](uint16_t *a) -> float {
+                float b = 0;
+                for (int iterator = 0; iterator < AMDSMI_MAX_MM_IP_COUNT;
+                     iterator += 1) {
+                    b += (float)a[iterator] / 100.0;
+                }
+                return b;
+            };
+
+            // Get frequency ranges
+            amdsmi_frequency_range_t freq_ranges = {};
+            ret = amdsmi_get_target_frequency_range(
+                device_handles[j], CLOCK_TYPE_GFX, &freq_ranges);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_target_frequency_range:\n");
+            printf("\tSupported min freq: %lu\n",
+                   freq_ranges.supported_freq_range.lower_bound);
+            printf("\tSupported max freq: %lu\n",
+                   freq_ranges.supported_freq_range.upper_bound);
+            printf("\tCurrent min freq: %lu\n",
+                   freq_ranges.current_freq_range.lower_bound);
+            printf("\tCurrent max freq: %lu\n\n",
+                   freq_ranges.current_freq_range.upper_bound);
+
+            uint32_t num_process = 0;  // the first call passes no list and is used only for the count
+            ret = amdsmi_get_process_list(device_handles[j], nullptr,
+                                          &num_process);
+            CHK_AMDSMI_RET(ret)
+            if (!num_process) {
+                printf("No processes found.\n");
+            } else {
+                amdsmi_process_handle process_list[num_process];  // VLA: compiler extension in C++
+                amdsmi_proc_info_t info_list[num_process];
+                amdsmi_proc_info_t process = {};
+                uint64_t mem = 0, gtt_mem = 0, cpu_mem = 0, vram_mem = 0;
+                float gfx = 0, comp = 0, dma = 0, enc = 0, dec = 0;
+                char bdf_str[20];
+                snprintf(bdf_str, sizeof(bdf_str), "%04x:%02x:%02x.%d", bdf.domain_number,
+                         bdf.bus_number, bdf.device_number, bdf.function_number);
+                int num = 0;
+                ret = amdsmi_get_process_list(device_handles[j], process_list,
+                                              &num_process);
+                CHK_AMDSMI_RET(ret)
+                for (uint32_t it = 0; it < num_process; it += 1) {
+                    if (getpid() == process_list[it]) {  // leave this example's own process out
+                        continue;
+                    }
+                    ret = amdsmi_get_process_info(device_handles[j],
+                                                  process_list[it], &process);
+                    if (ret != AMDSMI_STATUS_SUCCESS) {
+                        printf("amdsmi_get_process_info() failed for "
+                               "process_list[%u], returned %d\n",
+                               it, ret);
+                        continue;
+                    }
+                    info_list[num++] = process;
+                }
+                qsort(info_list, num, sizeof(info_list[0]), compare);
+                printf("+=======+==================+============+=============="
+                       "+=============+=============+=============+============"
+                       "==+=========================================+\n");
+                printf(
+                    "| pid | name | user | gpu bdf | "
+                    "fb usage | gtt memory | cpu memory | vram memory | "
+                    "ring usage (%%) |\n");
+                printf("| | | | "
+                       "| | | | "
+                       " | gfx comp dma enc dec |\n");
+                printf("+=======+==================+============+=============="
+                       "+=============+=============+=============+============"
+                       "==+=========================================+\n");
+                for (int it = 0; it < num; it++) {
+                    char command[30];
+                    struct passwd *pwd = nullptr;
+                    struct stat st;
+
+                    snprintf(command, sizeof(command), "/proc/%d", info_list[it].pid);
+                    if (stat(command, &st))
+                        continue;  // stat() on the /proc entry failed: skip this row
+                    pwd = getpwuid(st.st_uid);
+                    if (!pwd)  // no passwd entry: print the numeric uid instead of a name
+                        printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB "
+                               "| %7ld KiB | %7ld KiB | %6.2f %6.2f %6.2f "
+                               "%6.2f %6.2f |\n",
+                               info_list[it].pid, info_list[it].name, st.st_uid,
+                               bdf_str, info_list[it].mem / 1024,
+                               info_list[it].memory_usage.gtt_mem / 1024,
+                               info_list[it].memory_usage.cpu_mem / 1024,
+                               info_list[it].memory_usage.vram_mem / 1024,
+                               sum_item(info_list[it].engine_usage.gfx),
+                               sum_item(info_list[it].engine_usage.compute),
+                               sum_item(info_list[it].engine_usage.sdma),
+                               sum_item(info_list[it].engine_usage.enc),
+                               sum_item(info_list[it].engine_usage.dec));
+                    else
+                        printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB "
+                               "| %7ld KiB | %7ld KiB | %6.2f %6.2f %6.2f "
+                               "%6.2f %6.2f |\n",
+                               info_list[it].pid, info_list[it].name,
+                               pwd->pw_name, bdf_str, info_list[it].mem / 1024,
+                               info_list[it].memory_usage.gtt_mem / 1024,
+                               info_list[it].memory_usage.cpu_mem / 1024,
+                               info_list[it].memory_usage.vram_mem / 1024,
+                               sum_item(info_list[it].engine_usage.gfx),
+                               sum_item(info_list[it].engine_usage.compute),
+                               sum_item(info_list[it].engine_usage.sdma),
+                               sum_item(info_list[it].engine_usage.enc),
+                               sum_item(info_list[it].engine_usage.dec));
+                    mem += info_list[it].mem / 1024;
+                    gtt_mem += info_list[it].memory_usage.gtt_mem / 1024;
+                    cpu_mem += info_list[it].memory_usage.cpu_mem / 1024;
+                    vram_mem += info_list[it].memory_usage.vram_mem / 1024;
+                    gfx += sum_item(info_list[it].engine_usage.gfx);
+                    comp += sum_item(info_list[it].engine_usage.compute);
+                    dma += sum_item(info_list[it].engine_usage.sdma);
+                    enc += sum_item(info_list[it].engine_usage.enc);
+                    dec += sum_item(info_list[it].engine_usage.dec);
+                    printf(
+                        "+-------+------------------+------------+-------------"
+                        "-+-------------+-------------+-------------+----------"
+                        "----+-----------------------------------------+\n");
+                }
+                printf("| TOTAL:| %s | %7ld "
+                       "KiB | %7ld KiB | %7ld KiB | %7ld KiB | %6.2f %6.2f "
+                       "%6.2f %6.2f %6.2f |\n",
+                       bdf_str, mem, gtt_mem, cpu_mem, vram_mem, gfx, comp, dma,
+                       enc, dec);
+                printf("+=======+==================+============+=============="
+                       "+=============+=============+=============+============"
+                       "==+=========================================+\n");
+            }
+
+            // Get board info (product name and serial numbers)
+            amdsmi_board_info board_info = {};
+            ret = amdsmi_get_board_info(device_handles[j], &board_info);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_board_info:\n");
+            std::cout << "\tdevice [" << j
+                      << "]\n\t\tProduct name: " << board_info.product_name
+                      << "\n"
+                      << "\t\tProduct number: " << board_info.product_serial
+                      << "\n"
+                      << "\t\tSerial number: " << board_info.serial_number
+                      << "\n\n";
+
+            // Get temperature
+            int64_t val_i64 = 0;
+            ret = amdsmi_dev_temp_metric_get(device_handles[j], 0,
+                                             AMDSMI_TEMP_CURRENT, &val_i64);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_dev_temp_metric_get:\n");
+            std::cout << "\t\tTemperature: " << val_i64 / 1000 << "C"
+                      << "\n\n";
+
+            // Get frame buffer
+            amdsmi_vram_info_t vram_usage = {};
+            ret = amdsmi_get_vram_usage(device_handles[j], &vram_usage);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_vram_usage:\n");
+            std::cout << "\t\tFrame buffer usage (MB): " << vram_usage.vram_used
+                      << "/" << vram_usage.vram_total << "\n\n";
+
+            // Get Cap info
+            amdsmi_gpu_caps_t caps_info = {};
+            ret = amdsmi_get_caps_info(device_handles[j], &caps_info);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_caps_info:\n");
+            std::cout << "\t\tGFX IP Major: " << caps_info.gfx.gfxip_major
+                      << "\n"
+                      << "\t\tGFX IP Minor: " << caps_info.gfx.gfxip_minor
+                      << "\n"
+                      << "\t\tCU IP Count: " << caps_info.gfx.gfxip_cu_count
+                      << "\n"
+                      << "\t\tDMA IP Count: " << caps_info.dma_ip_count << "\n"
+                      << "\t\tGFX IP Count: " << caps_info.gfx_ip_count << "\n"
+                      << "\t\tMM IP Count: " << int(caps_info.mm.mm_ip_count)
+                      << "\n\n";
+
+            amdsmi_power_cap_info cap_info = {};
+            ret = amdsmi_get_power_cap_info(device_handles[j], 0, &cap_info);
+            CHK_AMDSMI_RET(ret)
+            printf(" Output of amdsmi_get_power_cap_info:\n");
+            std::cout << "\t\t Power Cap: " << cap_info.power_cap
+                      << "W\n";
+            std::cout << "\t\t Dpm Cap: " << cap_info.dpm_cap
+                      << "\n\n";
+        }
+    }
+
+    // Clean up resources allocated at amdsmi_init. It will invalidate sockets
+    // and devices pointers
+    ret = amdsmi_shut_down();
+    CHK_AMDSMI_RET(ret)
+
+    return 0;
+}
diff --git a/example/amd_smi_example.cc b/example/amd_smi_example.cc deleted file mode 100644 index 195375069e..0000000000 --- a/example/amd_smi_example.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* - * ============================================================================= - * The University of Illinois/NCSA - * Open Source License (NCSA) - * - * Copyright (c) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Developed by: - * - * AMD Research and AMD ROC Software Development - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal with the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of , - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS WITH THE SOFTWARE. - * - */ -#include -#include -#include - -#include -#include -#include -#include "amd_smi/amd_smi.h" - -#define CHK_AMDSMI_RET(RET) { \ - if (RET != AMDSMI_STATUS_SUCCESS) { \ - const char *err_str; \ - std::cout << "AMDSMI call returned " << RET \ - << " at line " << __LINE__ << std::endl; \ - amdsmi_status_string(RET, &err_str); \ - std::cout << err_str << std::endl; \ - return RET; \ - } \ -} - -int main() { - amdsmi_status_t ret; - - // Init amdsmi for sockets and devices. - // Here we are only interested in AMD_GPUS. - ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS); - CHK_AMDSMI_RET(ret) - - // Get all sockets - uint32_t socket_count = 0; - amdsmi_socket_handle* sockets = nullptr; - ret = amdsmi_get_socket_handles(&socket_count, &sockets); - CHK_AMDSMI_RET(ret) - std::cout << "Total Socket: " << socket_count << std::endl; - - // For each socket, get identifier and devices - for (uint32_t i = 0; i < socket_count; i++) { - // Get Socket info - char socket_name[128]; - ret = amdsmi_get_socket_info(sockets[i], socket_name, 128); - CHK_AMDSMI_RET(ret) - std::cout << "Socket " << socket_name << std::endl; - - // Get all devices of the socket - uint32_t device_count = 0; - amdsmi_device_handle* device_handles = nullptr; - ret = amdsmi_get_device_handles(sockets[i], - &device_count, &device_handles); - CHK_AMDSMI_RET(ret) - - // For each device of the socket, get name and temperature. - for (uint32_t j = 0; j < device_count; j++) { - // Get device type. Since the amdsmi is initialized with - // AMDSMI_INIT_AMD_GPUS, the device_type must be AMD_GPU. 
- device_type_t device_type; - ret = amdsmi_get_device_type(device_handles[j], &device_type); - CHK_AMDSMI_RET(ret) - if (device_type != AMD_GPU) { - std::cout << "Expect AMD_GPU device type!\n"; - return 1; - } - - // Get device name - amdsmi_board_info board_info; - ret = amdsmi_get_board_info(device_handles[j], &board_info); - CHK_AMDSMI_RET(ret) - std::cout << "\tdevice " - << j << "\n\t\tName:" << board_info.product_name << std::endl; - - // Get temperature - int64_t val_i64 = 0; - ret = amdsmi_dev_temp_metric_get(device_handles[j], 0, - AMDSMI_TEMP_CURRENT, &val_i64); - CHK_AMDSMI_RET(ret) - std::cout << "\t\tTemperature: " << val_i64/1000 << "C" << std::endl; - - // Get frame buffer - amdsmi_vram_info_t vram_usage; - ret = amdsmi_get_vram_usage(device_handles[j], &vram_usage); - CHK_AMDSMI_RET(ret) - std::cout << "\t\tFrame buffer usage (MB): " << vram_usage.vram_used << "/" - << vram_usage.vram_total << std::endl; - - // Get Cap info - amdsmi_gpu_caps_t caps_info = {}; - ret = amdsmi_get_caps_info(device_handles[j], &caps_info); - CHK_AMDSMI_RET(ret) - std::cout << "\t\tGFX IP Major: " << caps_info.gfx.gfxip_major << "\n"; - std::cout << "\t\tGFX IP Minor: " << caps_info.gfx.gfxip_minor << "\n"; - std::cout << "\t\tCU IP Count: " << caps_info.gfx.gfxip_cu_count << "\n"; - std::cout << "\t\tDMA IP Count: " << caps_info.dma_ip_count << "\n"; - std::cout << "\t\tGFX IP Count: " << caps_info.gfx_ip_count << "\n"; - std::cout << "\t\tMM IP Count: " << int(caps_info.mm.mm_ip_count) << "\n"; - } - } - - // Clean up resources allocated at amdsmi_init. 
It will invalidate sockets - // and devices pointers - ret = amdsmi_shut_down(); - CHK_AMDSMI_RET(ret) - - return 0; -} - diff --git a/example/amd_smi_nodrm_example.cc b/example/amd_smi_nodrm_example.cc new file mode 100644 index 0000000000..3fb96bcdde --- /dev/null +++ b/example/amd_smi_nodrm_example.cc @@ -0,0 +1,335 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ +#include +#include +#include + +#include "amd_smi/amd_smi.h" +#include +#include +#include +#include +#include + +#define CHK_AMDSMI_RET(RET) \ + { \ + if (RET != AMDSMI_STATUS_SUCCESS) { \ + const char *err_str; \ + std::cout << "AMDSMI call returned " << RET << " at line " \ + << __LINE__ << std::endl; \ + amdsmi_status_string(RET, &err_str); \ + std::cout << err_str << std::endl; \ + return RET; \ + } \ + } + +int main() { + amdsmi_status_t ret; + + // Init amdsmi for sockets and devices. + // Here we are only interested in AMD_GPUS. + ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS); + CHK_AMDSMI_RET(ret) + + // Get all sockets + uint32_t socket_count = 0; + amdsmi_socket_handle *sockets = nullptr; + ret = amdsmi_get_socket_handles(&socket_count, &sockets); + CHK_AMDSMI_RET(ret) + std::cout << "Total Socket: " << socket_count << std::endl; + + // For each socket, get identifier and devices + for (uint32_t i = 0; i < socket_count; i++) { + // Get Socket info + char socket_name[128]; + ret = amdsmi_get_socket_info(sockets[i], socket_name, 128); + CHK_AMDSMI_RET(ret) + std::cout << "Socket " << socket_name << std::endl; + + // Get all devices of the socket + uint32_t device_count = 0; + amdsmi_device_handle *device_handles = nullptr; + ret = amdsmi_get_device_handles(sockets[i], &device_count, + &device_handles); + CHK_AMDSMI_RET(ret) + + // For each device of the socket, get name and temperature. + for (uint32_t j = 0; j < device_count; j++) { + // Get device type. Since the amdsmi is initialized with + // AMD_SMI_INIT_AMD_GPUS, the device_type must be AMD_GPU. 
+ device_type_t device_type = {}; + ret = amdsmi_get_device_type(device_handles[j], &device_type); + CHK_AMDSMI_RET(ret) + if (device_type != AMD_GPU) { + std::cout << "Expect AMD_GPU device type!\n"; + return AMDSMI_STATUS_NOT_SUPPORTED; + } + + amdsmi_bdf_t bdf = {}; + ret = amdsmi_get_device_bdf(device_handles[j], &bdf); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_device_bdf:\n"); + printf("\tDevice[%d] BDF %04x:%02x:%02x.%d\n\n", i, + bdf.domain_number, bdf.bus_number, bdf.device_number, + bdf.function_number); + + amdsmi_asic_info_t asic_info = {}; + ret = amdsmi_get_asic_info(device_handles[j], &asic_info); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_asic_info:\n"); + printf("\tMarket Name: %s\n", asic_info.market_name); + printf("\tFamilyID: 0x%x\n", asic_info.family); + printf("\tDeviceID: 0x%x\n", asic_info.device_id); + printf("\tVendorID: 0x%x\n", asic_info.vendor_id); + printf("\tRevisionID: 0x%x\n", asic_info.rev_id); + printf("\tAsic serial: 0x%s\n\n", asic_info.asic_serial); + + // Get VBIOS info + amdsmi_vbios_info_t vbios_info = {}; + ret = amdsmi_get_vbios_info(device_handles[j], &vbios_info); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_vbios_info:\n"); + printf("\tVBios Name: %s\n", vbios_info.name); + printf("\tBuild Date: %s\n", vbios_info.build_date); + printf("\tPart Number: %s\n", vbios_info.part_number); + printf("\tVBios Version: %d\n", vbios_info.vbios_version); + printf("\tVBios Version String: %s\n\n", + vbios_info.vbios_version_string); + + // Get engine usage info + amdsmi_engine_usage_t engine_usage = {}; + ret = amdsmi_get_gpu_activity(device_handles[j], &engine_usage); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_gpu_activity:\n"); + printf("\tAverage GFX Activity: %d\n", + engine_usage.average_gfx_activity); + printf("\tAverage MM Activity: %d\n", + engine_usage.average_mm_activity[0]); + printf("\tAverage UMC Activity: %d\n\n", + engine_usage.average_umc_activity); + + // Get firmware info + 
amdsmi_fw_info_t fw_information = {}; + ret = amdsmi_get_fw_info(device_handles[j], &fw_information); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_fw_info:\n"); + printf("\tFirmware version: %d\n", fw_information.num_fw_info); + printf("\tSMU: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_SMU] + .fw_version); + printf("\tSMC: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_SMC] + .fw_version); + printf("\tVCN: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_VCN] + .fw_version); + printf("\tCP_ME: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_ME] + .fw_version); + printf("\tCP_PFP: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_PFP] + .fw_version); + printf("\tCP_CE: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_CE] + .fw_version); + printf("\tRLC: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_RLC] + .fw_version); + printf("\tCP_MEC1: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_MEC1] + .fw_version); + printf("\tCP_MEC2: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_CP_MEC2] + .fw_version); + printf("\tSDMA0: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_SDMA0] + .fw_version); + printf("\tMC: %ld\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_MC] + .fw_version); + printf("\tRLC RESTORE LIST CNTL: %ld\n", + fw_information + .fw_info_list + [amdsmi_fw_block_t::FW_ID_RLC_RESTORE_LIST_CNTL] + .fw_version); + printf("\tRLC RESTORE LIST GPM MEM: %ld\n", + fw_information + .fw_info_list + [amdsmi_fw_block_t::FW_ID_RLC_RESTORE_LIST_GPM_MEM] + .fw_version); + printf("\tRLC RESTORE LIST SRM MEM: %ld\n", + fw_information + .fw_info_list + [amdsmi_fw_block_t::FW_ID_RLC_RESTORE_LIST_SRM_MEM] + .fw_version); + printf( + "\tPSP SOSDRV: %ld\n\n", + fw_information.fw_info_list[amdsmi_fw_block_t::FW_ID_PSP_SOSDRV] + .fw_version); + + // Get temperature measurements + amdsmi_temperature_t 
temp_measurements[4]; + amdsmi_temperature_type_t temp_types[4] = { + TEMPERATURE_TYPE_EDGE, TEMPERATURE_TYPE_JUNCTION, + TEMPERATURE_TYPE_VRAM, TEMPERATURE_TYPE_PLX}; + for (const auto &temp_type : temp_types) { + ret = amdsmi_get_temperature_measure( + device_handles[j], temp_type, + &temp_measurements[(int)(temp_type)]); + CHK_AMDSMI_RET(ret) + } + printf(" Output of amdsmi_get_temperature_measure:\n"); + printf("\tGPU Edge temp measurement: %d\n", + temp_measurements[TEMPERATURE_TYPE_EDGE].cur_temp); + printf("\tGPU Junction temp measurement: %d\n", + temp_measurements[TEMPERATURE_TYPE_JUNCTION].cur_temp); + printf("\tGPU VRAM temp measurement: %d\n", + temp_measurements[TEMPERATURE_TYPE_VRAM].cur_temp); + printf("\tGPU PLX temp measurement: %d\n\n", + temp_measurements[TEMPERATURE_TYPE_PLX].cur_temp); + + // Get bad pages + char bad_page_status_names[3][15] = {"RESERVED", "PENDING", + "UNRESERVABLE"}; + uint32_t num_pages = 0; + ret = amdsmi_get_bad_page_info(device_handles[j], &num_pages, + nullptr); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_bad_page_info:\n"); + if (!num_pages) { + printf("\tNo bad pages found.\n"); + } else { + amdsmi_retired_page_record_t bad_page_info[num_pages] = {}; + ret = amdsmi_get_bad_page_info(device_handles[j], &num_pages, + bad_page_info); + CHK_AMDSMI_RET(ret) + for (uint32_t page_it = 0; page_it < num_pages; page_it += 1) { + printf(" Page[%d]\n", page_it); + printf("\tAddress: %lu\n", + bad_page_info[page_it].page_address); + printf("\tSize: %lu\n", bad_page_info[page_it].page_size); + printf( + "\tStatus: %s\n", + bad_page_status_names[bad_page_info[page_it].status]); + } + } + printf("\n"); + + // Get ECC error counts + amdsmi_error_count_t err_cnt_info = {}; + ret = amdsmi_get_ecc_error_count(device_handles[j], &err_cnt_info); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_ecc_error_count:\n"); + printf("\tCorrectable errors: %lu\n", err_cnt_info.correctable_count); + printf("\tUncorrectable errors: 
%lu\n\n", + err_cnt_info.uncorrectable_count); + // Get process list + auto compare = [](const void *a, const void *b) -> int { + return (*(amdsmi_proc_info_t *)a).pid > + (*(amdsmi_proc_info_t *)b).pid + ? 1 + : -1; + }; + + // Get device name + amdsmi_board_info board_info = {}; + ret = amdsmi_get_board_info(device_handles[j], &board_info); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_board_info:\n"); + std::cout << "\tdevice [" << j + << "]\n\t\tProduct name: " << board_info.product_name + << "\n" + << "\t\tProduct number: " << board_info.product_serial + << "\n" + << "\t\tSerial number: " << board_info.serial_number + << "\n\n"; + + // Get temperature + int64_t val_i64 = 0; + ret = amdsmi_dev_temp_metric_get(device_handles[j], 0, + AMDSMI_TEMP_CURRENT, &val_i64); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_dev_temp_metric_get:\n"); + std::cout << "\t\tTemperature: " << val_i64 / 1000 << "C" + << "\n\n"; + + // Get frame buffer + amdsmi_vram_info_t vram_usage = {}; + ret = amdsmi_get_vram_usage(device_handles[j], &vram_usage); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_vram_usage:\n"); + std::cout << "\t\tFrame buffer usage (MB): " << vram_usage.vram_used + << "/" << vram_usage.vram_total << "\n\n"; + + // Get Cap info + amdsmi_gpu_caps_t caps_info = {}; + ret = amdsmi_get_caps_info(device_handles[j], &caps_info); + CHK_AMDSMI_RET(ret) + printf(" Output of amdsmi_get_caps_info:\n"); + std::cout << "\t\tGFX IP Major: " << caps_info.gfx.gfxip_major + << "\n" + << "\t\tGFX IP Minor: " << caps_info.gfx.gfxip_minor + << "\n" + << "\t\tCU IP Count: " << caps_info.gfx.gfxip_cu_count + << "\n" + << "\t\tDMA IP Count: " << caps_info.dma_ip_count << "\n" + << "\t\tGFX IP Count: " << caps_info.gfx_ip_count << "\n" + << "\t\tMM IP Count: " << int(caps_info.mm.mm_ip_count) + << "\n\n"; + + amdsmi_power_cap_info cap_info = {}; + ret = amdsmi_get_power_cap_info(device_handles[j], 0, &cap_info); + CHK_AMDSMI_RET(ret) + printf(" Output of 
amdsmi_get_power_cap_info:\n"); + std::cout << "\t\t Power Cap: " << cap_info.power_cap / 1000000 + << "W\n\n"; + } + } + + // Clean up resources allocated at amdsmi_init. It will invalidate sockets + // and devices pointers + ret = amdsmi_shut_down(); + CHK_AMDSMI_RET(ret) + + return 0; +} diff --git a/include/amd_smi/amd_smi.h b/include/amd_smi/amd_smi.h index 318d8beeba..76501c501e 100644 --- a/include/amd_smi/amd_smi.h +++ b/include/amd_smi/amd_smi.h @@ -104,7 +104,7 @@ typedef enum device_type { /** * @brief Error codes retured by amd_smi_lib functions */ -typedef enum amdsmi_status { +typedef enum amdsmi_status_t { AMDSMI_STATUS_SUCCESS = 0, /**< Call succeeded */ AMDSMI_STATUS_INVAL, /**< Invalid parameters */ AMDSMI_STATUS_NOT_SUPPORTED, /**< Command not supported */ @@ -334,15 +334,15 @@ typedef struct amdsmi_board_info { } amdsmi_board_info_t; typedef struct amdsmi_temperature { - uint16_t cur_temp; + uint32_t cur_temp; } amdsmi_temperature_t; typedef struct amdsmi_temperature_limit { - uint16_t limit; + uint32_t limit; } amdsmi_temperature_limit_t; typedef struct amdsmi_power_limit { - uint16_t limit; + uint32_t limit; } amdsmi_power_limit_t; typedef struct amdsmi_power_measure { @@ -369,17 +369,22 @@ typedef struct amdsmi_engine_usage { typedef uint32_t amdsmi_process_handle; typedef struct amdsmi_process_info { - char name[AMDSMI_NORMAL_STRING_LENGTH]; - amdsmi_process_handle pid; - uint64_t mem; /** in bytes */ + char name[AMDSMI_NORMAL_STRING_LENGTH]; + amdsmi_process_handle pid; + uint64_t mem; /** in bytes */ + struct { + uint16_t gfx[AMDSMI_MAX_MM_IP_COUNT]; + uint16_t compute[AMDSMI_MAX_MM_IP_COUNT]; + uint16_t sdma[AMDSMI_MAX_MM_IP_COUNT]; + uint16_t enc[AMDSMI_MAX_MM_IP_COUNT]; + uint16_t dec[AMDSMI_MAX_MM_IP_COUNT]; + } engine_usage; /** percentage 0-100% times 100 */ struct { - uint16_t gfx[AMDSMI_MAX_MM_IP_COUNT]; - uint16_t compute[AMDSMI_MAX_MM_IP_COUNT]; - uint16_t sdma[AMDSMI_MAX_MM_IP_COUNT]; - uint16_t enc[AMDSMI_MAX_MM_IP_COUNT]; - 
uint16_t dec[AMDSMI_MAX_MM_IP_COUNT]; - } usage; /** percentage 0-100% times 100 */ - char container_name[AMDSMI_NORMAL_STRING_LENGTH]; + uint64_t gtt_mem; + uint64_t cpu_mem; + uint64_t vram_mem; + } memory_usage; /** in bytes */ + char container_name[AMDSMI_NORMAL_STRING_LENGTH]; } amdsmi_proc_info_t; //! Guaranteed maximum possible number of supported frequencies @@ -1071,8 +1076,8 @@ typedef struct { * @brief This structure holds error counts. */ typedef struct { - uint64_t correctable_err; //!< Accumulated correctable errors - uint64_t uncorrectable_err; //!< Accumulated uncorrectable errors + uint64_t correctable_count; //!< Accumulated correctable errors + uint64_t uncorrectable_count; //!< Accumulated uncorrectable errors } amdsmi_error_count_t; /** @@ -1795,7 +1800,50 @@ amdsmi_dev_memory_total_get(amdsmi_device_handle device_handle, amdsmi_memory_ty amdsmi_status_t amdsmi_dev_memory_usage_get(amdsmi_device_handle device_handle, amdsmi_memory_type_t mem_type, uint64_t *used); - +/** + * @brief The first call to this API returns the number of bad pages which + * should be used to allocate the buffer that should contain the bad page + * records. + * @details This call will query the device @p device_handle for the + * number of bad pages (written to @p num_pages address). The results are + * written to address held by the @p info pointer. + * @param[in] device_handle a device handle + * @param[out] num_pages Number of bad page records. + * @param[out] info Pointer to amdsmi_retired_page_record_t to which the + * results will be written to. 
+ * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_INVAL the parameters are not valid or nullptr + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED API not supported + */ +amdsmi_status_t +amdsmi_get_bad_page_info(amdsmi_device_handle device_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info); +/** + * @brief Returns if RAS features are enabled or disabled for given block + * + * @details Given a device handle @p device_handle, this function queries the + * state of RAS features for a specific block @p block. Result will be written + * to address held by pointer @p state. + * + * @param[in] device_handle Device handle which to query + * + * @param[in] block Block which to query + * + * @param[inout] state A pointer to amdsmi_ras_err_state_t to which the state + * of block will be written. + * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL if the function is supported with the provided + * arguments and ::AMDSMI_STATUS_NOT_SUPPORTED if it is not supported with the + * provided arguments. + * + * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments. + * @retval ::AMDSMI_STATUS_INVAL the provided arguments are not valid + * + */ +amdsmi_status_t +amdsmi_get_ras_features_enabled(amdsmi_device_handle device_handle, amdsmi_gpu_block block, + amdsmi_ras_err_state_t *state); /** * @brief Get percentage of time any device memory is being used * @@ -3676,11 +3724,11 @@ amdsmi_status_t amdsmi_event_notification_stop(amdsmi_device_handle device_handl * \param [out] bdf - Reference to BDF. Must be allocated by user. 
* * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - Parameters are invalid - * * -::SMI_ERR_NOT_FOUND - Device cannot be found - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - Parameters are invalid + * * -::AMDSMI_STATUS_NOT_FOUND - Device cannot be found + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ amdsmi_status_t amdsmi_get_device_bdf(amdsmi_device_handle dev, amdsmi_bdf_t *bdf); @@ -3698,13 +3746,13 @@ amdsmi_get_device_bdf(amdsmi_device_handle dev, amdsmi_bdf_t *bdf); * allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - Parameters are invalid - * * -::SMI_ERR_NOT_FOUND - Device cannot be found - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - Parameters are invalid + * * -::AMDSMI_STATUS_NOT_FOUND - Device cannot be found + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_device_uuid(amdsmi_device_handle dev, unsigned int *uuid_length, char *uuid); /** @} */ @@ -3726,13 +3774,13 @@ amdsmi_get_device_uuid(amdsmi_device_handle dev, unsigned int *uuid_length, char * allocated by user. 
* * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NOT_FOUND - Device cannot be found - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NOT_FOUND - Device cannot be found + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_driver_version(amdsmi_device_handle dev, int *length, char *version); /** @} */ @@ -3750,15 +3798,15 @@ amdsmi_get_driver_version(amdsmi_device_handle dev, int *length, char *version); * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_asic_info(amdsmi_device_handle dev, amdsmi_asic_info_t *info); /** @@ -3770,15 +3818,15 @@ amdsmi_get_asic_info(amdsmi_device_handle dev, amdsmi_asic_info_t *info); * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialize - * * -::SMI_ERR_RETRY - Device is busy. 
Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle dev, amdsmi_board_info_t *info); /** @@ -3792,15 +3840,15 @@ amdsmi_get_board_info(amdsmi_device_handle dev, amdsmi_board_info_t *info); * allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. 
Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_power_cap_info(amdsmi_device_handle dev, uint32_t sensor_ind, amdsmi_power_cap_info_t *info); @@ -3814,15 +3862,15 @@ amdsmi_get_power_cap_info(amdsmi_device_handle dev, uint32_t sensor_ind, * * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_xgmi_info(amdsmi_device_handle dev, amdsmi_xgmi_info_t *info); /** @@ -3835,15 +3883,15 @@ amdsmi_get_xgmi_info(amdsmi_device_handle dev, amdsmi_xgmi_info_t *info); * allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. 
Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_caps_info(amdsmi_device_handle dev, amdsmi_gpu_caps_t *info); /** @} */ @@ -3860,15 +3908,15 @@ amdsmi_get_caps_info(amdsmi_device_handle dev, amdsmi_gpu_caps_t *info); * \param [out] info - Reference to the fw info. Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_fw_info(amdsmi_device_handle dev, amdsmi_fw_info_t *info); /** @@ -3880,15 +3928,15 @@ amdsmi_get_fw_info(amdsmi_device_handle dev, amdsmi_fw_info_t *info); * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. 
Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_vbios_info(amdsmi_device_handle dev, amdsmi_vbios_info_t *info); /** @} */ @@ -3906,15 +3954,15 @@ amdsmi_get_vbios_info(amdsmi_device_handle dev, amdsmi_vbios_info_t *info); * \param [out] info - Reference to the gpu engine usage structure. Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_gpu_activity(amdsmi_device_handle dev, amdsmi_engine_usage_t *info); /** @@ -3926,15 +3974,15 @@ amdsmi_get_gpu_activity(amdsmi_device_handle dev, amdsmi_engine_usage_t *info); * \param [out] info - Reference to the gpu power structure. 
Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_power_measure(amdsmi_device_handle dev, amdsmi_power_measure_t *info); /** @@ -3950,15 +3998,15 @@ amdsmi_get_power_measure(amdsmi_device_handle dev, amdsmi_power_measure_t *info) * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_RETRY - Device is busy. 
Please retry + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_clock_measure(amdsmi_device_handle dev, amdsmi_clk_type_t clk_type, amdsmi_clock_measure_t *info); /** @@ -3973,15 +4021,15 @@ amdsmi_get_clock_measure(amdsmi_device_handle dev, amdsmi_clk_type_t clk_type, a * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_temperature_measure(amdsmi_device_handle dev, amdsmi_temperature_type_t temp_type, amdsmi_temperature_t *info); /** @@ -3996,16 +4044,16 @@ amdsmi_get_temperature_measure(amdsmi_device_handle dev, amdsmi_temperature_type * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. 
Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status -amdsmi_get_temperature_limit(amdsmi_device_handle dev, amdsmi_temperature_t temp_type, amdsmi_temperature_limit_t *limit); +amdsmi_status_t +amdsmi_get_temperature_limit(amdsmi_device_handle dev, amdsmi_temperature_type_t temp_type, amdsmi_temperature_limit_t *limit); /** * \brief Returns power limit of the GPU. @@ -4017,15 +4065,15 @@ amdsmi_get_temperature_limit(amdsmi_device_handle dev, amdsmi_temperature_t temp * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. 
Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_power_limit(amdsmi_device_handle dev, amdsmi_power_limit_t *limit); /** @@ -4039,15 +4087,15 @@ amdsmi_get_power_limit(amdsmi_device_handle dev, amdsmi_power_limit_t *limit); * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_vram_usage(amdsmi_device_handle dev, amdsmi_vram_info_t *info); /** @} */ @@ -4072,15 +4120,15 @@ amdsmi_get_vram_usage(amdsmi_device_handle dev, amdsmi_vram_info_t *info); * * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialize - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_INVAL - Parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_RETRY - Device is busy. 
Please retry + * * -::AMDSMI_STATUS_INVAL - Parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_target_frequency_range(amdsmi_device_handle dev, amdsmi_clk_type_t clk_type, amdsmi_frequency_range_t *range); /** @} */ @@ -4109,13 +4157,13 @@ amdsmi_get_target_frequency_range(amdsmi_device_handle dev, amdsmi_clk_type_t cl * in list or the number of running processes if equal to 0. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - The parameters are not valid or NULL - * * -::SMI_ERR_NOMEM - Provided buffer is not large enough - * * -::SMI_ERR_NOT_SUPPORTED - API not supported + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - The parameters are not valid or NULL + * * -::AMDSMI_STATUS_NOMEM - Provided buffer is not large enough + * * -::AMDSMI_STATUS_NOT_SUPPORTED - API not supported */ -amdsmi_status +amdsmi_status_t amdsmi_get_process_list(amdsmi_device_handle dev, amdsmi_process_handle *list, uint32_t *max_processes); /** @@ -4129,12 +4177,12 @@ amdsmi_get_process_list(amdsmi_device_handle dev, amdsmi_process_handle *list, u * information. Must be allocated by user. 
* * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - The parameters are not valid or NULL - * * -::SMI_ERR_NOT_SUPPORTED - API not supported + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - The parameters are not valid or NULL + * * -::AMDSMI_STATUS_NOT_SUPPORTED - API not supported */ -amdsmi_status +amdsmi_status_t amdsmi_get_process_info(amdsmi_device_handle dev, amdsmi_process_handle process, amdsmi_proc_info_t *info); /** @} */ @@ -4154,15 +4202,15 @@ amdsmi_get_process_info(amdsmi_device_handle dev, amdsmi_process_handle process, * Must be allocated by user. * * \return - * * ::SMI_SUCCESS - Successful - * * -::SMI_ERR_RETRY - Device is busy. Please retry - * * -::SMI_ERR_NO_PERM - Library was not initialized - * * -::SMI_ERR_INVAL - The parameters are not valid or NULL - * * -::SMI_ERR_IO - Device is in an unrecoverable state - * * -::SMI_ERR_NOT_INIT - Device is uninitialized - * * -::SMI_ERR_API_FAILED - Other errors + * * ::AMDSMI_STATUS_SUCCESS - Successful + * * -::AMDSMI_STATUS_RETRY - Device is busy. 
Please retry + * * -::AMDSMI_STATUS_NO_PERM - Library was not initialized + * * -::AMDSMI_STATUS_INVAL - The parameters are not valid or NULL + * * -::AMDSMI_STATUS_IO - Device is in an unrecoverable state + * * -::AMDSMI_STATUS_NOT_INIT - Device is uninitialized + * * -::AMDSMI_STATUS_API_FAILED - Other errors */ -amdsmi_status +amdsmi_status_t amdsmi_get_ecc_error_count(amdsmi_device_handle dev, amdsmi_error_count_t *ec); #ifdef __cplusplus diff --git a/include/amd_smi/impl/amd_smi_drm.h b/include/amd_smi/impl/amd_smi_drm.h index b7a1e65025..ba42fcb8e6 100644 --- a/include/amd_smi/impl/amd_smi_drm.h +++ b/include/amd_smi/impl/amd_smi_drm.h @@ -58,7 +58,13 @@ class AMDSmiDrm { public: amdsmi_status_t init(); amdsmi_status_t cleanup(); - int get_drm_fd_by_index(uint32_t gpu_index) const; + amdsmi_status_t get_drm_fd_by_index(uint32_t gpu_index, uint32_t *fd_info) const; + amdsmi_status_t get_bdf_by_index(uint32_t gpu_index, amdsmi_bdf_t *bdf_info) const; + amdsmi_status_t get_drm_path_by_index(uint32_t gpu_index, std::string *drm_path) const; + std::vector get_bdfs(); + std::vector& get_drm_paths(); + bool check_if_drm_is_supported(); + amdsmi_status_t amdgpu_query_info(int fd, unsigned info_id, unsigned size, void *value); amdsmi_status_t amdgpu_query_fw(int fd, unsigned info_id, unsigned fw_type, @@ -70,6 +76,9 @@ class AMDSmiDrm { private: using DrmCmdWriteFunc = int (*)(int, unsigned long, void *, unsigned long); std::vector drm_fds_; // drm file descriptor by gpu_index + std::vector drm_paths_; // drm path (renderD128 for example) + std::vector drm_bdfs_; // bdf + AMDSmiLibraryLoader lib_loader_; // lazy load libdrm DrmCmdWriteFunc drm_cmd_write_; // drmCommandWrite std::mutex drm_mutex_; diff --git a/include/amd_smi/impl/amd_smi_gpu_device.h b/include/amd_smi/impl/amd_smi_gpu_device.h index 4d9bfde313..9183083e09 100644 --- a/include/amd_smi/impl/amd_smi_gpu_device.h +++ b/include/amd_smi/impl/amd_smi_gpu_device.h @@ -47,16 +47,32 @@ #include 
"amd_smi/amd_smi.h" #include "amd_smi/impl/amd_smi_device.h" #include "amd_smi/impl/amd_smi_drm.h" +#include "shared_mutex.h" // NOLINT namespace amd { namespace smi { class AMDSmiGPUDevice: public AMDSmiDevice { public: - explicit AMDSmiGPUDevice(uint32_t gpu_id, AMDSmiDrm& drm): - AMDSmiDevice(AMD_GPU), gpu_id_(gpu_id), drm_(drm) {} + AMDSmiGPUDevice(uint32_t gpu_id, uint32_t fd, std::string path, amdsmi_bdf_t bdf, AMDSmiDrm& drm): + AMDSmiDevice(AMD_GPU), gpu_id_(gpu_id), fd_(fd), path_(path), bdf_(bdf), drm_(drm) {} + AMDSmiGPUDevice(uint32_t gpu_id, AMDSmiDrm& drm): + AMDSmiDevice(AMD_GPU), gpu_id_(gpu_id), drm_(drm) { + if (check_if_drm_is_supported()) this->get_drm_data(); + } + ~AMDSmiGPUDevice() { + if (check_if_drm_is_supported()) shared_mutex_close(mutex_); + } + + amdsmi_status_t get_drm_data(); + pthread_mutex_t* get_mutex(); uint32_t get_gpu_id() const; + uint32_t get_gpu_fd() const; + std::string& get_gpu_path(); + amdsmi_bdf_t get_bdf(); + bool check_if_drm_is_supported() { return drm_.check_if_drm_is_supported(); } + amdsmi_status_t amdgpu_query_info(unsigned info_id, unsigned size, void *value) const; amdsmi_status_t amdgpu_query_hw_ip(unsigned info_id, unsigned hw_ip_type, @@ -66,7 +82,11 @@ class AMDSmiGPUDevice: public AMDSmiDevice { amdsmi_status_t amdgpu_query_vbios(void *info) const; private: uint32_t gpu_id_; + uint32_t fd_; + std::string path_; + amdsmi_bdf_t bdf_; AMDSmiDrm& drm_; + shared_mutex_t mutex_; }; diff --git a/include/amd_smi/impl/amd_smi_socket.h b/include/amd_smi/impl/amd_smi_socket.h index 42747d5b44..c4d50a88e9 100644 --- a/include/amd_smi/impl/amd_smi_socket.h +++ b/include/amd_smi/impl/amd_smi_socket.h @@ -60,6 +60,7 @@ class AMDSmiSocket { const std::string& get_socket_id() const { return socket_identifier_;} void add_device(AMDSmiDevice* device) { devices_.push_back(device); } std::vector& get_devices() { return devices_;} + amdsmi_status_t get_device_count(uint32_t* device_count) const; private: std::string 
socket_identifier_; std::vector devices_; diff --git a/include/amd_smi/impl/amd_smi_utils.h b/include/amd_smi/impl/amd_smi_utils.h new file mode 100644 index 0000000000..47977b17d3 --- /dev/null +++ b/include/amd_smi/impl/amd_smi_utils.h @@ -0,0 +1,44 @@ +/* * Copyright (C) 2022 Advanced Micro Devices. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ +#define AMD_SMI_INCLUDE_AMD_SMI_UTILS_H_ + +#include "amd_smi/amd_smi.h" +#include "amd_smi/impl/amd_smi_gpu_device.h" +#include "rocm_smi/rocm_smi_utils.h" + + +#define SMIGPUDEVICE_MUTEX(MUTEX) \ + amd::smi::pthread_wrap _pw(*(MUTEX)); \ + amd::smi::ScopedPthread _lock(_pw, true); \ + if (_lock.mutex_not_acquired()) { \ + return AMDSMI_STATUS_BUSY; \ + } + +amdsmi_status_t smi_amdgpu_find_hwmon_dir(amd::smi::AMDSmiGPUDevice* device, std::string* full_path); +amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amdsmi_board_info_t *info); +amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int *cap); +amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm); +amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks); +amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info); +amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt); + +#endif // diff --git a/include/amd_smi/impl/fdinfo.h b/include/amd_smi/impl/fdinfo.h new file mode 100644 index 0000000000..c37e60cf5b --- /dev/null +++ b/include/amd_smi/impl/fdinfo.h @@ -0,0 +1,38 @@ +/* + * Copyright 2022 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#ifndef __FDINFO__ +#define __FDINFO__ + +#ifdef __cplusplus +extern "C" { +#endif + +amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector &pids, uint64_t *size); +amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, amdsmi_proc_info_t &info); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index de3e0d7852..1b32e08045 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -50,12 +50,14 @@ #include #include #include +#include #include #include #include #include #include #include "amd_smi/amd_smi.h" +#include "amd_smi/impl/fdinfo.h" #include "amd_smi/impl/amd_smi_common.h" #include "amd_smi/impl/amd_smi_system.h" #include "amd_smi/impl/amd_smi_socket.h" @@ -63,6 +65,7 @@ #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_common.h" #include "amd_smi/impl/amdgpu_drm.h" +#include "amd_smi/impl/amd_smi_utils.h" // TODO(bliu): One to one map to all status code static amdsmi_status_t rsmi_to_amdsmi_status(rsmi_status_t status) { @@ -178,7 +181,9 @@ amdsmi_status_t amdsmi_get_device_handles(amdsmi_socket_handle socket_handle, .handle_to_socket(socket_handle, &socket); if (r != AMDSMI_STATUS_SUCCESS) return r; - *device_count = static_cast(socket->get_devices().size()); + r = socket->get_device_count(device_count); + if (r != AMDSMI_STATUS_SUCCESS) return r; + *device_handles = reinterpret_cast( socket->get_devices().data()); return AMDSMI_STATUS_SUCCESS; @@ -198,54 +203,44 @@ amdsmi_status_t amdsmi_get_device_type(amdsmi_device_handle device_handle , return AMDSMI_STATUS_SUCCESS; } -amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle, - amdsmi_board_info_t *info) { - if (info == NULL) { +amdsmi_status_t +amdsmi_get_device_bdf(amdsmi_device_handle device_handle, amdsmi_bdf_t *bdf) { + + if (bdf == NULL) { return AMDSMI_STATUS_INVAL; } - memset(info, 0, sizeof(amdsmi_board_info_t)); + amd::smi::AMDSmiGPUDevice* 
gpu_device = + static_cast(device_handle); - // ignore errors so that if the function is not supported, - // it will continue to add other info. - auto r = rsmi_wrapper(rsmi_dev_name_get, device_handle, - info->product_name, AMDSMI_PRODUCT_NAME_LENGTH); - - r = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle, - info->product_serial, AMDSMI_NORMAL_STRING_LENGTH); + if (gpu_device->check_if_drm_is_supported()) { + *bdf = gpu_device->get_bdf(); + } + else { + //TODO + } return AMDSMI_STATUS_SUCCESS; } -// TODO(bliu) : add other asic info -amdsmi_status amdsmi_get_asic_info(amdsmi_device_handle device_handle, - amdsmi_asic_info_t *info) { - if (info == nullptr) +amdsmi_status_t amdsmi_get_board_info(amdsmi_device_handle device_handle, amdsmi_board_info_t *board_info) { + if (board_info == NULL) { return AMDSMI_STATUS_INVAL; - memset(info, 0, sizeof(amdsmi_asic_info_t)); + } - auto r = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle, - info->asic_serial, AMDSMI_NORMAL_STRING_LENGTH); + amdsmi_status_t status; + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(device_handle); - r = rsmi_wrapper(rsmi_dev_brand_get, device_handle, - info->market_name, AMDSMI_NORMAL_STRING_LENGTH); + if (gpu_device->check_if_drm_is_supported()) { + status = smi_amdgpu_get_board_info(gpu_device, board_info); + } + else { + status = rsmi_wrapper(rsmi_dev_name_get, device_handle, board_info->product_name, AMDSMI_PRODUCT_NAME_LENGTH); + status = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle, board_info->product_serial, AMDSMI_NORMAL_STRING_LENGTH); + } - uint16_t vendor_id = 0; - r = rsmi_wrapper(rsmi_dev_vendor_id_get, device_handle, - &vendor_id); - if ( r == AMDSMI_STATUS_SUCCESS) - info->vendor_id = static_cast(vendor_id); - - r = rsmi_wrapper(rsmi_dev_unique_id_get, device_handle, - &(info->device_id)); - - vendor_id = 0; - r = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle, - &vendor_id); - if ( r == AMDSMI_STATUS_SUCCESS) - info->subvendor_id = 
static_cast(vendor_id); - - return AMDSMI_STATUS_SUCCESS; + return status; } amdsmi_status_t amdsmi_dev_temp_metric_get(amdsmi_device_handle device_handle, @@ -333,7 +328,7 @@ amdsmi_status_t amdsmi_get_caps_info(amdsmi_device_handle device_handle, sizeof(struct drm_amdgpu_info_device), &device); if (r != AMDSMI_STATUS_SUCCESS) return r; - info->gfx.gfxip_cu_count = device.cu_active_number; + info->gfx.gfxip_cu_count = (uint16_t)device.cu_active_number; r = gpu_device->amdgpu_query_hw_ip(AMDGPU_INFO_HW_IP_INFO, AMDGPU_HW_IP_GFX, sizeof(ip), &ip); @@ -392,16 +387,6 @@ amdsmi_status_t amdsmi_get_caps_info(amdsmi_device_handle device_handle, return AMDSMI_STATUS_SUCCESS; } -// TODO(bliu): add more vbios info -amdsmi_status amdsmi_get_vbios_info(amdsmi_device_handle device_handle, - amdsmi_vbios_info_t *info) { - if (info == nullptr) { - return AMDSMI_STATUS_INVAL; - } - return rsmi_wrapper(rsmi_dev_vbios_version_get, device_handle, - info->vbios_version_string, AMDSMI_NORMAL_STRING_LENGTH); -} - amdsmi_status_t amdsmi_dev_fan_rpms_get(amdsmi_device_handle device_handle, uint32_t sensor_ind, int64_t *speed) { return rsmi_wrapper(rsmi_dev_fan_rpms_get, device_handle, sensor_ind, @@ -437,7 +422,7 @@ amdsmi_status_t amdsmi_dev_id_get(amdsmi_device_handle device_handle, } // TODO(bliu) : add fw info from libdrm -amdsmi_status amdsmi_get_fw_info(amdsmi_device_handle dev, +amdsmi_status_t amdsmi_get_fw_info(amdsmi_device_handle dev, amdsmi_fw_info_t *info) { const std::map fw_in_rsmi = { { FW_ID_ASD, RSMI_FW_BLOCK_ASD}, @@ -477,6 +462,66 @@ amdsmi_status amdsmi_get_fw_info(amdsmi_device_handle dev, info->num_fw_info++; } } + return AMDSMI_STATUS_SUCCESS; +} + +// TODO(bliu) : add other asic info +amdsmi_status_t +amdsmi_get_asic_info(amdsmi_device_handle device_handle, amdsmi_asic_info_t *info) { + if (info == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + struct drm_amdgpu_info_device dev_info = {}; + struct drm_amdgpu_info_vbios vbios = {}; + char* name; + char *tmp; + + 
amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(device_handle); + amdsmi_status_t status; + if (gpu_device->check_if_drm_is_supported()){ + status = gpu_device->amdgpu_query_info(AMDGPU_INFO_DEV_INFO, sizeof(struct drm_amdgpu_info_device), &dev_info); + if (status != AMDSMI_STATUS_SUCCESS) return status; + status = gpu_device->amdgpu_query_vbios(&vbios); + if (status != AMDSMI_STATUS_SUCCESS) return status; + + SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) + + std::string path = "/sys/class/drm/" + gpu_device->get_gpu_path() + "/device/unique_id"; + FILE *fp = fopen(path.c_str(), "r"); + if (fp) { + fscanf(fp, "%s", &info->asic_serial); + fclose(fp); + } + + name = strtok_r((char *) vbios.name, " ", &tmp); + if (name) + strncpy(info->market_name, name, AMDSMI_MAX_STRING_LENGTH); + + info->device_id = dev_info.device_id; + info->family = dev_info.family; + info->rev_id = dev_info.pci_rev; + } + else { + uint16_t vendor_id = 0; + + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle, + info->asic_serial, AMDSMI_NORMAL_STRING_LENGTH); + + status = rsmi_wrapper(rsmi_dev_brand_get, device_handle, + info->market_name, AMDSMI_NORMAL_STRING_LENGTH); + + status = rsmi_wrapper(rsmi_dev_vendor_id_get, device_handle, + &vendor_id); + if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id; + vendor_id = 0; + + status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle, + &vendor_id); + if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = vendor_id; + return status; + } return AMDSMI_STATUS_SUCCESS; } @@ -664,7 +709,7 @@ amdsmi_is_P2P_accessible(amdsmi_device_handle device_handle_src, } // TODO(bliu) : other xgmi related information -amdsmi_status +amdsmi_status_t amdsmi_get_xgmi_info(amdsmi_device_handle device_handle, amdsmi_xgmi_info_t *info) { if (info == nullptr) return AMDSMI_STATUS_INVAL; @@ -882,29 +927,48 @@ amdsmi_status_t amdsmi_dev_gpu_metrics_info_get( reinterpret_cast(pgpu_metrics)); } -// TODO(bliu): read 
from libdrm -amdsmi_status +amdsmi_status_t amdsmi_get_power_cap_info(amdsmi_device_handle device_handle, uint32_t sensor_ind, amdsmi_power_cap_info_t *info) { if (info == nullptr) return AMDSMI_STATUS_INVAL; - amd::smi::AMDSmiGPUDevice* gpudevice = nullptr; - amdsmi_status_t r = get_gpu_device_from_handle(device_handle, &gpudevice); - if (r != AMDSMI_STATUS_SUCCESS) - return r; + amd::smi::AMDSmiGPUDevice* gpudevice = + static_cast(device_handle); + amdsmi_status_t status; + if (gpudevice->check_if_drm_is_supported()){ + int power_cap = 0; + int dpm = 0; - // Ignore errors to get as much as possible info. - memset(info, 0, sizeof(amdsmi_power_cap_info_t)); - auto rsmi_status = rsmi_dev_power_cap_default_get(gpudevice->get_gpu_id(), - &(info->default_power_cap)); - rsmi_status = rsmi_dev_power_cap_range_get(gpudevice->get_gpu_id(), - sensor_ind, &(info->max_power_cap), &(info->min_power_cap)); - rsmi_status = rsmi_dev_power_cap_get(gpudevice->get_gpu_id(), - sensor_ind, &(info->power_cap)); + status = smi_amdgpu_get_power_cap(gpudevice, &power_cap); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + info->power_cap = power_cap; + + status = smi_amdgpu_get_ranges(gpudevice, CLOCK_TYPE_GFX, + NULL, NULL, &dpm); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + info->dpm_cap = dpm; + + return AMDSMI_STATUS_SUCCESS; + } + else { + // Ignore errors to get as much as possible info. 
+ memset(info, 0, sizeof(amdsmi_power_cap_info_t)); + auto rsmi_status = rsmi_dev_power_cap_default_get(gpudevice->get_gpu_id(), + &(info->default_power_cap)); + rsmi_status = rsmi_dev_power_cap_range_get(gpudevice->get_gpu_id(), + sensor_ind, &(info->max_power_cap), &(info->min_power_cap)); + rsmi_status = rsmi_dev_power_cap_get(gpudevice->get_gpu_id(), + sensor_ind, &(info->power_cap)); + + // TODO(bliu) : dpm_cap + } - // TODO(bliu) : dpm_cap return AMDSMI_STATUS_SUCCESS; } @@ -1192,19 +1256,393 @@ amdsmi_status_t amdsmi_version_str_get(amdsmi_sw_component_t component, return rsmi_to_amdsmi_status(status); } -amdsmi_status amdsmi_get_gpu_activity(amdsmi_device_handle dev, - amdsmi_engine_usage_t *info) { - if (info == nullptr) +amdsmi_status_t +amdsmi_get_vbios_info(amdsmi_device_handle dev, amdsmi_vbios_info_t *info) { + if (info == nullptr) { return AMDSMI_STATUS_INVAL; + } + struct drm_amdgpu_info_vbios vbios = {}; - // Get gpu activity from the gpu_metrics table - amdsmi_gpu_metrics_t gpu_metrics_info; - auto r = amdsmi_dev_gpu_metrics_info_get(dev, &gpu_metrics_info); - if ( r == AMDSMI_STATUS_SUCCESS ) { - info->average_gfx_activity = gpu_metrics_info.average_gfx_activity; - info->average_umc_activity = gpu_metrics_info.average_umc_activity; - info->average_mm_activity[0] = gpu_metrics_info.average_mm_activity; + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + if (gpu_device->check_if_drm_is_supported()){ + status = gpu_device->amdgpu_query_vbios(&vbios); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + strncpy(info->name, (char *) vbios.name, AMDSMI_MAX_STRING_LENGTH); + strncpy(info->build_date, (char *) vbios.date, AMDSMI_MAX_DATE_LENGTH); + strncpy(info->part_number, (char *) vbios.vbios_pn, AMDSMI_MAX_STRING_LENGTH); + strncpy(info->vbios_version_string, (char *) vbios.vbios_ver_str, AMDSMI_NORMAL_STRING_LENGTH); + info->vbios_version = vbios.version; + } + else { + // rocm } - return r; + return 
AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_gpu_activity(amdsmi_device_handle dev, amdsmi_engine_usage_t *info) { + if (info == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amdsmi_gpu_metrics_t metrics = {}; + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + status = amdsmi_dev_gpu_metrics_info_get(dev, &metrics); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + info->average_gfx_activity = metrics.average_gfx_activity; + info->average_mm_activity[0] = metrics.average_mm_activity; + info->average_umc_activity = metrics.average_umc_activity; + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_power_limit(amdsmi_device_handle dev, amdsmi_power_limit_t *limit) { + if (limit == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + int power_limit; + status = smi_amdgpu_get_power_cap(gpu_device, &power_limit); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + limit->limit = (uint16_t)(power_limit); + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_clock_measure(amdsmi_device_handle dev, amdsmi_clk_type_t clk_type, amdsmi_clock_measure_t *info) { + if (info == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + if (clk_type >= CLOCK_TYPE__MAX) { + printf("Domain value greater or equals CLOCK_TYPE__MAX value. 
Return code: %d", AMDSMI_STATUS_INVAL); + return AMDSMI_STATUS_INVAL; + } + + amdsmi_gpu_metrics_t metrics = {}; + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + + status = amdsmi_dev_gpu_metrics_info_get(dev, &metrics); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + int max_freq; + status = smi_amdgpu_get_ranges(gpu_device, clk_type, + &max_freq, NULL, NULL); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + info->max_clk = max_freq; + + switch (clk_type) { + case CLOCK_TYPE_GFX: + info->avg_clk = metrics.average_gfxclk_frequency; + info->cur_clk = metrics.current_gfxclk; + break; + case CLOCK_TYPE_MEM: + info->avg_clk = metrics.average_uclk_frequency; + info->cur_clk = metrics.current_uclk; + break; + case CLOCK_TYPE_VCLK0: + info->avg_clk = metrics.average_vclk0_frequency; + info->cur_clk = metrics.current_vclk0; + break; + case CLOCK_TYPE_VCLK1: + info->avg_clk = metrics.average_vclk1_frequency; + info->cur_clk = metrics.current_vclk1; + break; + default: + return AMDSMI_STATUS_INVAL; + } + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_temperature_limit(amdsmi_device_handle dev, amdsmi_temperature_type_t temp_type, amdsmi_temperature_limit_t *limit) { + if (limit == nullptr || temp_type >= TEMPERATURE_TYPE__MAX) { + return AMDSMI_STATUS_INVAL; + } + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + std::string name; + std::string path; + switch (temp_type) { + case TEMPERATURE_TYPE_EDGE: + name = "edge"; + break; + case TEMPERATURE_TYPE_JUNCTION: + name = "junction"; + break; + case TEMPERATURE_TYPE_VRAM: + name = "mem"; + break; + default: + return AMDSMI_STATUS_INVAL; + } + status = smi_amdgpu_find_hwmon_dir(gpu_device, &path); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + SMIGPUDEVICE_MUTEX(gpu_device->get_mutex()) + + for (int count = 1; ; count++) { + std::string local_path = path + "/temp" + + 
std::to_string(count); + std::string temp = local_path + "_label"; + char f_name[10]; + std::ifstream file(temp.c_str(), std::ifstream::in); + + if (!file.is_open()) { + printf("Failed to open file: %s \n", temp.c_str()); + return AMDSMI_STATUS_API_FAILED; + } + + file.getline(f_name, 10); + + if (!strstr(name.c_str(), f_name)) { + int readTemp = 0; + temp = local_path + "_crit"; + std::ifstream file2(temp.c_str(), std::ifstream::in); + + if (!file2.is_open()) { + printf("Failed to open file: %s \n", temp.c_str()); + return AMDSMI_STATUS_API_FAILED; + } + + file2.getline(f_name, 10); + if (!sscanf(f_name, "%d", &readTemp)) { + return AMDSMI_STATUS_API_FAILED; + } + limit->limit = (uint16_t)(readTemp / 1000); + break; + } + file.close(); + } + + return AMDSMI_STATUS_SUCCESS; +} +amdsmi_status_t +amdsmi_get_temperature_measure(amdsmi_device_handle dev, amdsmi_temperature_type_t temp_type, amdsmi_temperature_t *info) { + if (info == nullptr || temp_type > TEMPERATURE_TYPE__MAX) { + return AMDSMI_STATUS_INVAL; + } + + amdsmi_gpu_metrics_t metrics; + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + status = amdsmi_dev_gpu_metrics_info_get(dev, &metrics); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + switch (temp_type) { + case TEMPERATURE_TYPE_EDGE: + info->cur_temp = metrics.temperature_edge; + break; + case TEMPERATURE_TYPE_JUNCTION: + info->cur_temp = metrics.temperature_hotspot; + break; + case TEMPERATURE_TYPE_VRAM: + info->cur_temp = metrics.temperature_mem; + break; + case TEMPERATURE_TYPE_PLX: + info->cur_temp = metrics.temperature_vrsoc; + break; + default: + return AMDSMI_STATUS_INVAL; + } + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_ras_features_enabled(amdsmi_device_handle device_handle, amdsmi_gpu_block block, amdsmi_ras_err_state_t *state) { + if (state == nullptr || block > AMDSMI_GPU_BLOCK_LAST) { + return AMDSMI_STATUS_INVAL; + } + + uint64_t features_mask = 0; + 
amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(device_handle); + amdsmi_status_t status; + status = smi_amdgpu_get_enabled_blocks(gpu_device, &features_mask); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + *state = (features_mask & block) ? AMDSMI_RAS_ERR_STATE_ENABLED : AMDSMI_RAS_ERR_STATE_DISABLED; + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_bad_page_info(amdsmi_device_handle device_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info) { + if (info == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(device_handle); + amdsmi_status_t status; + if (gpu_device->check_if_drm_is_supported()){ + status = smi_amdgpu_get_bad_page_info(gpu_device, num_pages, info); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + } + else { + // rocm + } + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_ecc_error_count(amdsmi_device_handle dev, amdsmi_error_count_t *ec) { + if (ec == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + if (gpu_device->check_if_drm_is_supported()){ + status = smi_amdgpu_get_ecc_error_count(gpu_device, ec); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + } + else { + // rocm + } + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_process_list(amdsmi_device_handle dev, amdsmi_process_handle *list, uint32_t *max_processes) { + if (max_processes == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + std::vector pids; + uint32_t i = 0; + uint64_t size = 0; + amdsmi_status_t status; + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + if (gpu_device->check_if_drm_is_supported()){ + amdsmi_bdf_t bdf = gpu_device->get_bdf(); + status = gpuvsmi_get_pids(bdf, pids, &size); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + if (*max_processes == 0 || (pids.size() == 0)) { + *max_processes 
= (uint32_t)pids.size(); + return AMDSMI_STATUS_SUCCESS; + } + if (!list) { + return AMDSMI_STATUS_INVAL; + } + if (*max_processes < pids.size()) { + return AMDSMI_STATUS_OUT_OF_RESOURCES; + } + for (auto &pid : pids) { + if (i >= *max_processes) { + break; + } + list[i++] = (uint32_t)pid; + } + *max_processes = (uint32_t)pids.size(); + } + else { + // rocm + } + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_process_info(amdsmi_device_handle dev, amdsmi_process_handle process, amdsmi_proc_info_t *info) { + if (info == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + if (gpu_device->check_if_drm_is_supported()) { + status = gpuvsmi_get_pid_info(gpu_device->get_bdf(), process, *info); + if (status != AMDSMI_STATUS_SUCCESS) return status; + } + else { + // rocm + } + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t +amdsmi_get_target_frequency_range(amdsmi_device_handle dev, amdsmi_clk_type_t clk_type, amdsmi_frequency_range_t *range) { + if (range == nullptr || clk_type > CLOCK_TYPE__MAX) { + return AMDSMI_STATUS_INVAL; + } + + amdsmi_gpu_metrics_t metrics = {}; + amd::smi::AMDSmiGPUDevice* gpu_device = + static_cast(dev); + amdsmi_status_t status; + + int min = 0, max = 0; + status = amdsmi_dev_gpu_metrics_info_get(dev, &metrics); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + status = smi_amdgpu_get_ranges(gpu_device, clk_type, &max, &min, nullptr); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + + range->supported_freq_range.lower_bound = (long)min; + range->current_freq_range.lower_bound = (long)min; + range->supported_freq_range.upper_bound = (long)max; + max = 0; + switch (clk_type) { + case CLOCK_TYPE_GFX: + max = metrics.current_gfxclk; + break; + case CLOCK_TYPE_MEM: + max = metrics.current_uclk; + break; + case CLOCK_TYPE_VCLK0: + max = metrics.current_vclk0; + break; + case CLOCK_TYPE_VCLK1: + max = 
metrics.current_vclk1; + break; + default: + return AMDSMI_STATUS_INVAL; + } + range->current_freq_range.upper_bound = (long)max; + + return AMDSMI_STATUS_SUCCESS; } diff --git a/src/amd_smi/amd_smi_drm.cc b/src/amd_smi/amd_smi_drm.cc index 45ff3d1d8b..cbda7203a9 100644 --- a/src/amd_smi/amd_smi_drm.cc +++ b/src/amd_smi/amd_smi_drm.cc @@ -55,27 +55,40 @@ namespace smi { amdsmi_status_t AMDSmiDrm::init() { // A few RAII handler + using dir_ptr = std::unique_ptr; using drm_version_ptr = std::unique_ptr; + // using drm_device_ptr = std::unique_ptr(drmDevicePtr, + // decltype(&drmFreeDevice)); struct dirent *dir = nullptr; int fd = -1; + amdsmi_status_t status = lib_loader_.load("libdrm.so"); if (status != AMDSMI_STATUS_SUCCESS) { return status; } + // load symbol from libdrm drm_cmd_write_ = nullptr; status = lib_loader_.load_symbol(&drm_cmd_write_, "drmCommandWrite"); if (status != AMDSMI_STATUS_SUCCESS) { return status; } - using drmGetVersionType = drmVersionPtr (*)(int); // drmGetVersion - using drmFreeVersionType = void (*)(drmVersionPtr); // drmFreeVersion + + using drmGetVersionType = drmVersionPtr (*)(int); // drmGetVersion + using drmFreeVersionType = void (*)(drmVersionPtr); // drmFreeVersion + using drmGetDeviceType = int(*)(int, drmDevicePtr*); // drmGetDevice + using drmFreeDeviceType = void(*)(drmDevicePtr*); // drmFreeDevice + drmGetVersionType drm_get_version = nullptr; drmFreeVersionType drm_free_version = nullptr; + + drmGetDeviceType drm_get_device = nullptr; + drmFreeDeviceType drm_free_device = nullptr; + status = lib_loader_.load_symbol(&drm_get_version, "drmGetVersion"); if (status != AMDSMI_STATUS_SUCCESS) { return status; @@ -85,10 +98,20 @@ amdsmi_status_t AMDSmiDrm::init() { return status; } + status = lib_loader_.load_symbol(&drm_get_device, "drmGetDevice"); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + status = lib_loader_.load_symbol(&drm_free_device, "drmFreeDevice"); + if (status != AMDSMI_STATUS_SUCCESS) { + return 
status; + } auto d = dir_ptr(opendir("/dev/dri/"), &closedir); if (d == nullptr) return AMDSMI_STATUS_NOT_INIT; + drmDevicePtr device; + while ((dir = readdir(d.get())) != NULL) { char* name_cstr = new char[sizeof(dir->d_name) + 10]; auto name = std::unique_ptr(name_cstr); @@ -105,7 +128,22 @@ amdsmi_status_t AMDSmiDrm::init() { continue; } + if (drm_get_device(fd, &device) != 0) { + drm_free_device(&device); + return AMDSMI_STATUS_DRM_ERROR; + } + drm_fds_.push_back(fd); + drm_paths_.push_back(dir->d_name); + + amdsmi_bdf_t bdf; + bdf.function_number = device->businfo.pci->func; + bdf.device_number = device->businfo.pci->dev; + bdf.bus_number = device->businfo.pci->bus; + bdf.domain_number = device->businfo.pci->domain; + + drm_bdfs_.push_back(bdf); + drm_free_device(&device); } return AMDSMI_STATUS_SUCCESS; @@ -115,7 +153,10 @@ amdsmi_status_t AMDSmiDrm::cleanup() { for (unsigned int i=0; i < drm_fds_.size(); i++) { close(drm_fds_[i]); } + drm_fds_.clear(); + drm_paths_.clear(); + drm_bdfs_.clear(); lib_loader_.unload(); return AMDSMI_STATUS_SUCCESS; } @@ -190,9 +231,34 @@ amdsmi_status_t AMDSmiDrm::amdgpu_query_vbios(int fd, void *info) { } -int AMDSmiDrm::get_drm_fd_by_index(uint32_t gpu_index) const { - if (gpu_index + 1 > drm_fds_.size()) return -1; - return drm_fds_[gpu_index]; +amdsmi_status_t AMDSmiDrm::get_drm_fd_by_index(uint32_t gpu_index, uint32_t *fd_info) const { + if (gpu_index + 1 > drm_fds_.size()) return AMDSMI_STATUS_NOT_SUPPORTED; + *fd_info = drm_fds_[gpu_index]; + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t AMDSmiDrm::get_bdf_by_index(uint32_t gpu_index, amdsmi_bdf_t *bdf_info) const { + if (gpu_index + 1 > drm_bdfs_.size()) return AMDSMI_STATUS_NOT_SUPPORTED; + *bdf_info = drm_bdfs_[gpu_index]; + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t AMDSmiDrm::get_drm_path_by_index(uint32_t gpu_index, std::string *drm_path) const { + if (gpu_index + 1 > drm_paths_.size()) return AMDSMI_STATUS_NOT_SUPPORTED; + *drm_path = 
drm_paths_[gpu_index]; + return AMDSMI_STATUS_SUCCESS; +} + +std::vector& AMDSmiDrm::get_drm_paths() { + return drm_paths_; +} + +bool AMDSmiDrm::check_if_drm_is_supported() { + return drm_cmd_write_ != NULL ? true : false; +} + +std::vector AMDSmiDrm::get_bdfs() { + return drm_bdfs_; } } // namespace smi diff --git a/src/amd_smi/amd_smi_gpu_device.cc b/src/amd_smi/amd_smi_gpu_device.cc index f6748667e7..958b0dd48e 100644 --- a/src/amd_smi/amd_smi_gpu_device.cc +++ b/src/amd_smi/amd_smi_gpu_device.cc @@ -52,33 +52,78 @@ uint32_t AMDSmiGPUDevice::get_gpu_id() const { return gpu_id_; } +uint32_t AMDSmiGPUDevice::get_gpu_fd() const { + return fd_; +} + +std::string& AMDSmiGPUDevice::get_gpu_path() { + return path_; +} + +amdsmi_bdf_t AMDSmiGPUDevice::get_bdf() { + return bdf_; +} +amdsmi_status_t AMDSmiGPUDevice::get_drm_data() { + amdsmi_status_t ret; + uint32_t fd = 0; + std::string path; + amdsmi_bdf_t bdf; + ret = drm_.get_drm_fd_by_index(gpu_id_, &fd); + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; + ret = drm_.get_drm_path_by_index(gpu_id_, &path); + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; + ret = drm_.get_bdf_by_index(gpu_id_, &bdf); + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; + + mutex_ = shared_mutex_init(path.c_str(), 0777); + if (mutex_.ptr == nullptr) { + printf("Failed to create shared mem. 
mutex."); + return AMDSMI_STATUS_INIT_ERROR; + } + bdf_ = bdf, path_ = path, fd_ = fd; + + return AMDSMI_STATUS_SUCCESS; +} + +pthread_mutex_t* AMDSmiGPUDevice::get_mutex() { + return mutex_.ptr; +} + amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_info(unsigned info_id, unsigned size, void *value) const { - int fd = drm_.get_drm_fd_by_index(gpu_id_); - if (fd == -1) return AMDSMI_STATUS_NOT_SUPPORTED; + amdsmi_status_t ret; + uint32_t fd = 0; + ret = drm_.get_drm_fd_by_index(gpu_id_, &fd); + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; return drm_.amdgpu_query_info(fd, info_id, size, value); } amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_hw_ip(unsigned info_id, unsigned hw_ip_type, unsigned size, void *value) const { - int fd = drm_.get_drm_fd_by_index(gpu_id_); - if (fd == -1) return AMDSMI_STATUS_NOT_SUPPORTED; + amdsmi_status_t ret; + uint32_t fd = 0; + ret = drm_.get_drm_fd_by_index(gpu_id_, &fd); + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; return drm_.amdgpu_query_hw_ip(fd, info_id, hw_ip_type, size, value); } amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_fw(unsigned info_id, unsigned fw_type, unsigned size, void *value) const { - int fd = drm_.get_drm_fd_by_index(gpu_id_); - if (fd == -1) return AMDSMI_STATUS_NOT_SUPPORTED; + amdsmi_status_t ret; + uint32_t fd = 0; + ret = drm_.get_drm_fd_by_index(gpu_id_, &fd); + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED; return drm_.amdgpu_query_fw(fd, info_id, fw_type, size, value); } amdsmi_status_t AMDSmiGPUDevice::amdgpu_query_vbios(void *info) const { - int fd = drm_.get_drm_fd_by_index(gpu_id_); - if (fd == -1) return AMDSMI_STATUS_NOT_SUPPORTED; + amdsmi_status_t ret; + uint32_t fd = 0; + ret = drm_.get_drm_fd_by_index(gpu_id_, &fd); + if (ret != AMDSMI_STATUS_SUCCESS) return AMDSMI_STATUS_NOT_SUPPORTED;; return drm_.amdgpu_query_vbios(fd, info); } diff --git a/src/amd_smi/amd_smi_socket.cc b/src/amd_smi/amd_smi_socket.cc index 
52ce69e65b..0c824e513b 100644 --- a/src/amd_smi/amd_smi_socket.cc +++ b/src/amd_smi/amd_smi_socket.cc @@ -55,6 +55,11 @@ AMDSmiSocket::~AMDSmiSocket() { devices_.clear(); } +amdsmi_status_t AMDSmiSocket::get_device_count(uint32_t* device_count) const { + *device_count = static_cast(devices_.size()); + return AMDSMI_STATUS_SUCCESS; +} + } // namespace smi } // namespace amd diff --git a/src/amd_smi/amd_smi_system.cc b/src/amd_smi/amd_smi_system.cc index 4cdba74087..b315cb67be 100644 --- a/src/amd_smi/amd_smi_system.cc +++ b/src/amd_smi/amd_smi_system.cc @@ -45,6 +45,7 @@ #include "amd_smi/impl/amd_smi_system.h" #include "amd_smi/impl/amd_smi_gpu_device.h" #include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_main.h" namespace amd { @@ -54,55 +55,91 @@ namespace smi { amdsmi_status_t AMDSmiSystem::init(uint64_t flags) { init_flag_ = flags; + amdsmi_status_t amd_smi_status; // populate sockets and devices if (flags & AMDSMI_INIT_AMD_GPUS) { - drm_.init(); + amd_smi_status = drm_.init(); // init rsmi rsmi_status_t ret = rsmi_init(flags); if (ret != RSMI_STATUS_SUCCESS) { return static_cast(ret); } - uint32_t device_count = 0; - ret = rsmi_num_monitor_devices(&device_count); - if (ret != RSMI_STATUS_SUCCESS) { - return static_cast(ret); - } + // libdrm is supported + if (amd_smi_status == AMDSMI_STATUS_SUCCESS) { + amd::smi::RocmSMI::getInstance().DiscoverAmdgpuDevices(); + uint32_t device_count = amd::smi::RocmSMI::getInstance().devices().size(); + for (uint32_t i=0; i < device_count; i++) { + std::stringstream ss; + //values for socked id are harcoded + ss << std::setfill('0') << std::uppercase << std::hex + << std::setw(4) << drm_.get_bdfs()[i].domain_number << ":" + << std::setw(2) << drm_.get_bdfs()[i].bus_number << ":" + << std::setw(2) << drm_.get_bdfs()[i].device_number << "." 
+ << std::setw(2) << drm_.get_bdfs()[i].function_number; - for (uint32_t i=0; i < device_count; i++) { - uint64_t bdfid = 0; - ret = rsmi_dev_pci_id_get(i, &bdfid); + // Multiple devices may share the same socket + auto socket_id = ss.str(); + AMDSmiSocket* socket = nullptr; + for (unsigned int j=0; j < sockets_.size(); j++) { + if (sockets_[j]->get_socket_id() == socket_id) { + socket = sockets_[j]; + break; + } + } + if (socket == nullptr) { + socket = new AMDSmiSocket(ss.str()); + sockets_.push_back(socket); + } + + AMDSmiDevice* device = new AMDSmiGPUDevice(i, drm_); + socket->add_device(device); + devices_.insert(device); + } + + } + else { + uint32_t device_count = 0; + ret = rsmi_num_monitor_devices(&device_count); if (ret != RSMI_STATUS_SUCCESS) { return static_cast(ret); } - uint64_t domain = (bdfid >> 32) & 0xffffffff; - uint64_t bus = (bdfid >> 8) & 0xff; - uint64_t device_id = (bdfid >> 3) & 0x1f; - uint64_t function = bdfid & 0x7; - - std::stringstream ss; - ss << std::setfill('0') << std::uppercase << std::hex - << std::setw(4) << domain << ":" << std::setw(2) << bus << ":" - << std::setw(2) << device_id << "." 
<< std::setw(2) << function; - - // Multiple devices may share the same socket - auto socket_id = ss.str(); - AMDSmiSocket* socket = nullptr; - for (unsigned int j=0; j < sockets_.size(); j++) { - if (sockets_[j]->get_socket_id() == socket_id) { - socket = sockets_[j]; - break; + for (uint32_t i=0; i < device_count; i++) { + uint64_t bdfid = 0; + ret = rsmi_dev_pci_id_get(i, &bdfid); + if (ret != RSMI_STATUS_SUCCESS) { + return static_cast(ret); } - } - if (socket == nullptr) { - socket = new AMDSmiSocket(ss.str()); - sockets_.push_back(socket); - } - AMDSmiDevice* device = new AMDSmiGPUDevice(i, drm_); - socket->add_device(device); - devices_.insert(device); + uint64_t domain = (bdfid >> 32) & 0xffffffff; + uint64_t bus = (bdfid >> 8) & 0xff; + uint64_t device_id = (bdfid >> 3) & 0x1f; + uint64_t function = bdfid & 0x7; + + std::stringstream ss; + ss << std::setfill('0') << std::uppercase << std::hex + << std::setw(4) << domain << ":" << std::setw(2) << bus << ":" + << std::setw(2) << device_id << "." << std::setw(2) << function; + + // Multiple devices may share the same socket + auto socket_id = ss.str(); + AMDSmiSocket* socket = nullptr; + for (unsigned int j=0; j < sockets_.size(); j++) { + if (sockets_[j]->get_socket_id() == socket_id) { + socket = sockets_[j]; + break; + } + } + if (socket == nullptr) { + socket = new AMDSmiSocket(ss.str()); + sockets_.push_back(socket); + } + + AMDSmiDevice* device = new AMDSmiGPUDevice(i, drm_); + socket->add_device(device); + devices_.insert(device); + } } } else { return AMDSMI_STATUS_NOT_SUPPORTED; diff --git a/src/amd_smi/amd_smi_utils.cc b/src/amd_smi/amd_smi_utils.cc new file mode 100644 index 0000000000..b8c629de5b --- /dev/null +++ b/src/amd_smi/amd_smi_utils.cc @@ -0,0 +1,382 @@ +/* * Copyright (C) 2022 Advanced Micro Devices. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "amd_smi/impl/amd_smi_utils.h" +#include "shared_mutex.h" // NOLINT + +static const uint32_t kAmdGpuId = 0x1002; + +static bool isAMDGPU(std::string dev_path) { + std::string vend_path = dev_path + "/device/vendor"; + std::string vbios_v_path = dev_path + "/device/vbios_version"; + if (!amd::smi::FileExists(vend_path.c_str())) { + return false; + } + + if (!amd::smi::FileExists(vbios_v_path.c_str())) { + return false; + } + + std::ifstream fs; + fs.open(vend_path); + + if (!fs.is_open()) { + return false; + } + + uint32_t vendor_id; + + fs >> std::hex >> vendor_id; + + fs.close(); + + if (vendor_id == kAmdGpuId) { + return true; + } + return false; +} + +amdsmi_status_t smi_amdgpu_find_hwmon_dir(amd::smi::AMDSmiGPUDevice *device, std::string* full_path) +{ + if (!device->check_if_drm_is_supported()) { + return AMDSMI_STATUS_NOT_SUPPORTED; + } + if (full_path == nullptr) { + return AMDSMI_STATUS_API_FAILED; + } + SMIGPUDEVICE_MUTEX(device->get_mutex()) + + DIR *dh; + struct dirent * contents; + std::string device_path = "/sys/class/drm/" + device->get_gpu_path(); + std::string directory_path = device_path + "/device/hwmon/"; + + if (!isAMDGPU(device_path)) { + return AMDSMI_STATUS_NOT_SUPPORTED; + } + + dh = opendir(directory_path.c_str()); + if (!dh) { + return AMDSMI_STATUS_NOT_SUPPORTED; + } + + /* + First directory is '.', second directory is '..' 
and third directory is
+        valid directory for reading sysfs node
+    */
+    while ((contents = readdir(dh)) != NULL) {
+        std::string name = contents->d_name;
+        if (name.find("hwmon", 0) != std::string::npos)
+            *full_path = directory_path + name;
+    }
+
+    closedir(dh);
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+
+/*
+ * Read the board identification values of `device` from sysfs into `info`.
+ *
+ * Sources, all under /sys/class/drm/<gpu path>/device/:
+ *   product_name   -> info->product_name  (one line, as returned by fgets)
+ *   product_number -> info->model_number  (one line, as returned by fgets)
+ *   serial_number  -> info->serial_number (parsed as hexadecimal)
+ *
+ * A file that cannot be opened is skipped and its field is left untouched.
+ *
+ * Returns AMDSMI_STATUS_NOT_SUPPORTED when the device reports that DRM is
+ * not supported, AMDSMI_STATUS_SUCCESS otherwise.
+ */
+amdsmi_status_t smi_amdgpu_get_board_info(amd::smi::AMDSmiGPUDevice* device, amdsmi_board_info_t *info) {
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+    std::string product_name_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/product_name");
+    std::string product_number_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/product_number");
+    std::string serial_number_path = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/serial_number");
+
+    FILE *fp;
+
+    // fopen() returns NULL on failure: only read (and close) a stream that
+    // was actually opened.
+    fp = fopen(product_name_path.c_str(), "rb");
+    if (fp) {
+        fgets(info->product_name, sizeof(info->product_name), fp);
+        fclose(fp);
+    }
+
+
+    fp = fopen(product_number_path.c_str(), "rb");
+    if (fp) {
+        fgets(info->model_number, sizeof(info->model_number), fp);
+        fclose(fp);
+    }
+
+
+    fp = fopen(serial_number_path.c_str(), "rb");
+    if (fp) {
+        fscanf(fp, "%lx", &info->serial_number);
+        fclose(fp);
+    }
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int *cap)
+{
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    constexpr int DATA_SIZE = 10;
+    char val[DATA_SIZE];
+    std::string fullpath;
+    amdsmi_status_t ret = AMDSMI_STATUS_SUCCESS;
+
+    ret = smi_amdgpu_find_hwmon_dir(device, &fullpath);
+
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+
+    if (ret)
+        return ret;
+
+    fullpath += "/power1_cap_max";
+    std::ifstream file(fullpath.c_str(), std::ifstream::in);
+    if (!file.is_open()) {
+        printf("Failed to open file: %s \n", fullpath.c_str());
+        return
AMDSMI_STATUS_API_FAILED;
+    }
+
+    file.getline(val, DATA_SIZE);
+
+    // sscanf() returns 0 (not a negative value) when the text is not a
+    // number; require exactly one conversion so *cap is never used unset.
+    if (sscanf(val, "%d", cap) != 1) {
+        return AMDSMI_STATUS_API_FAILED;
+    }
+
+    // Dividing by 1000000 to get measurement in Watts
+    *cap /= 1000000;
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+/*
+ * Scan the pp_dpm_* sysfs table selected by `domain` and report the highest
+ * frequency, the lowest frequency and the highest DPM level index found.
+ *
+ * Each table line is parsed as "<index>: <frequency><unit text>".
+ * `max_freq`, `min_freq` and `num_dpm` are each optional (may be null).
+ *
+ * Returns AMDSMI_STATUS_NOT_SUPPORTED when the device reports that DRM is
+ * not supported, AMDSMI_STATUS_INVAL for a clock type without a table here,
+ * AMDSMI_STATUS_API_FAILED when the file cannot be opened, AMDSMI_STATUS_IO
+ * when a line does not parse, AMDSMI_STATUS_SUCCESS otherwise.
+ */
+amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain,
+        int *max_freq, int *min_freq, int *num_dpm)
+{
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+    std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + "/device";
+    char str[10];
+    unsigned int max, min, dpm;
+
+    switch (domain) {
+    case CLOCK_TYPE_GFX:
+        fullpath += "/pp_dpm_sclk";
+        break;
+    case CLOCK_TYPE_MEM:
+        fullpath += "/pp_dpm_mclk";
+        break;
+    case CLOCK_TYPE_VCLK0:
+        fullpath += "/pp_dpm_vclk";
+        break;
+    case CLOCK_TYPE_VCLK1:
+        fullpath += "/pp_dpm_vclk1";
+        break;
+    default:
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    std::ifstream ranges(fullpath.c_str());
+
+    if (ranges.fail()) {
+        printf("Failed to open file: %s \n", fullpath.c_str());
+        return AMDSMI_STATUS_API_FAILED;
+    }
+
+    max = 0;
+    min = -1;  // wraps to UINT_MAX so the first frequency read becomes the minimum
+    dpm = 0;
+    for (std::string line; getline(ranges, line);) {
+        unsigned int d, freq;
+
+        // %u matches the unsigned destinations; %9s keeps the unit text
+        // inside str[10] (9 characters plus the terminator).
+        if (sscanf(line.c_str(), "%u: %u%9s", &d, &freq, str) != 3) {
+            ranges.close();
+            return AMDSMI_STATUS_IO;
+        }
+
+        max = freq > max ? freq : max;
+        min = freq < min ? freq: min;
+        dpm = d > dpm ?
d : dpm;
+    }
+
+    if (num_dpm)
+        *num_dpm = dpm;
+    if (max_freq)
+        *max_freq = max;
+    if (min_freq)
+        *min_freq = min;
+
+    ranges.close();
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks) {
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+    std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + "/device/ras/features";
+    std::ifstream f(fullpath.c_str());
+    std::string tmp_str;
+
+    if (f.fail()) {
+        printf("Failed to open file: %s \n", fullpath.c_str());
+        return AMDSMI_STATUS_API_FAILED;
+    }
+
+    std::string line;
+    getline(f, line);
+
+    std::istringstream f1(line);
+
+    f1 >> tmp_str;  // ignore
+    f1 >> tmp_str;  // ignore
+    f1 >> tmp_str;
+
+    *enabled_blocks = strtoul(tmp_str.c_str(), nullptr, 16);
+    f.close();
+
+    if (*enabled_blocks == 0 || *enabled_blocks == ULONG_MAX) {
+        return AMDSMI_STATUS_API_FAILED;
+    }
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+/*
+ * Report the retired (bad) VRAM pages listed in the device's
+ * ras/gpu_vram_bad_pages sysfs file.
+ *
+ * `*num_pages` receives the number of records in the file (trailing
+ * whitespace-only lines are not counted). When `info` is non-null the parsed
+ * records are written to it; when it is null only the count is reported.
+ *
+ * Returns AMDSMI_STATUS_NOT_SUPPORTED when the device reports that DRM is
+ * not supported or the file cannot be opened.
+ */
+amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info) {
+
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+    std::string line;
+    std::vector<std::string> badPagesVec;
+
+    std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/ras/gpu_vram_bad_pages");
+    std::ifstream fs(fullpath.c_str());
+
+    if (fs.fail()) {
+        printf("Failed to open file: %s \n", fullpath.c_str());
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+
+    while (std::getline(fs, line)) {
+        badPagesVec.push_back(line);
+    }
+
+    if (badPagesVec.size() == 0) {
+        // Store the count through the pointer; assigning to the pointer
+        // itself would leave the caller's value unset.
+        *num_pages = 0;
+        return AMDSMI_STATUS_SUCCESS;
+    }
+    // Remove any *trailing* empty (whitespace) lines
+    while (badPagesVec.size() != 0 &&
+        badPagesVec.back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) {
+        badPagesVec.pop_back();
+    }
+
+    *num_pages = static_cast<uint32_t>(badPagesVec.size());
+
+    if (info == nullptr) {
+        return AMDSMI_STATUS_SUCCESS;
+    }
+
+    // Start from a value that matches no case label, so a record whose
+    // status field cannot be extracted is rejected by `default` below.
+    char status_code = 0;
+    amdsmi_memory_page_status_t tmp_stat;
+    std::string junk;
+
+    for (uint32_t i = 0; i < *num_pages; ++i) {
+        std::istringstream fs1(badPagesVec[i]);
+
+        fs1 >> std::hex >> info[i].page_address;
+        fs1 >> junk;
+        fs1 >> std::hex >> info[i].page_size;
+        fs1 >> junk;
+        fs1 >> status_code;
+
+        switch (status_code) {
+        case 'P':
+            tmp_stat = AMDSMI_MEM_PAGE_STATUS_PENDING;
+            break;
+
+        case 'F':
+            tmp_stat = AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE;
+            break;
+
+        case 'R':
+            tmp_stat = AMDSMI_MEM_PAGE_STATUS_RESERVED;
+            break;
+        default:
+            return AMDSMI_STATUS_API_FAILED;
+        }
+        info[i].status = tmp_stat;
+    }
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+/*
+ * Read the error counters from the device's ras/umc_err_count sysfs file.
+ *
+ * The first line supplies err_cnt->uncorrectable_count and the second line
+ * err_cnt->correctable_count; each line is parsed as "<label> <count>".
+ *
+ * Returns AMDSMI_STATUS_NOT_SUPPORTED when the device reports that DRM is
+ * not supported or the file cannot be opened, AMDSMI_STATUS_API_FAILED when
+ * either line does not parse, AMDSMI_STATUS_SUCCESS otherwise.
+ */
+amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt) {
+
+    if (!device->check_if_drm_is_supported()) {
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+    SMIGPUDEVICE_MUTEX(device->get_mutex())
+    char str[10];
+
+    std::string fullpath = "/sys/class/drm/" + device->get_gpu_path() + std::string("/device/ras/umc_err_count");
+    std::ifstream f(fullpath.c_str());
+
+    if (f.fail()) {
+        printf("Failed to open file: %s \n", fullpath.c_str());
+        return AMDSMI_STATUS_NOT_SUPPORTED;
+    }
+
+    // %9s keeps the label inside str[10]; both conversions must succeed,
+    // otherwise the counter would be reported without having been written.
+    std::string line;
+    getline(f, line);
+    if (sscanf(line.c_str(), "%9s%ld", str, &(err_cnt->uncorrectable_count)) != 2) {
+        return AMDSMI_STATUS_API_FAILED;
+    }
+
+    getline(f, line);
+    if (sscanf(line.c_str(), "%9s%ld", str, &(err_cnt->correctable_count)) != 2) {
+        return AMDSMI_STATUS_API_FAILED;
+    }
+
+    f.close();
+
+    return AMDSMI_STATUS_SUCCESS;
+}
diff --git a/src/amd_smi/fdinfo.cc b/src/amd_smi/fdinfo.cc
new file mode 100644
index 0000000000..d1f29c9e71
--- /dev/null
+++ b/src/amd_smi/fdinfo.cc
@@ -0,0 +1,264 @@
+/* * Copyright (C) 2022 Advanced Micro Devices. All rights reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "amd_smi/amd_smi.h" +#include "amd_smi/impl/amd_smi_utils.h" + +extern "C" { + +amdsmi_status_t gpuvsmi_pid_is_gpu(const std::string &path, const char *bdf) +{ + DIR *d; + struct dirent *dir; + + d = opendir(path.c_str()); + if (!d) + return AMDSMI_STATUS_NO_PERM; + + /* iterate through all the fds, try to find + * a match for the GPU bdf + */ + while ((dir = readdir(d)) != NULL) { + std::string file = path + dir->d_name; + std::ifstream fdinfo(file.c_str()); + for (std::string line; std::getline(fdinfo, line);) { + if (line.find(bdf) != std::string::npos) { + closedir(d); + return AMDSMI_STATUS_SUCCESS; + } + } + } + + closedir(d); + + return AMDSMI_STATUS_NOT_FOUND; +} + +amdsmi_status_t gpuvsmi_get_pids(const amdsmi_bdf_t &bdf, std::vector &pids, uint64_t *size) +{ + char bdf_str[13]; + DIR *d; + struct dirent *dir; + + /* 0000:00:00.0 */ + snprintf(bdf_str, 13, "%04x:%02x:%02x.%d", bdf.domain_number & 0xffff, + bdf.bus_number & 0xff, + bdf.device_number & 0x1f, + bdf.function_number & 0x7); + + d = opendir("/proc"); + if (!d) + return AMDSMI_STATUS_NO_PERM; + + pids.clear(); + /* Find the pid folders in /proc/ that we have access to */ + while ((dir = readdir(d)) != NULL) { + if (dir->d_type == DT_DIR) { + /* Try to cast the name of the folder to a + * number, if it fails, it is not */ + char *p; + long int pid; + + pid = strtol(dir->d_name, &p, 10); + if (*p != 0) + continue; + + /* Check if fdinfo is accesible */ + std::string path = "/proc/" + std::string(dir->d_name) + "/fdinfo/"; + + if (access(path.c_str(), R_OK)) + continue; + + /* check if GPU is present */ + if (gpuvsmi_pid_is_gpu(path, bdf_str)) + continue; + pids.push_back(pid); + } + } + closedir(d); + + *size = pids.size(); + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t gpuvsmi_get_pid_info(const amdsmi_bdf_t &bdf, long int pid, + amdsmi_proc_info_t &info) +{ + char bdf_str[13]; + DIR 
*d;
+    struct dirent *dir;
+
+    /* 0000:00:00.0 */
+    snprintf(bdf_str, 13, "%04x:%02x:%02x.%d", bdf.domain_number & 0xffff,
+            bdf.bus_number & 0xff,
+            bdf.device_number & 0x1f,
+            bdf.function_number & 0x7);
+
+
+    std::string path = "/proc/" + std::to_string(pid) + "/fdinfo/";
+    std::string name_path = "/proc/" + std::to_string(pid) + "/comm";
+    std::string cgroup_path = "/proc/" + std::to_string(pid) + "/cgroup";
+
+    if (gpuvsmi_pid_is_gpu(path.c_str(), bdf_str)) {
+        return AMDSMI_STATUS_INVAL;
+    }
+
+    d = opendir(path.c_str());
+    if (!d)
+        return AMDSMI_STATUS_NO_PERM;
+
+    /* Distinct pasid values seen so far, used to skip repeated ones */
+    std::vector<int> pasids;
+
+    memset(&info, 0, sizeof(info));
+    /* Iterate through all fdinfos */
+    while ((dir = readdir(d)) != NULL) {
+
+        std::string file = path + dir->d_name;
+        std::ifstream fdinfo(file.c_str());
+
+        for (std::string line; getline(fdinfo, line);) {
+            if (line.find("pasid:") != std::string::npos) {
+                int pasid;
+
+                if (sscanf(line.c_str(), "pasid: %d", &pasid) != 1)
+                    continue;
+
+                auto it = std::find(pasids.begin(), pasids.end(), pasid);
+
+                if (it == pasids.end())
+                    pasids.push_back(pasid);
+            } else if (line.find("gtt mem:") != std::string::npos) {
+                unsigned long mem;
+
+                if (sscanf(line.c_str(), "gtt mem: %lu", &mem) != 1)
+                    continue;
+
+                info.mem += mem * 1024;
+                info.memory_usage.gtt_mem += mem * 1024;
+            } else if (line.find("cpu mem:") != std::string::npos) {
+                unsigned long mem;
+
+                if (sscanf(line.c_str(), "cpu mem: %lu", &mem) != 1)
+                    continue;
+
+                info.mem += mem * 1024;
+                info.memory_usage.cpu_mem += mem * 1024;
+            } else if (line.find("vram mem:") != std::string::npos) {
+                unsigned long mem;
+
+                if (sscanf(line.c_str(), "vram mem: %lu", &mem) != 1)
+                    continue;
+
+                info.mem += mem * 1024;
+                info.memory_usage.vram_mem += mem * 1024;
+            } else if (line.find("gfx") != std::string::npos) {
+                float usage;
+                int ring;
+
+                if (sscanf(line.c_str(), "gfx%d: %f%%", &ring, &usage) != 2)
+                    continue;
+
+                /* %d also accepts a minus sign: reject indices on either
+                 * side of the per-engine array before using them. */
+                if (ring < 0 || ring >= AMDSMI_MAX_MM_IP_COUNT)
+                    continue;
+
+                info.engine_usage.gfx[ring] += (uint16_t)(usage * 100);
+            } else if (line.find("compute") != std::string::npos) {
+                float usage;
+                int ring;
+
+                if (sscanf(line.c_str(), "compute%d: %f%%", &ring, &usage) != 2)
+                    continue;
+
+                if (ring < 0 || ring >= AMDSMI_MAX_MM_IP_COUNT)
+                    continue;
+
+                info.engine_usage.compute[ring] += (uint16_t)(usage * 100);
+            } else if (line.find("dma") != std::string::npos) {
+                float usage;
+                int ring;
+
+                if (sscanf(line.c_str(), "dma%d: %f%%", &ring, &usage) != 2)
+                    continue;
+
+                if (ring < 0 || ring >= AMDSMI_MAX_MM_IP_COUNT)
+                    continue;
+
+                info.engine_usage.sdma[ring] += (uint16_t)(usage * 100);
+            } else if (line.find("enc") != std::string::npos) {
+                float usage;
+                int ring;
+
+                if (sscanf(line.c_str(), "enc%d: %f%%", &ring, &usage) != 2)
+                    continue;
+
+                if (ring < 0 || ring >= AMDSMI_MAX_MM_IP_COUNT)
+                    continue;
+
+                info.engine_usage.enc[ring] += (uint16_t)(usage * 100);
+            } else if (line.find("dec") != std::string::npos) {
+                float usage;
+                int ring;
+
+                if (sscanf(line.c_str(), "dec%d: %f%%", &ring, &usage) != 2)
+                    continue;
+
+                if (ring < 0 || ring >= AMDSMI_MAX_MM_IP_COUNT)
+                    continue;
+
+                info.engine_usage.dec[ring] += (uint16_t)(usage * 100);
+            }
+        }
+    }
+
+
+    closedir(d);
+
+    if (!pasids.size())
+        return AMDSMI_STATUS_NOT_FOUND;
+
+    std::ifstream filename(name_path.c_str());
+    std::string name;
+
+    getline(filename, name);
+
+    if (name.empty())
+        return AMDSMI_STATUS_API_FAILED;
+
+    /* info was zeroed by the memset above, so copying at most LENGTH - 1
+     * bytes leaves a terminating NUL in place. Assumes info.name holds at
+     * least AMDSMI_NORMAL_STRING_LENGTH bytes -- TODO confirm in amd_smi.h. */
+    strncpy(info.name, name.c_str(), std::min(
+                    (unsigned long) AMDSMI_NORMAL_STRING_LENGTH - 1,
+                    name.length()));
+
+    info.pid = (uint32_t)pid;
+
+    return AMDSMI_STATUS_SUCCESS;
+}
+
+} // extern "C"
diff --git a/tests/amd_smi_test/functional/err_cnt_read.cc b/tests/amd_smi_test/functional/err_cnt_read.cc
index 71131e187a..abd4107c2d 100755
--- a/tests/amd_smi_test/functional/err_cnt_read.cc
+++ b/tests/amd_smi_test/functional/err_cnt_read.cc
@@ -160,7 +160,7 @@ void TestErrCntRead::Run(void) {
                           << std::endl;
       std::cout << "\t\tCorrectable errors: " << ec.correctable_err
                           << std::endl;
-
std::cout << "\t\tUncorrectable errors: " << ec.uncorrectable_err + std::cout << "\t\tUncorrectable errors: " << ec.uncorrectable_count << std::endl; } // Verify api support checking functionality is working