From b546e272e00025701530fded5eb79e8fc3ead3d5 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Fri, 7 Oct 2022 10:35:26 -0500 Subject: [PATCH] Change the get_socket_handles and get_device_handles APIs interface Those two APIs are changed to let the user get the handles count, allocate memory, and then return handles to the allocated memory. Change-Id: Ibe28a89ad188c99da6af3af1740b2b25ff22ba06 [ROCm/amdsmi commit: 2b2d11c446f479317a88f512c32a5974e60516d9] --- .../amdsmi/example/amd_smi_drm_example.cc | 31 +++-- .../amdsmi/example/amd_smi_nodrm_example.cc | 31 +++-- projects/amdsmi/include/amd_smi/amd_smi.h | 42 +++++-- projects/amdsmi/src/amd_smi/amd_smi.cc | 117 ++++++++++-------- .../amd_smi_test/functional/err_cnt_read.cc | 2 +- .../functional/process_info_read.cc | 1 - .../amdsmi/tests/amd_smi_test/test_base.cc | 23 +++- .../amdsmi/tests/amd_smi_test/test_base.h | 3 +- .../amdsmi/tests/rocm_smi_test/CMakeLists.txt | 20 +-- 9 files changed, 175 insertions(+), 95 deletions(-) diff --git a/projects/amdsmi/example/amd_smi_drm_example.cc b/projects/amdsmi/example/amd_smi_drm_example.cc index c2c5779fa3..c8fe25c80e 100644 --- a/projects/amdsmi/example/amd_smi_drm_example.cc +++ b/projects/amdsmi/example/amd_smi_drm_example.cc @@ -73,24 +73,37 @@ int main() { // Get all sockets uint32_t socket_count = 0; - amdsmi_socket_handle *sockets = nullptr; - ret = amdsmi_get_socket_handles(&socket_count, &sockets); + + // Get the socket count available for the system. + ret = amdsmi_get_socket_handles(&socket_count, nullptr); CHK_AMDSMI_RET(ret) + + // Allocate the memory for the sockets + std::vector sockets(socket_count); + // Get the sockets of the system + ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]); + CHK_AMDSMI_RET(ret) + std::cout << "Total Socket: " << socket_count << std::endl; // For each socket, get identifier and devices for (uint32_t i = 0; i < socket_count; i++) { // Get Socket info - char socket_name[128]; - ret = amdsmi_get_socket_info(sockets[i], socket_name, 128); + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], socket_info, 128); CHK_AMDSMI_RET(ret) - std::cout << "Socket " << socket_name << std::endl; + std::cout << "Socket " << socket_info << std::endl; - // Get all devices of the socket + // Get the device count available for the socket. uint32_t device_count = 0; - amdsmi_device_handle *device_handles = nullptr; - ret = amdsmi_get_device_handles(sockets[i], &device_count, - &device_handles); + ret = amdsmi_get_device_handles(sockets[i], &device_count, nullptr); + CHK_AMDSMI_RET(ret) + + // Allocate the memory for the device handlers on the socket + std::vector device_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_device_handles(sockets[i], + &device_count, &device_handles[0]); CHK_AMDSMI_RET(ret) // For each device of the socket, get name and temperature. diff --git a/projects/amdsmi/example/amd_smi_nodrm_example.cc b/projects/amdsmi/example/amd_smi_nodrm_example.cc index 3fb96bcdde..2710765270 100644 --- a/projects/amdsmi/example/amd_smi_nodrm_example.cc +++ b/projects/amdsmi/example/amd_smi_nodrm_example.cc @@ -73,24 +73,37 @@ int main() { // Get all sockets uint32_t socket_count = 0; - amdsmi_socket_handle *sockets = nullptr; - ret = amdsmi_get_socket_handles(&socket_count, &sockets); + + // Get the socket count available for the system. + ret = amdsmi_get_socket_handles(&socket_count, nullptr); CHK_AMDSMI_RET(ret) + + // Allocate the memory for the sockets + std::vector sockets(socket_count); + // Get the sockets of the system + ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]); + CHK_AMDSMI_RET(ret) + std::cout << "Total Socket: " << socket_count << std::endl; // For each socket, get identifier and devices for (uint32_t i = 0; i < socket_count; i++) { // Get Socket info - char socket_name[128]; - ret = amdsmi_get_socket_info(sockets[i], socket_name, 128); + char socket_info[128]; + ret = amdsmi_get_socket_info(sockets[i], socket_info, 128); CHK_AMDSMI_RET(ret) - std::cout << "Socket " << socket_name << std::endl; + std::cout << "Socket " << socket_info << std::endl; - // Get all devices of the socket + // Get the device count available for the socket. uint32_t device_count = 0; - amdsmi_device_handle *device_handles = nullptr; - ret = amdsmi_get_device_handles(sockets[i], &device_count, - &device_handles); + ret = amdsmi_get_device_handles(sockets[i], &device_count, nullptr); + CHK_AMDSMI_RET(ret) + + // Allocate the memory for the device handlers on the socket + std::vector device_handles(device_count); + // Get all devices of the socket + ret = amdsmi_get_device_handles(sockets[i], + &device_count, &device_handles[0]); CHK_AMDSMI_RET(ret) // For each device of the socket, get name and temperature. diff --git a/projects/amdsmi/include/amd_smi/amd_smi.h b/projects/amdsmi/include/amd_smi/amd_smi.h index 76501c501e..fd3f7541eb 100644 --- a/projects/amdsmi/include/amd_smi/amd_smi.h +++ b/projects/amdsmi/include/amd_smi/amd_smi.h @@ -1170,23 +1170,34 @@ amdsmi_status_t amdsmi_shut_down(void); /** * @brief Get the list of socket handles in the system. - * + * * @details Depends on what flag is pass to ::amdsmi_init(flags). AMDSMI_INIT_AMD_GPUS * returns sockets with AMD GPUS, and AMDSMI_INIT_AMD_GPUS | AMDSMI_INIT_AMD_CPUS returns * sockets with either AMD GPUS or CPUS. * The socket handles can be used to query the device handles in that socket, which * will be used in other APIs to get device detail information or telemtries. * - * @param[out] socket_count The total count of sockets found in the system. + * @param[inout] socket_count As input, the value passed + * through this parameter is the number of ::amdsmi_socket_handle's that + * may be safely written to the memory pointed to by @p socket_handles. This is the + * limit on how many socket handles will be written to @p socket_handles. On return, @p + * socket_count will contain the number of socket handles written to @p socket_handles, + * or the number of socket handles that could have been written if enough memory had been + * provided. + * If @p socket_handles is NULL, as output, @p socket_count will contain + * how many sockets are available to read in the system. * - * @param[out] socket_handles a list of socket handles in the system. + * @param[inout] socket_handles A pointer to a block of memory to which the + * ::amdsmi_socket_handle values will be written. This value may be NULL. + * In this case, this function can be used to query how many sockets are + * available to read in the system. * * @retval ::AMDSMI_STATUS_SUCCESS call was successful * @retval ::AMDSMI_STATUS_INVAL the provided arguments are not valid * */ amdsmi_status_t amdsmi_get_socket_handles(uint32_t *socket_count, - amdsmi_socket_handle* socket_handles[]); + amdsmi_socket_handle* socket_handles); /** * @brief Get the socket information @@ -1216,20 +1227,35 @@ amdsmi_status_t amdsmi_get_socket_info( * type devices: An APU on a socket have both CPUs and GPUs. * Currently, only AMD GPUs are supported. * + * The number of device count is returned through @p device_count + * if @p device_handles is NULL. Then the number of @p device_count can be pass + * as input to retrieval all devices on the socket to @p device_handles. + * * @param[in] socket_handle The socket to query * - * @param[out] device_count The total count of devices on the socket. + * @param[inout] device_count As input, the value passed + * through this parameter is the number of ::amdsmi_device_handle's that + * may be safely written to the memory pointed to by @p device_handles. This is the + * limit on how many device handles will be written to @p device_handles. On return, @p + * device_count will contain the number of device handles written to @p device_handles, + * or the number of device handles that could have been written if enough memory had been + * provided. + * If @p device_handles is NULL, as output, @p device_count will contain + * how many devices are available to read for the socket. * - * @param[out] device_handles a list of devices handles on the socket. + * @param[inout] device_handles A pointer to a block of memory to which the + * ::device_handles values will be written. This value may be NULL. + * In this case, this function can be used to query how many devices are + * available to read. * * @retval ::AMDSMI_STATUS_SUCCESS call was successful * @retval ::AMDSMI_STATUS_INVAL the provided arguments are not valid * - * */ amdsmi_status_t amdsmi_get_device_handles(amdsmi_socket_handle socket_handle, uint32_t *device_count, - amdsmi_device_handle* device_handles[]); + amdsmi_device_handle* device_handles); + /** * @brief Get the device type * diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 1b32e08045..cf5938ab16 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -138,15 +138,28 @@ amdsmi_status_string(amdsmi_status_t status, const char **status_string) { } amdsmi_status_t amdsmi_get_socket_handles(uint32_t *socket_count, - amdsmi_socket_handle* socket_handles[]) { - if (socket_count == nullptr || socket_handles == nullptr) { + amdsmi_socket_handle* socket_handles) { + if (socket_count == nullptr) { return AMDSMI_STATUS_INVAL; } std::vector& sockets = amd::smi::AMDSmiSystem::getInstance().get_sockets(); - *socket_count = static_cast(sockets.size()); - *socket_handles = reinterpret_cast(sockets.data()); + uint32_t socket_size = static_cast(sockets.size()); + // Get the socket size + if (socket_handles == nullptr) { + *socket_count = socket_size; + return AMDSMI_STATUS_SUCCESS; + } + + // If the socket_handles can hold all sockets, return all of them. + *socket_count = *socket_count >= socket_size ? socket_size : *socket_count; + + // Copy the socket handles + for (uint32_t i = 0; i < *socket_count; i++) { + socket_handles[i] = reinterpret_cast(sockets[i]); + } + return AMDSMI_STATUS_SUCCESS; } @@ -168,24 +181,36 @@ amdsmi_status_t amdsmi_get_socket_info( return AMDSMI_STATUS_SUCCESS; } - amdsmi_status_t amdsmi_get_device_handles(amdsmi_socket_handle socket_handle, - uint32_t *device_count, - amdsmi_device_handle* device_handles[]) { + uint32_t* device_count, + amdsmi_device_handle* device_handles) { if (device_count == nullptr) { return AMDSMI_STATUS_INVAL; } + // Get the socket object via socket handle. amd::smi::AMDSmiSocket* socket = nullptr; amdsmi_status_t r = amd::smi::AMDSmiSystem::getInstance() .handle_to_socket(socket_handle, &socket); if (r != AMDSMI_STATUS_SUCCESS) return r; - r = socket->get_device_count(device_count); - if (r != AMDSMI_STATUS_SUCCESS) return r; - *device_handles = reinterpret_cast( - socket->get_devices().data()); + std::vector& devices = socket->get_devices(); + uint32_t device_size = static_cast(devices.size()); + // Get the device count only + if (device_handles == nullptr) { + *device_count = device_size; + return AMDSMI_STATUS_SUCCESS; + } + + // If the device_handles can hold all devices, return all of them. + *device_count = *device_count >= device_size ? device_size : *device_count; + + // Copy the device handles + for (uint32_t i = 0; i < *device_count; i++) { + device_handles[i] = reinterpret_cast(devices[i]); + } + return AMDSMI_STATUS_SUCCESS; } @@ -205,7 +230,6 @@ amdsmi_status_t amdsmi_get_device_type(amdsmi_device_handle device_handle , amdsmi_status_t amdsmi_get_device_bdf(amdsmi_device_handle device_handle, amdsmi_bdf_t *bdf) { - if (bdf == NULL) { return AMDSMI_STATUS_INVAL; } @@ -213,12 +237,8 @@ amdsmi_get_device_bdf(amdsmi_device_handle device_handle, amdsmi_bdf_t *bdf) { amd::smi::AMDSmiGPUDevice* gpu_device = static_cast(device_handle); - if (gpu_device->check_if_drm_is_supported()) { - *bdf = gpu_device->get_bdf(); - } - else { - //TODO - } + // get bdf from sysfs file + *bdf = gpu_device->get_bdf(); return AMDSMI_STATUS_SUCCESS; } @@ -503,25 +523,23 @@ amdsmi_get_asic_info(amdsmi_device_handle device_handle, amdsmi_asic_info_t *inf info->family = dev_info.family; info->rev_id = dev_info.pci_rev; } - else { - uint16_t vendor_id = 0; + // For other sysfs related information, get from rocm-smi + uint16_t vendor_id = 0; - amdsmi_status_t status = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle, + status = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle, info->asic_serial, AMDSMI_NORMAL_STRING_LENGTH); - status = rsmi_wrapper(rsmi_dev_brand_get, device_handle, + status = rsmi_wrapper(rsmi_dev_brand_get, device_handle, info->market_name, AMDSMI_NORMAL_STRING_LENGTH); - status = rsmi_wrapper(rsmi_dev_vendor_id_get, device_handle, + status = rsmi_wrapper(rsmi_dev_vendor_id_get, device_handle, &vendor_id); - if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id; - vendor_id = 0; + if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id; - status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle, + vendor_id = 0; + status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle, &vendor_id); - if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = vendor_id; - return status; - } + if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = vendor_id; return AMDSMI_STATUS_SUCCESS; } @@ -937,38 +955,28 @@ amdsmi_get_power_cap_info(amdsmi_device_handle device_handle, amd::smi::AMDSmiGPUDevice* gpudevice = static_cast(device_handle); amdsmi_status_t status; - if (gpudevice->check_if_drm_is_supported()){ - int power_cap = 0; - int dpm = 0; - status = smi_amdgpu_get_power_cap(gpudevice, &power_cap); - if (status != AMDSMI_STATUS_SUCCESS) { - return status; - } - info->power_cap = power_cap; + // Ignore errors to get as much as possible info. + memset(info, 0, sizeof(amdsmi_power_cap_info_t)); - status = smi_amdgpu_get_ranges(gpudevice, CLOCK_TYPE_GFX, + // Get power_cap and dpm + int power_cap = 0; + int dpm = 0; + status = smi_amdgpu_get_power_cap(gpudevice, &power_cap); + + info->power_cap = power_cap; + status = smi_amdgpu_get_ranges(gpudevice, CLOCK_TYPE_GFX, NULL, NULL, &dpm); - if (status != AMDSMI_STATUS_SUCCESS) { - return status; - } - info->dpm_cap = dpm; + info->dpm_cap = dpm; - return AMDSMI_STATUS_SUCCESS; - } - else { - // Ignore errors to get as much as possible info. - memset(info, 0, sizeof(amdsmi_power_cap_info_t)); - auto rsmi_status = rsmi_dev_power_cap_default_get(gpudevice->get_gpu_id(), + // Get other information from rocm-smi + auto rsmi_status = rsmi_dev_power_cap_default_get(gpudevice->get_gpu_id(), &(info->default_power_cap)); - rsmi_status = rsmi_dev_power_cap_range_get(gpudevice->get_gpu_id(), + rsmi_status = rsmi_dev_power_cap_range_get(gpudevice->get_gpu_id(), sensor_ind, &(info->max_power_cap), &(info->min_power_cap)); - rsmi_status = rsmi_dev_power_cap_get(gpudevice->get_gpu_id(), + rsmi_status = rsmi_dev_power_cap_get(gpudevice->get_gpu_id(), sensor_ind, &(info->power_cap)); - // TODO(bliu) : dpm_cap - } - return AMDSMI_STATUS_SUCCESS; } @@ -1278,7 +1286,8 @@ amdsmi_get_vbios_info(amdsmi_device_handle dev, amdsmi_vbios_info_t *info) { info->vbios_version = vbios.version; } else { - // rocm + // those information can only get the from libdrm + return AMDSMI_STATUS_NOT_SUPPORTED; } return AMDSMI_STATUS_SUCCESS; diff --git a/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc index abd4107c2d..4852a87927 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/err_cnt_read.cc @@ -158,7 +158,7 @@ void TestErrCntRead::Run(void) { std::cout << "\t**Error counts for " << GetBlockNameStr(static_cast(b)) << " block: " << std::endl; - std::cout << "\t\tCorrectable errors: " << ec.correctable_err + std::cout << "\t\tCorrectable errors: " << ec.correctable_count << std::endl; std::cout << "\t\tUncorrectable errors: " << ec.uncorrectable_count << std::endl; diff --git a/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc index facf69d21a..aa7a5420b6 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/process_info_read.cc @@ -103,7 +103,6 @@ void TestProcInfoRead::Run(void) { } uint32_t num_devices = num_monitor_devs(); - CHK_ERR_ASRT(err) err = amdsmi_compute_process_info_get(nullptr, &num_proc_found); if (err != AMDSMI_STATUS_SUCCESS) { diff --git a/projects/amdsmi/tests/amd_smi_test/test_base.cc b/projects/amdsmi/tests/amd_smi_test/test_base.cc index 695e6ff8da..5fcb513f1c 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_base.cc +++ b/projects/amdsmi/tests/amd_smi_test/test_base.cc @@ -104,8 +104,16 @@ void TestBase::SetUp(uint64_t init_flags) { } ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); - socket_count_ = 0; - err = amdsmi_get_socket_handles(&socket_count_, &sockets_); + + err = amdsmi_get_socket_handles(&socket_count_, nullptr); + if (err != AMDSMI_STATUS_SUCCESS) { + setup_failed_ = true; + } + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); + + // allocate memory + sockets_.resize(socket_count_); + err = amdsmi_get_socket_handles(&socket_count_, &sockets_[0]); if (err != AMDSMI_STATUS_SUCCESS) { setup_failed_ = true; } @@ -116,9 +124,16 @@ void TestBase::SetUp(uint64_t init_flags) { for (uint32_t i=0; i < socket_count_; i++) { // Get all devices of the socket uint32_t device_count = 0; - amdsmi_device_handle* device_handles = nullptr; err = amdsmi_get_device_handles(sockets_[i], - &device_count, &device_handles); + &device_count, nullptr); + if (err != AMDSMI_STATUS_SUCCESS) { + setup_failed_ = true; + } + ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS); + + std::vector device_handles(device_count); + err = amdsmi_get_device_handles(sockets_[i], + &device_count, &device_handles[0]); if (err != AMDSMI_STATUS_SUCCESS) { setup_failed_ = true; } diff --git a/projects/amdsmi/tests/amd_smi_test/test_base.h b/projects/amdsmi/tests/amd_smi_test/test_base.h index 822287f574..bf2a45ff88 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_base.h +++ b/projects/amdsmi/tests/amd_smi_test/test_base.h @@ -46,6 +46,7 @@ #define TESTS_AMD_SMI_TEST_TEST_BASE_H_ #include +#include #include "amd_smi/amd_smi.h" // The max devices can be monitored @@ -126,7 +127,7 @@ class TestBase { ///< device handles amdsmi_device_handle device_handles_[MAX_MONITOR_DEVICES]; uint32_t socket_count_; ///< socket count - amdsmi_socket_handle* sockets_; ///< sockets + std::vector sockets_; ///< sockets private: std::string description_; diff --git a/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt b/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt index e7321d0b97..3987505fb6 100755 --- a/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt +++ b/projects/amdsmi/tests/rocm_smi_test/CMakeLists.txt @@ -46,9 +46,13 @@ endif() # # Required Defines first: +if(NOT DEFINED RSMI_INC_DIR) + set(RSMI_INC_DIR ${ROCM_DIR}/include) +endif() +if(NOT DEFINED RSMI_LIB_DIR) + set(RSMI_LIB_DIR ${ROCM_DIR}/lib) +endif() -set(RSMI_INC_DIR ${ROCM_DIR}/include) -set(RSMI_LIB_DIR ${ROCM_DIR}/lib) # # Determine RSMI Header files are present # (no external source dependencies) @@ -64,14 +68,14 @@ else() endif() # if (${IS64BIT} EQUAL 0) - if(NOT EXISTS ${RSMI_LIB_DIR}/librocm_smi32.so) - message("ERROR: ${RSMI_LIB_DIR}/librocm_smi32.so doesn't exist. Check value of ROCM_DIR define") + if(NOT EXISTS ${RSMI_LIB_DIR}/libamd_smi32.so) + message("ERROR: ${RSMI_LIB_DIR}/libamd_smi32.so doesn't exist. Check value of ROCM_DIR define") return() endif() else() - if(NOT EXISTS ${RSMI_LIB_DIR}/librocm_smi64.so) + if(NOT EXISTS ${RSMI_LIB_DIR}/libamd_smi64.so) message("ERROR: Define RSMI_LIB_DIR pointing to RSMI library is not set") - message(" missing: ${RSMI_LIB_DIR}/librocm_smi64.so") + message(" missing: ${RSMI_LIB_DIR}/libamd_smi64.so") return() endif() endif() @@ -195,10 +199,10 @@ LINK_DIRECTORIES(${RSMI_LIB_DIR} ${GTEST_LIB_DIR}) # Extend the list of libraries to be used for linking rsmi apps # if (IS64BIT) - set(RSMITST_LIBS ${RSMITST_LIBS} rocm_smi64) + set(RSMITST_LIBS ${RSMITST_LIBS} amd_smi64) set(RSMITST "rsmitst64") else() - set(RSMITST_LIBS ${RSMITST_LIBS} rocm_smi32) + set(RSMITST_LIBS ${RSMITST_LIBS} amd_smi32) set(RSMITST "rsmitst") endif()