Change the get_socket_handles and get_device_handles APIs interface

Those two APIs are changed to let the user get the handles count,
allocate memory, and then return handles to the allocated memory.

Change-Id: Ibe28a89ad188c99da6af3af1740b2b25ff22ba06


[ROCm/amdsmi commit: 2b2d11c446]
This commit is contained in:
Bill(Shuzhou) Liu
2022-10-07 10:35:26 -05:00
bovenliggende bd4ff14bd0
commit b546e272e0
9 gewijzigde bestanden met toevoegingen van 175 en 95 verwijderingen
@@ -73,24 +73,37 @@ int main() {
// Get all sockets
uint32_t socket_count = 0;
amdsmi_socket_handle *sockets = nullptr;
ret = amdsmi_get_socket_handles(&socket_count, &sockets);
// Get the socket count available for the system.
ret = amdsmi_get_socket_handles(&socket_count, nullptr);
CHK_AMDSMI_RET(ret)
// Allocate the memory for the sockets
std::vector<amdsmi_socket_handle> sockets(socket_count);
// Get the sockets of the system
ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]);
CHK_AMDSMI_RET(ret)
std::cout << "Total Socket: " << socket_count << std::endl;
// For each socket, get identifier and devices
for (uint32_t i = 0; i < socket_count; i++) {
// Get Socket info
char socket_name[128];
ret = amdsmi_get_socket_info(sockets[i], socket_name, 128);
char socket_info[128];
ret = amdsmi_get_socket_info(sockets[i], socket_info, 128);
CHK_AMDSMI_RET(ret)
std::cout << "Socket " << socket_name << std::endl;
std::cout << "Socket " << socket_info << std::endl;
// Get all devices of the socket
// Get the device count available for the socket.
uint32_t device_count = 0;
amdsmi_device_handle *device_handles = nullptr;
ret = amdsmi_get_device_handles(sockets[i], &device_count,
&device_handles);
ret = amdsmi_get_device_handles(sockets[i], &device_count, nullptr);
CHK_AMDSMI_RET(ret)
// Allocate the memory for the device handlers on the socket
std::vector<amdsmi_device_handle> device_handles(device_count);
// Get all devices of the socket
ret = amdsmi_get_device_handles(sockets[i],
&device_count, &device_handles[0]);
CHK_AMDSMI_RET(ret)
// For each device of the socket, get name and temperature.
@@ -73,24 +73,37 @@ int main() {
// Get all sockets
uint32_t socket_count = 0;
amdsmi_socket_handle *sockets = nullptr;
ret = amdsmi_get_socket_handles(&socket_count, &sockets);
// Get the socket count available for the system.
ret = amdsmi_get_socket_handles(&socket_count, nullptr);
CHK_AMDSMI_RET(ret)
// Allocate the memory for the sockets
std::vector<amdsmi_socket_handle> sockets(socket_count);
// Get the sockets of the system
ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]);
CHK_AMDSMI_RET(ret)
std::cout << "Total Socket: " << socket_count << std::endl;
// For each socket, get identifier and devices
for (uint32_t i = 0; i < socket_count; i++) {
// Get Socket info
char socket_name[128];
ret = amdsmi_get_socket_info(sockets[i], socket_name, 128);
char socket_info[128];
ret = amdsmi_get_socket_info(sockets[i], socket_info, 128);
CHK_AMDSMI_RET(ret)
std::cout << "Socket " << socket_name << std::endl;
std::cout << "Socket " << socket_info << std::endl;
// Get all devices of the socket
// Get the device count available for the socket.
uint32_t device_count = 0;
amdsmi_device_handle *device_handles = nullptr;
ret = amdsmi_get_device_handles(sockets[i], &device_count,
&device_handles);
ret = amdsmi_get_device_handles(sockets[i], &device_count, nullptr);
CHK_AMDSMI_RET(ret)
// Allocate the memory for the device handlers on the socket
std::vector<amdsmi_device_handle> device_handles(device_count);
// Get all devices of the socket
ret = amdsmi_get_device_handles(sockets[i],
&device_count, &device_handles[0]);
CHK_AMDSMI_RET(ret)
// For each device of the socket, get name and temperature.
@@ -1170,23 +1170,34 @@ amdsmi_status_t amdsmi_shut_down(void);
/**
* @brief Get the list of socket handles in the system.
*
*
* @details Depends on what flag is pass to ::amdsmi_init(flags). AMDSMI_INIT_AMD_GPUS
* returns sockets with AMD GPUS, and AMDSMI_INIT_AMD_GPUS | AMDSMI_INIT_AMD_CPUS returns
* sockets with either AMD GPUS or CPUS.
* The socket handles can be used to query the device handles in that socket, which
* will be used in other APIs to get device detail information or telemtries.
*
* @param[out] socket_count The total count of sockets found in the system.
* @param[inout] socket_count As input, the value passed
* through this parameter is the number of ::amdsmi_socket_handle's that
* may be safely written to the memory pointed to by @p socket_handles. This is the
* limit on how many socket handles will be written to @p socket_handles. On return, @p
* socket_count will contain the number of socket handles written to @p socket_handles,
* or the number of socket handles that could have been written if enough memory had been
* provided.
* If @p socket_handles is NULL, as output, @p socket_count will contain
* how many sockets are available to read in the system.
*
* @param[out] socket_handles a list of socket handles in the system.
* @param[inout] socket_handles A pointer to a block of memory to which the
* ::amdsmi_socket_handle values will be written. This value may be NULL.
* In this case, this function can be used to query how many sockets are
* available to read in the system.
*
* @retval ::AMDSMI_STATUS_SUCCESS call was successful
* @retval ::AMDSMI_STATUS_INVAL the provided arguments are not valid
*
*/
amdsmi_status_t amdsmi_get_socket_handles(uint32_t *socket_count,
amdsmi_socket_handle* socket_handles[]);
amdsmi_socket_handle* socket_handles);
/**
* @brief Get the socket information
@@ -1216,20 +1227,35 @@ amdsmi_status_t amdsmi_get_socket_info(
* type devices: An APU on a socket have both CPUs and GPUs.
* Currently, only AMD GPUs are supported.
*
* The number of device count is returned through @p device_count
* if @p device_handles is NULL. Then the number of @p device_count can be pass
* as input to retrieval all devices on the socket to @p device_handles.
*
* @param[in] socket_handle The socket to query
*
* @param[out] device_count The total count of devices on the socket.
* @param[inout] device_count As input, the value passed
* through this parameter is the number of ::amdsmi_device_handle's that
* may be safely written to the memory pointed to by @p device_handles. This is the
* limit on how many device handles will be written to @p device_handles. On return, @p
* device_count will contain the number of device handles written to @p device_handles,
* or the number of device handles that could have been written if enough memory had been
* provided.
* If @p device_handles is NULL, as output, @p device_count will contain
* how many devices are available to read for the socket.
*
* @param[out] device_handles a list of devices handles on the socket.
* @param[inout] device_handles A pointer to a block of memory to which the
* ::device_handles values will be written. This value may be NULL.
* In this case, this function can be used to query how many devices are
* available to read.
*
* @retval ::AMDSMI_STATUS_SUCCESS call was successful
* @retval ::AMDSMI_STATUS_INVAL the provided arguments are not valid
*
*
*/
amdsmi_status_t amdsmi_get_device_handles(amdsmi_socket_handle socket_handle,
uint32_t *device_count,
amdsmi_device_handle* device_handles[]);
amdsmi_device_handle* device_handles);
/**
* @brief Get the device type
*
+63 -54
Bestand weergeven
@@ -138,15 +138,28 @@ amdsmi_status_string(amdsmi_status_t status, const char **status_string) {
}
amdsmi_status_t amdsmi_get_socket_handles(uint32_t *socket_count,
amdsmi_socket_handle* socket_handles[]) {
if (socket_count == nullptr || socket_handles == nullptr) {
amdsmi_socket_handle* socket_handles) {
if (socket_count == nullptr) {
return AMDSMI_STATUS_INVAL;
}
std::vector<amd::smi::AMDSmiSocket*>& sockets
= amd::smi::AMDSmiSystem::getInstance().get_sockets();
*socket_count = static_cast<uint32_t>(sockets.size());
*socket_handles = reinterpret_cast<amdsmi_socket_handle*>(sockets.data());
uint32_t socket_size = static_cast<uint32_t>(sockets.size());
// Get the socket size
if (socket_handles == nullptr) {
*socket_count = socket_size;
return AMDSMI_STATUS_SUCCESS;
}
// If the socket_handles can hold all sockets, return all of them.
*socket_count = *socket_count >= socket_size ? socket_size : *socket_count;
// Copy the socket handles
for (uint32_t i = 0; i < *socket_count; i++) {
socket_handles[i] = reinterpret_cast<amdsmi_socket_handle>(sockets[i]);
}
return AMDSMI_STATUS_SUCCESS;
}
@@ -168,24 +181,36 @@ amdsmi_status_t amdsmi_get_socket_info(
return AMDSMI_STATUS_SUCCESS;
}
amdsmi_status_t amdsmi_get_device_handles(amdsmi_socket_handle socket_handle,
uint32_t *device_count,
amdsmi_device_handle* device_handles[]) {
uint32_t* device_count,
amdsmi_device_handle* device_handles) {
if (device_count == nullptr) {
return AMDSMI_STATUS_INVAL;
}
// Get the socket object via socket handle.
amd::smi::AMDSmiSocket* socket = nullptr;
amdsmi_status_t r = amd::smi::AMDSmiSystem::getInstance()
.handle_to_socket(socket_handle, &socket);
if (r != AMDSMI_STATUS_SUCCESS) return r;
r = socket->get_device_count(device_count);
if (r != AMDSMI_STATUS_SUCCESS) return r;
*device_handles = reinterpret_cast<amdsmi_device_handle*>(
socket->get_devices().data());
std::vector<amd::smi::AMDSmiDevice*>& devices = socket->get_devices();
uint32_t device_size = static_cast<uint32_t>(devices.size());
// Get the device count only
if (device_handles == nullptr) {
*device_count = device_size;
return AMDSMI_STATUS_SUCCESS;
}
// If the device_handles can hold all devices, return all of them.
*device_count = *device_count >= device_size ? device_size : *device_count;
// Copy the device handles
for (uint32_t i = 0; i < *device_count; i++) {
device_handles[i] = reinterpret_cast<amdsmi_device_handle>(devices[i]);
}
return AMDSMI_STATUS_SUCCESS;
}
@@ -205,7 +230,6 @@ amdsmi_status_t amdsmi_get_device_type(amdsmi_device_handle device_handle ,
amdsmi_status_t
amdsmi_get_device_bdf(amdsmi_device_handle device_handle, amdsmi_bdf_t *bdf) {
if (bdf == NULL) {
return AMDSMI_STATUS_INVAL;
}
@@ -213,12 +237,8 @@ amdsmi_get_device_bdf(amdsmi_device_handle device_handle, amdsmi_bdf_t *bdf) {
amd::smi::AMDSmiGPUDevice* gpu_device =
static_cast<amd::smi::AMDSmiGPUDevice*>(device_handle);
if (gpu_device->check_if_drm_is_supported()) {
*bdf = gpu_device->get_bdf();
}
else {
//TODO
}
// get bdf from sysfs file
*bdf = gpu_device->get_bdf();
return AMDSMI_STATUS_SUCCESS;
}
@@ -503,25 +523,23 @@ amdsmi_get_asic_info(amdsmi_device_handle device_handle, amdsmi_asic_info_t *inf
info->family = dev_info.family;
info->rev_id = dev_info.pci_rev;
}
else {
uint16_t vendor_id = 0;
// For other sysfs related information, get from rocm-smi
uint16_t vendor_id = 0;
amdsmi_status_t status = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle,
status = rsmi_wrapper(rsmi_dev_serial_number_get, device_handle,
info->asic_serial, AMDSMI_NORMAL_STRING_LENGTH);
status = rsmi_wrapper(rsmi_dev_brand_get, device_handle,
status = rsmi_wrapper(rsmi_dev_brand_get, device_handle,
info->market_name, AMDSMI_NORMAL_STRING_LENGTH);
status = rsmi_wrapper(rsmi_dev_vendor_id_get, device_handle,
status = rsmi_wrapper(rsmi_dev_vendor_id_get, device_handle,
&vendor_id);
if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id;
vendor_id = 0;
if (status == AMDSMI_STATUS_SUCCESS) info->vendor_id = vendor_id;
status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle,
vendor_id = 0;
status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, device_handle,
&vendor_id);
if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = vendor_id;
return status;
}
if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = vendor_id;
return AMDSMI_STATUS_SUCCESS;
}
@@ -937,38 +955,28 @@ amdsmi_get_power_cap_info(amdsmi_device_handle device_handle,
amd::smi::AMDSmiGPUDevice* gpudevice =
static_cast<amd::smi::AMDSmiGPUDevice*>(device_handle);
amdsmi_status_t status;
if (gpudevice->check_if_drm_is_supported()){
int power_cap = 0;
int dpm = 0;
status = smi_amdgpu_get_power_cap(gpudevice, &power_cap);
if (status != AMDSMI_STATUS_SUCCESS) {
return status;
}
info->power_cap = power_cap;
// Ignore errors to get as much as possible info.
memset(info, 0, sizeof(amdsmi_power_cap_info_t));
status = smi_amdgpu_get_ranges(gpudevice, CLOCK_TYPE_GFX,
// Get power_cap and dpm
int power_cap = 0;
int dpm = 0;
status = smi_amdgpu_get_power_cap(gpudevice, &power_cap);
info->power_cap = power_cap;
status = smi_amdgpu_get_ranges(gpudevice, CLOCK_TYPE_GFX,
NULL, NULL, &dpm);
if (status != AMDSMI_STATUS_SUCCESS) {
return status;
}
info->dpm_cap = dpm;
info->dpm_cap = dpm;
return AMDSMI_STATUS_SUCCESS;
}
else {
// Ignore errors to get as much as possible info.
memset(info, 0, sizeof(amdsmi_power_cap_info_t));
auto rsmi_status = rsmi_dev_power_cap_default_get(gpudevice->get_gpu_id(),
// Get other information from rocm-smi
auto rsmi_status = rsmi_dev_power_cap_default_get(gpudevice->get_gpu_id(),
&(info->default_power_cap));
rsmi_status = rsmi_dev_power_cap_range_get(gpudevice->get_gpu_id(),
rsmi_status = rsmi_dev_power_cap_range_get(gpudevice->get_gpu_id(),
sensor_ind, &(info->max_power_cap), &(info->min_power_cap));
rsmi_status = rsmi_dev_power_cap_get(gpudevice->get_gpu_id(),
rsmi_status = rsmi_dev_power_cap_get(gpudevice->get_gpu_id(),
sensor_ind, &(info->power_cap));
// TODO(bliu) : dpm_cap
}
return AMDSMI_STATUS_SUCCESS;
}
@@ -1278,7 +1286,8 @@ amdsmi_get_vbios_info(amdsmi_device_handle dev, amdsmi_vbios_info_t *info) {
info->vbios_version = vbios.version;
}
else {
// rocm
// those information can only get the from libdrm
return AMDSMI_STATUS_NOT_SUPPORTED;
}
return AMDSMI_STATUS_SUCCESS;
@@ -158,7 +158,7 @@ void TestErrCntRead::Run(void) {
std::cout << "\t**Error counts for " <<
GetBlockNameStr(static_cast<amdsmi_gpu_block_t>(b)) << " block: "
<< std::endl;
std::cout << "\t\tCorrectable errors: " << ec.correctable_err
std::cout << "\t\tCorrectable errors: " << ec.correctable_count
<< std::endl;
std::cout << "\t\tUncorrectable errors: " << ec.uncorrectable_count
<< std::endl;
@@ -103,7 +103,6 @@ void TestProcInfoRead::Run(void) {
}
uint32_t num_devices = num_monitor_devs();
CHK_ERR_ASRT(err)
err = amdsmi_compute_process_info_get(nullptr, &num_proc_found);
if (err != AMDSMI_STATUS_SUCCESS) {
@@ -104,8 +104,16 @@ void TestBase::SetUp(uint64_t init_flags) {
}
ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS);
socket_count_ = 0;
err = amdsmi_get_socket_handles(&socket_count_, &sockets_);
err = amdsmi_get_socket_handles(&socket_count_, nullptr);
if (err != AMDSMI_STATUS_SUCCESS) {
setup_failed_ = true;
}
ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS);
// allocate memory
sockets_.resize(socket_count_);
err = amdsmi_get_socket_handles(&socket_count_, &sockets_[0]);
if (err != AMDSMI_STATUS_SUCCESS) {
setup_failed_ = true;
}
@@ -116,9 +124,16 @@ void TestBase::SetUp(uint64_t init_flags) {
for (uint32_t i=0; i < socket_count_; i++) {
// Get all devices of the socket
uint32_t device_count = 0;
amdsmi_device_handle* device_handles = nullptr;
err = amdsmi_get_device_handles(sockets_[i],
&device_count, &device_handles);
&device_count, nullptr);
if (err != AMDSMI_STATUS_SUCCESS) {
setup_failed_ = true;
}
ASSERT_EQ(err, AMDSMI_STATUS_SUCCESS);
std::vector<amdsmi_device_handle> device_handles(device_count);
err = amdsmi_get_device_handles(sockets_[i],
&device_count, &device_handles[0]);
if (err != AMDSMI_STATUS_SUCCESS) {
setup_failed_ = true;
}
@@ -46,6 +46,7 @@
#define TESTS_AMD_SMI_TEST_TEST_BASE_H_
#include <string>
#include <vector>
#include "amd_smi/amd_smi.h"
// The max devices can be monitored
@@ -126,7 +127,7 @@ class TestBase {
///< device handles
amdsmi_device_handle device_handles_[MAX_MONITOR_DEVICES];
uint32_t socket_count_; ///< socket count
amdsmi_socket_handle* sockets_; ///< sockets
std::vector<amdsmi_socket_handle> sockets_; ///< sockets
private:
std::string description_;
@@ -46,9 +46,13 @@ endif()
#
# Required Defines first:
if(NOT DEFINED RSMI_INC_DIR)
set(RSMI_INC_DIR ${ROCM_DIR}/include)
endif()
if(NOT DEFINED RSMI_LIB_DIR)
set(RSMI_LIB_DIR ${ROCM_DIR}/lib)
endif()
set(RSMI_INC_DIR ${ROCM_DIR}/include)
set(RSMI_LIB_DIR ${ROCM_DIR}/lib)
#
# Determine RSMI Header files are present
# (no external source dependencies)
@@ -64,14 +68,14 @@ else()
endif()
#
if (${IS64BIT} EQUAL 0)
if(NOT EXISTS ${RSMI_LIB_DIR}/librocm_smi32.so)
message("ERROR: ${RSMI_LIB_DIR}/librocm_smi32.so doesn't exist. Check value of ROCM_DIR define")
if(NOT EXISTS ${RSMI_LIB_DIR}/libamd_smi32.so)
message("ERROR: ${RSMI_LIB_DIR}/libamd_smi32.so doesn't exist. Check value of ROCM_DIR define")
return()
endif()
else()
if(NOT EXISTS ${RSMI_LIB_DIR}/librocm_smi64.so)
if(NOT EXISTS ${RSMI_LIB_DIR}/libamd_smi64.so)
message("ERROR: Define RSMI_LIB_DIR pointing to RSMI library is not set")
message(" missing: ${RSMI_LIB_DIR}/librocm_smi64.so")
message(" missing: ${RSMI_LIB_DIR}/libamd_smi64.so")
return()
endif()
endif()
@@ -195,10 +199,10 @@ LINK_DIRECTORIES(${RSMI_LIB_DIR} ${GTEST_LIB_DIR})
# Extend the list of libraries to be used for linking rsmi apps
#
if (IS64BIT)
set(RSMITST_LIBS ${RSMITST_LIBS} rocm_smi64)
set(RSMITST_LIBS ${RSMITST_LIBS} amd_smi64)
set(RSMITST "rsmitst64")
else()
set(RSMITST_LIBS ${RSMITST_LIBS} rocm_smi32)
set(RSMITST_LIBS ${RSMITST_LIBS} amd_smi32)
set(RSMITST "rsmitst")
endif()