Files
rocm-systems/src/amd_smi/amd_smi.cc
T
Saeed, Oosman 05ea00dcc4 [SWDEV_562432] update inband CPER meta data to be more consistent with OOB (#824)
* Added Product Serial Number to the raw_bytes cper entries
* Added Product Serial Number to the Python API return
---------

Signed-off-by: Saeed, Oosman <Oosman.Saeed@amd.com>
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
2025-11-17 13:25:56 -06:00

6583 wiersze
254 KiB
C++

// SPDX-License-Identifier: MIT
/*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <assert.h>
#include <errno.h>
#include <sys/utsname.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <cstdlib>
#include <string>
#include <algorithm>
#include <sstream>
#include <iomanip>
#include <iostream>
#include <fstream>
#include <queue>
#include <vector>
#include <set>
#include <map>
#include <memory>
#include <limits>
#include <functional>
#include <exception>
#include "config/amd_smi_config.h"
#include "amd_smi/amdsmi.h"
#include "amd_smi/impl/fdinfo.h"
#include "amd_smi/impl/amd_smi_common.h"
#include "amd_smi/impl/amd_smi_cper.h"
#include "amd_smi/impl/amd_smi_system.h"
#include "amd_smi/impl/amd_smi_socket.h"
#include "amd_smi/impl/amd_smi_gpu_device.h"
#include "amd_smi/impl/amd_smi_uuid.h"
#include "amd_smi/impl/xf86drm.h"
#include "amd_smi/impl/amd_smi_utils.h"
#include "amd_smi/impl/amd_smi_processor.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi_logger.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_kfd.h"
// a global instance of std::mutex to protect data passed during threads
// NOTE(review): no lock on this mutex is visible in this part of the file —
// confirm which shared data it is meant to guard.
std::mutex myMutex;
// To enable multiple init and shutdown calls, the reference count is used
// to track the number of times the library has been initialized.
// Incremented by amdsmi_init() and decremented by amdsmi_shut_down(); it is a
// plain int, not an atomic.
static int init_ref_count = 0;
// Size, in bytes, of the global proc_id buffer below.
#define SIZE 10
// Global character buffer, initialized to an empty string.
// NOTE(review): amdsmi_get_processor_info() declares its own local `proc_id`
// that shadows this one — confirm whether this global is still used.
char proc_id[SIZE] = "\0";
// Early-return guard for public entry points: makes the enclosing function
// return AMDSMI_STATUS_NOT_INIT when no successful amdsmi_init() is outstanding
// (init_ref_count == 0). Wrapped in do { } while (0) so it acts as a single
// statement.
#define AMDSMI_CHECK_INIT() do { \
if (init_ref_count == 0) { \
return AMDSMI_STATUS_NOT_INIT; \
} \
} while (0)
// Short display name for each accelerator (compute) partition type,
// including the MAX sentinel.
static const std::map<amdsmi_accelerator_partition_type_t, std::string> partition_types_map = {
{ AMDSMI_ACCELERATOR_PARTITION_SPX, "SPX" },
{ AMDSMI_ACCELERATOR_PARTITION_DPX, "DPX" },
{ AMDSMI_ACCELERATOR_PARTITION_TPX, "TPX" },
{ AMDSMI_ACCELERATOR_PARTITION_QPX, "QPX" },
{ AMDSMI_ACCELERATOR_PARTITION_CPX, "CPX" },
{ AMDSMI_ACCELERATOR_PARTITION_MAX, "MAX" },
};
// Translation from AMD SMI accelerator partition types to the matching
// ROCm SMI compute partition types. There is deliberately no entry for
// AMDSMI_ACCELERATOR_PARTITION_MAX, so lookups of that value will not be found.
static const std::map<amdsmi_accelerator_partition_type_t,
rsmi_compute_partition_type_t> accelerator_to_RSMI = {
{ AMDSMI_ACCELERATOR_PARTITION_SPX, RSMI_COMPUTE_PARTITION_SPX },
{ AMDSMI_ACCELERATOR_PARTITION_DPX, RSMI_COMPUTE_PARTITION_DPX },
{ AMDSMI_ACCELERATOR_PARTITION_TPX, RSMI_COMPUTE_PARTITION_TPX },
{ AMDSMI_ACCELERATOR_PARTITION_QPX, RSMI_COMPUTE_PARTITION_QPX },
{ AMDSMI_ACCELERATOR_PARTITION_CPX, RSMI_COMPUTE_PARTITION_CPX }
};
// Short display name for each accelerator partition resource type,
// including the MAX sentinel.
static const std::map<amdsmi_accelerator_partition_resource_type_t,
std::string> resource_types_map = {
{ AMDSMI_ACCELERATOR_XCC, "XCC" },
{ AMDSMI_ACCELERATOR_ENCODER, "ENCODER" },
{ AMDSMI_ACCELERATOR_DECODER, "DECODER" },
{ AMDSMI_ACCELERATOR_DMA, "DMA" },
{ AMDSMI_ACCELERATOR_JPEG, "JPEG" },
{ AMDSMI_ACCELERATOR_MAX, "MAX" },
};
// Translation from AMD SMI memory partition (NPS) modes to the matching
// ROCm SMI memory partition modes, including the UNKNOWN value.
static const std::map<amdsmi_memory_partition_type_t,
rsmi_memory_partition_type> nps_amdsmi_to_RSMI = {
{ AMDSMI_MEMORY_PARTITION_UNKNOWN, RSMI_MEMORY_PARTITION_UNKNOWN },
{ AMDSMI_MEMORY_PARTITION_NPS1, RSMI_MEMORY_PARTITION_NPS1 },
{ AMDSMI_MEMORY_PARTITION_NPS2, RSMI_MEMORY_PARTITION_NPS2 },
{ AMDSMI_MEMORY_PARTITION_NPS4, RSMI_MEMORY_PARTITION_NPS4 },
{ AMDSMI_MEMORY_PARTITION_NPS8, RSMI_MEMORY_PARTITION_NPS8 }
};
// Resolves an opaque processor handle to the GPU device object behind it.
//
// Returns:
//   AMDSMI_STATUS_NOT_INIT      - library not initialized
//   AMDSMI_STATUS_INVAL         - processor_handle or gpudevice is null
//   (lookup status)             - whatever handle_to_processor() reports on failure
//   AMDSMI_STATUS_NOT_SUPPORTED - the handle refers to a non-GPU processor
//   AMDSMI_STATUS_SUCCESS       - *gpudevice has been set
static amdsmi_status_t get_gpu_device_from_handle(amdsmi_processor_handle processor_handle,
                                                  amd::smi::AMDSmiGPUDevice** gpudevice) {
    AMDSMI_CHECK_INIT();

    std::ostringstream ss;
    const bool have_arguments = (processor_handle != nullptr) && (gpudevice != nullptr);
    if (!have_arguments) {
        ss << __PRETTY_FUNCTION__
           << " | processor_handle is NULL; returning: AMDSMI_STATUS_INVAL";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiProcessor* processor = nullptr;
    const amdsmi_status_t lookup_status = amd::smi::AMDSmiSystem::getInstance()
                                              .handle_to_processor(processor_handle, &processor);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    // Only GPU processors can be viewed as AMDSmiGPUDevice.
    if (processor->get_processor_type() != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
        ss << __PRETTY_FUNCTION__
           << " | returning AMDSMI_STATUS_NOT_SUPPORTED";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    *gpudevice = static_cast<amd::smi::AMDSmiGPUDevice*>(processor);
    return AMDSMI_STATUS_SUCCESS;
}
// Invokes a ROCm SMI function for the GPU behind `processor_handle` and
// converts its result to an AMD SMI status.
//
// Params:
//   f                - callable invoked as f(gpu_index, args...); its return
//                      value is passed to amd::smi::rsmi_to_amdsmi_status()
//   processor_handle - handle that must resolve to a GPU device
//   increment_gpu_id - offset added to the device's GPU id to form gpu_index
//   args             - forwarded unchanged to f after gpu_index
//
// Returns:
//   AMDSMI_STATUS_NOT_INIT  - library not initialized
//   (lookup status)         - failure from get_gpu_device_from_handle()
//   AMDSMI_STATUS_NOT_FOUND - gpu_index is not below the device count reported
//                             by rsmi_num_monitor_devices()
//   otherwise the translated status of f
template <typename F, typename ...Args>
amdsmi_status_t rsmi_wrapper(F && f,
    amdsmi_processor_handle processor_handle, uint32_t increment_gpu_id, Args &&... args) {
    AMDSMI_CHECK_INIT();

    std::ostringstream ss;
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
    ss << __PRETTY_FUNCTION__ << " | get_gpu_device_from_handle status = "
       << smi_amdgpu_get_status_string(r, false);
    LOG_INFO(ss);
    if (r != AMDSMI_STATUS_SUCCESS) return r;

    // If the count query fails the total stays 0 and the bounds check below
    // reports AMDSMI_STATUS_NOT_FOUND.
    uint32_t total_num_gpu_processors = 0;
    rsmi_num_monitor_devices(&total_num_gpu_processors);
    uint32_t gpu_index = gpu_device->get_gpu_id() + increment_gpu_id;
    ss << __PRETTY_FUNCTION__ << " | total_num_gpu_processors: " << total_num_gpu_processors
       << "; gpu_index: " << gpu_index;
    LOG_DEBUG(ss);

    // Compare without adding 1 to gpu_index: `gpu_index + 1` wraps to 0 when
    // gpu_index is UINT32_MAX, which would let an invalid index through.
    if (gpu_index >= total_num_gpu_processors) {
        ss << __PRETTY_FUNCTION__ << " | returning status = AMDSMI_STATUS_NOT_FOUND";
        LOG_INFO(ss);
        return AMDSMI_STATUS_NOT_FOUND;
    }

    auto rstatus = std::forward<F>(f)(gpu_index,
                    std::forward<Args>(args)...);
    r = amd::smi::rsmi_to_amdsmi_status(rstatus);
    std::string status_string = smi_amdgpu_get_status_string(r, false);
    ss << __PRETTY_FUNCTION__ << " | returning status = " << status_string;
    LOG_INFO(ss);
    return r;
}
// Initializes the library, or adds a reference if it is already initialized.
//
// The first successful call runs AMDSmiSystem::init(flags); later calls only
// bump the reference count (their `flags` are not applied). Each successful
// call should be balanced by amdsmi_shut_down().
//
// Returns:
//   AMDSMI_STATUS_SUCCESS           - initialized, or reference added
//   AMDSMI_STATUS_REFCOUNT_OVERFLOW - the reference count is already at its
//                                     maximum and cannot be incremented
//   otherwise the failure status from AMDSmiSystem::init()
amdsmi_status_t
amdsmi_init(uint64_t flags) {
    if (init_ref_count > 0) {
        // Incrementing a signed int past its maximum is undefined behavior,
        // so refuse the extra reference instead.
        if (init_ref_count == std::numeric_limits<int>::max()) {
            return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
        }
        init_ref_count++;
        return AMDSMI_STATUS_SUCCESS;
    }

    amdsmi_status_t status = amd::smi::AMDSmiSystem::getInstance().init(flags);
    // Only count the reference when initialization actually succeeded.
    if (status == AMDSMI_STATUS_SUCCESS) {
        init_ref_count++;
    }
    return status;
}
// Releases one reference taken by amdsmi_init().
//
// Calling this when the library is not initialized is a no-op that reports
// success. The underlying system is cleaned up only when the last reference
// is released; in that case the status of AMDSmiSystem::cleanup() is returned.
amdsmi_status_t
amdsmi_shut_down() {
    // Never initialized (or already fully shut down): nothing to do.
    if (init_ref_count == 0) {
        return AMDSMI_STATUS_SUCCESS;
    }

    // Drop this caller's reference; keep the library alive for other users.
    if (--init_ref_count > 0) {
        return AMDSMI_STATUS_SUCCESS;
    }

    // Last reference released: tear everything down.
    return amd::smi::AMDSmiSystem::getInstance().cleanup();
}
// Returns a human-readable description for an AMD SMI status code.
//
// Params:
//   status        - status code to describe
//   status_string - out: set to point at a string literal (or at the string
//                   provided by rsmi_status_string() for codes that only exist
//                   in the rsmi status map)
//
// Returns:
//   AMDSMI_STATUS_INVAL         - status_string is null
//   AMDSMI_STATUS_SUCCESS       - a description was found
//   AMDSMI_STATUS_UNKNOWN_ERROR - the code is not known here nor in the rsmi
//                                 status map; *status_string is still set to a
//                                 generic message
//
// Note: this function does not require the library to be initialized.
amdsmi_status_t
amdsmi_status_code_to_string(amdsmi_status_t status, const char **status_string) {
    // Every branch below writes through status_string, so reject null up front.
    if (status_string == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    switch (status) {
        case AMDSMI_STATUS_SUCCESS:
            *status_string = "AMDSMI_STATUS_SUCCESS: Call succeeded.";
            break;
        case AMDSMI_STATUS_INVAL:
            *status_string = "AMDSMI_STATUS_INVAL: Invalid parameters.";
            break;
        case AMDSMI_STATUS_NOT_SUPPORTED:
            *status_string = "AMDSMI_STATUS_NOT_SUPPORTED: Command not supported.";
            break;
        case AMDSMI_STATUS_NOT_YET_IMPLEMENTED:
            *status_string = "AMDSMI_STATUS_NOT_YET_IMPLEMENTED: Not implemented yet.";
            break;
        case AMDSMI_STATUS_FAIL_LOAD_MODULE:
            *status_string = "AMDSMI_STATUS_FAIL_LOAD_MODULE: Fail to load lib module.";
            break;
        case AMDSMI_STATUS_FAIL_LOAD_SYMBOL:
            *status_string = "AMDSMI_STATUS_FAIL_LOAD_SYMBOL: Fail to load symbol.";
            break;
        case AMDSMI_STATUS_DRM_ERROR:
            *status_string = "AMDSMI_STATUS_DRM_ERROR: Error when calling libdrm function.";
            break;
        case AMDSMI_STATUS_API_FAILED:
            *status_string = "AMDSMI_STATUS_API_FAILED: API call failed.";
            break;
        case AMDSMI_STATUS_RETRY:
            *status_string = "AMDSMI_STATUS_RETRY: Retry operation.";
            break;
        case AMDSMI_STATUS_NO_PERM:
            *status_string = "AMDSMI_STATUS_NO_PERM: Permission Denied.";
            break;
        case AMDSMI_STATUS_INTERRUPT:
            *status_string = "AMDSMI_STATUS_INTERRUPT: An interrupt occurred during"
                " execution of function.";
            break;
        case AMDSMI_STATUS_IO:
            *status_string = "AMDSMI_STATUS_IO: I/O Error.";
            break;
        case AMDSMI_STATUS_ADDRESS_FAULT:
            *status_string = "AMDSMI_STATUS_ADDRESS_FAULT: Bad address.";
            break;
        case AMDSMI_STATUS_FILE_ERROR:
            *status_string = "AMDSMI_STATUS_FILE_ERROR: Problem accessing a file.";
            break;
        case AMDSMI_STATUS_OUT_OF_RESOURCES:
            *status_string = "AMDSMI_STATUS_OUT_OF_RESOURCES: Not enough memory.";
            break;
        case AMDSMI_STATUS_INTERNAL_EXCEPTION:
            *status_string = "AMDSMI_STATUS_INTERNAL_EXCEPTION: An internal exception was caught.";
            break;
        case AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS:
            *status_string = "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided"
                " input is out of allowable or safe range.";
            break;
        case AMDSMI_STATUS_INIT_ERROR:
            *status_string = "AMDSMI_STATUS_INIT_ERROR: An error occurred when"
                " initializing internal data structures.";
            break;
        case AMDSMI_STATUS_REFCOUNT_OVERFLOW:
            *status_string = "AMDSMI_STATUS_REFCOUNT_OVERFLOW: An internal reference"
                " counter exceeded INT32_MAX.";
            break;
        case AMDSMI_STATUS_DIRECTORY_NOT_FOUND:
            *status_string = "AMDSMI_STATUS_DIRECTORY_NOT_FOUND: Error when a"
                " directory is not found, maps to ENOTDIR.";
            break;
        case AMDSMI_STATUS_BUSY:
            *status_string = "AMDSMI_STATUS_BUSY: Processor busy.";
            break;
        case AMDSMI_STATUS_NOT_FOUND:
            *status_string = "AMDSMI_STATUS_NOT_FOUND: Processor Not found.";
            break;
        case AMDSMI_STATUS_NOT_INIT:
            *status_string = "AMDSMI_STATUS_NOT_INIT: Processor not initialized.";
            break;
        case AMDSMI_STATUS_NO_SLOT:
            *status_string = "AMDSMI_STATUS_NO_SLOT: No more free slot.";
            break;
        case AMDSMI_STATUS_DRIVER_NOT_LOADED:
            *status_string = "AMDSMI_STATUS_DRIVER_NOT_LOADED: Processor driver not loaded.";
            break;
        case AMDSMI_STATUS_NO_DATA:
            *status_string = "AMDSMI_STATUS_NO_DATA: No data was found for a given input.";
            break;
        case AMDSMI_STATUS_INSUFFICIENT_SIZE:
            *status_string = "AMDSMI_STATUS_INSUFFICIENT_SIZE: Not enough resources"
                " were available for the operation.";
            break;
        case AMDSMI_STATUS_UNEXPECTED_SIZE:
            *status_string = "AMDSMI_STATUS_UNEXPECTED_SIZE: An unexpected amount of data"
                " was read.";
            break;
        case AMDSMI_STATUS_UNEXPECTED_DATA:
            *status_string = "AMDSMI_STATUS_UNEXPECTED_DATA: The data read or provided to"
                " function is not what was expected.";
            break;
        case AMDSMI_STATUS_NON_AMD_CPU:
            *status_string = "AMDSMI_STATUS_NON_AMD_CPU: System has different cpu than AMD.";
            break;
        case AMDSMI_STATUS_NO_ENERGY_DRV:
            *status_string = "AMDSMI_STATUS_NO_ENERGY_DRV: Energy driver not found.";
            break;
        case AMDSMI_STATUS_NO_MSR_DRV:
            *status_string = "AMDSMI_STATUS_NO_MSR_DRV: MSR driver not found.";
            break;
        case AMDSMI_STATUS_NO_HSMP_DRV:
            *status_string = "AMDSMI_STATUS_NO_HSMP_DRV: HSMP driver not found.";
            break;
        case AMDSMI_STATUS_NO_HSMP_SUP:
            *status_string = "AMDSMI_STATUS_NO_HSMP_SUP: HSMP not supported.";
            break;
        case AMDSMI_STATUS_NO_HSMP_MSG_SUP:
            *status_string = "AMDSMI_STATUS_NO_HSMP_MSG_SUP: HSMP message/feature not supported.";
            break;
        case AMDSMI_STATUS_HSMP_TIMEOUT:
            *status_string = "AMDSMI_STATUS_HSMP_TIMEOUT: HSMP message timed out.";
            break;
        case AMDSMI_STATUS_NO_DRV:
            *status_string = "AMDSMI_STATUS_NO_DRV: No Energy and HSMP driver present.";
            break;
        case AMDSMI_STATUS_FILE_NOT_FOUND:
            *status_string = "AMDSMI_STATUS_FILE_NOT_FOUND: file or directory not found.";
            break;
        case AMDSMI_STATUS_ARG_PTR_NULL:
            *status_string = "AMDSMI_STATUS_ARG_PTR_NULL: Parsed argument is invalid.";
            break;
        case AMDSMI_STATUS_AMDGPU_RESTART_ERR:
            *status_string = "AMDSMI_STATUS_AMDGPU_RESTART_ERR: AMDGPU restart failed.";
            break;
        case AMDSMI_STATUS_SETTING_UNAVAILABLE:
            *status_string = "AMDSMI_STATUS_SETTING_UNAVAILABLE: Setting is not available.";
            break;
        case AMDSMI_STATUS_CORRUPTED_EEPROM:
            *status_string = "AMDSMI_STATUS_CORRUPTED_EEPROM: EEPROM is corrupted.";
            break;
        case AMDSMI_STATUS_MAP_ERROR:
            *status_string = "AMDSMI_STATUS_MAP_ERROR: The internal library error did"
                " not map to a status code.";
            break;
        case AMDSMI_STATUS_UNKNOWN_ERROR:
            *status_string = "AMDSMI_STATUS_UNKNOWN_ERROR: An unknown error occurred.";
            break;
        default:
            // The case above didn't have a match, so look up the amdsmi status in the rsmi
            // status map
            // If found, get the rsmi status string. If not, return unknown error string
            for (auto& iter : amd::smi::rsmi_status_map) {
                if (iter.second == status) {
                    rsmi_status_string(iter.first, status_string);
                    return AMDSMI_STATUS_SUCCESS;
                }
            }
            // Not found
            *status_string = "An unknown error occurred";
            return AMDSMI_STATUS_UNKNOWN_ERROR;
    }
    return AMDSMI_STATUS_SUCCESS;
}
// Lists the sockets known to the system.
//
// Params:
//   socket_count   - in/out. With socket_handles == nullptr it receives the
//                    total number of sockets. Otherwise it gives the capacity
//                    of socket_handles on entry and the number of handles
//                    written on return.
//   socket_handles - out, optional: array receiving the handles.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if socket_count is null, AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_socket_handles(uint32_t *socket_count,
                amdsmi_socket_handle* socket_handles) {
    AMDSMI_CHECK_INIT();

    if (socket_count == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    std::vector<amd::smi::AMDSmiSocket*>& known_sockets
        = amd::smi::AMDSmiSystem::getInstance().get_sockets();
    const uint32_t available = static_cast<uint32_t>(known_sockets.size());

    // Count-only query.
    if (socket_handles == nullptr) {
        *socket_count = available;
        return AMDSMI_STATUS_SUCCESS;
    }

    // Never write more handles than the caller's array can hold.
    *socket_count = std::min(*socket_count, available);
    for (uint32_t idx = 0; idx < *socket_count; ++idx) {
        socket_handles[idx] = reinterpret_cast<amdsmi_socket_handle>(known_sockets[idx]);
    }
    return AMDSMI_STATUS_SUCCESS;
}
// Copies the identifier of a socket into a caller-provided buffer.
//
// Params:
//   socket_handle - socket to query
//   len           - size of `name` in bytes
//   name          - out: receives the socket id; when len > 0 the result is
//                   always NUL-terminated, truncating the id if it does not fit
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if socket_handle or name is null, the failure status of
// handle_to_socket() if the handle cannot be resolved, AMDSMI_STATUS_SUCCESS
// otherwise.
amdsmi_status_t amdsmi_get_socket_info(
                amdsmi_socket_handle socket_handle,
                size_t len, char *name) {
    AMDSMI_CHECK_INIT();

    if (socket_handle == nullptr || name == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiSocket* socket = nullptr;
    amdsmi_status_t r = amd::smi::AMDSmiSystem::getInstance()
                    .handle_to_socket(socket_handle, &socket);
    if (r != AMDSMI_STATUS_SUCCESS) return r;

    strncpy(name, socket->get_socket_id().c_str(), len);
    // strncpy does not write a terminator when the source is len characters
    // or longer; terminate explicitly so callers always get a valid C string.
    if (len > 0) {
        name[len - 1] = '\0';
    }

    return AMDSMI_STATUS_SUCCESS;
}
#ifdef ENABLE_ESMI_LIB
// Writes the processor's index, formatted as decimal text, into a
// caller-provided buffer.
//
// Params:
//   processor_handle - processor to query
//   len              - size of `name` in bytes
//   name             - out: receives the index text; when len > 0 the result is
//                      always NUL-terminated, truncating if it does not fit
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if processor_handle or name is null, the failure status
// of handle_to_processor() if the handle cannot be resolved,
// AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_processor_info(
                amdsmi_processor_handle processor_handle,
                size_t len, char *name) {
    // Local scratch buffer; named differently from the file-level `proc_id`
    // global so it does not shadow it.
    char index_text[16] = {0};

    AMDSMI_CHECK_INIT();

    if (processor_handle == nullptr || name == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiProcessor* processor = nullptr;
    amdsmi_status_t r = amd::smi::AMDSmiSystem::getInstance()
                    .handle_to_processor(processor_handle, &processor);
    if (r != AMDSMI_STATUS_SUCCESS) return r;

    snprintf(index_text, sizeof(index_text), "%d", processor->get_processor_index());
    strncpy(name, index_text, len);
    // strncpy does not write a terminator when the source is len characters
    // or longer; terminate explicitly so callers always get a valid C string.
    if (len > 0) {
        name[len - 1] = '\0';
    }

    return AMDSMI_STATUS_SUCCESS;
}
#endif
// Lists the processors that belong to a socket.
//
// Params:
//   socket_handle     - socket whose processors are requested
//   processor_count   - in/out. With processor_handles == nullptr it receives
//                       the total number of processors. Otherwise it gives the
//                       capacity of processor_handles on entry and the number
//                       of handles written on return.
//   processor_handles - out, optional: array receiving the handles.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if processor_count is null, the failure status of
// handle_to_socket() if the socket cannot be resolved, AMDSMI_STATUS_SUCCESS
// otherwise.
amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
                                    uint32_t* processor_count,
                                    amdsmi_processor_handle* processor_handles) {
    AMDSMI_CHECK_INIT();

    if (processor_count == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Resolve the socket object behind the handle.
    amd::smi::AMDSmiSocket* owning_socket = nullptr;
    const amdsmi_status_t lookup_status = amd::smi::AMDSmiSystem::getInstance()
                    .handle_to_socket(socket_handle, &owning_socket);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    std::vector<amd::smi::AMDSmiProcessor*>& members = owning_socket->get_processors();
    const uint32_t available = static_cast<uint32_t>(members.size());

    // Count-only query.
    if (processor_handles == nullptr) {
        *processor_count = available;
        return AMDSMI_STATUS_SUCCESS;
    }

    // Never write more handles than the caller's array can hold.
    *processor_count = std::min(*processor_count, available);
    for (uint32_t idx = 0; idx < *processor_count; ++idx) {
        processor_handles[idx] = reinterpret_cast<amdsmi_processor_handle>(members[idx]);
    }
    return AMDSMI_STATUS_SUCCESS;
}
// Returns a node handle for the board that hosts the given GPU.
//
// The handle is a pointer to a std::string holding the sysfs board directory
// (/sys/class/drm/renderD<N>/device/board). The string is kept in a
// function-local static registry keyed by that path, so the same path always
// yields the same handle.
//
// Returns:
//   AMDSMI_STATUS_NOT_INIT      - library not initialized
//   AMDSMI_STATUS_INVAL         - node_handle is null
//   (query status)              - failure from amdsmi_get_gpu_asic_info() or
//                                 amdsmi_get_gpu_enumeration_info()
//   AMDSMI_STATUS_NOT_SUPPORTED - the GPU's oam_id is not 0, the board
//                                 directory or its npm_status entry is missing,
//                                 or probing the filesystem threw
//   AMDSMI_STATUS_SUCCESS       - *node_handle has been set
amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle,
                                       amdsmi_node_handle *node_handle) {
    AMDSMI_CHECK_INIT();

    if (node_handle == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Only the GPU whose OAM ID is 0 may produce a node handle.
    amdsmi_asic_info_t asic_info;
    amdsmi_status_t query_status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
    if (query_status != AMDSMI_STATUS_SUCCESS) {
        return query_status;
    }
    if (asic_info.oam_id != 0) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // The DRM render minor identifies the renderD* sysfs entry.
    amdsmi_enumeration_info_t enumeration_info;
    query_status = amdsmi_get_gpu_enumeration_info(processor_handle, &enumeration_info);
    if (query_status != AMDSMI_STATUS_SUCCESS) {
        return query_status;
    }

    namespace fs = std::filesystem;

    // /sys/class/drm/renderD<N>/device
    const std::string render_node = "renderD" + std::to_string(enumeration_info.drm_render);
    fs::path drm_device_path = fs::path("/sys/class/drm") / render_node / "device";

    fs::path found_board;
    try {
        fs::path board_dir = drm_device_path / "board";
        fs::path npm_status = board_dir / "npm_status";

        // The board counts as present only if the directory and its
        // npm_status entry both exist.
        const bool board_present = fs::exists(board_dir) && fs::is_directory(board_dir)
                                   && fs::exists(npm_status);
        if (board_present) {
            found_board = board_dir;
        }
    } catch (...) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    if (found_board.empty()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // Registry owning the path strings that the returned handles point at.
    static std::mutex g_node_mu;
    static std::map<std::string, std::unique_ptr<std::string>> g_node_registry;

    std::string board_path = found_board.string();
    {
        std::lock_guard<std::mutex> lk(g_node_mu);
        auto entry = g_node_registry.find(board_path);
        if (entry == g_node_registry.end()) {
            // First request for this board: create and register its string.
            entry = g_node_registry
                        .emplace(board_path, std::make_unique<std::string>(board_path))
                        .first;
        }
        *node_handle = reinterpret_cast<amdsmi_node_handle>(entry->second.get());
    }

    return AMDSMI_STATUS_SUCCESS;
}
#ifdef ENABLE_ESMI_LIB
// Counts how many of the given processor handles are CPU sockets, CPU cores
// and GPUs.
//
// Params:
//   processor_handles - array of handles to classify
//   processor_count   - number of entries in processor_handles
//   nr_cpusockets     - out: handles of type AMDSMI_PROCESSOR_TYPE_AMD_CPU
//   nr_cpucores       - out: handles of type AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE
//   nr_gpus           - out: handles of type AMDSMI_PROCESSOR_TYPE_AMD_GPU
//
// Handles of any other type are not counted. The outputs are written only
// when every handle was classified successfully.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if any pointer argument is null, the failure status of
// amdsmi_get_processor_type() for the first handle that cannot be classified,
// AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_processor_count_from_handles(amdsmi_processor_handle* processor_handles,
                                               uint32_t* processor_count, uint32_t* nr_cpusockets,
                                               uint32_t* nr_cpucores, uint32_t* nr_gpus) {
    AMDSMI_CHECK_INIT();

    uint32_t count_cpusockets = 0;
    uint32_t count_cpucores = 0;
    uint32_t count_gpus = 0;
    processor_type_t processor_type;

    // All three result pointers are written unconditionally below, so they
    // must be validated together with the inputs.
    if (processor_count == nullptr || processor_handles == nullptr ||
        nr_cpusockets == nullptr || nr_cpucores == nullptr || nr_gpus == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    for (uint32_t i = 0; i < *processor_count; i++) {
        amdsmi_status_t r = amdsmi_get_processor_type(processor_handles[i], &processor_type);
        if (r != AMDSMI_STATUS_SUCCESS) return r;

        if (processor_type == AMDSMI_PROCESSOR_TYPE_AMD_CPU) {
            count_cpusockets++;
        } else if (processor_type == AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE) {
            count_cpucores++;
        } else if (processor_type == AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
            count_gpus++;
        }
    }

    *nr_cpusockets = count_cpusockets;
    *nr_cpucores = count_cpucores;
    *nr_gpus = count_gpus;
    return AMDSMI_STATUS_SUCCESS;
}
// Lists the processors of one type that belong to a socket.
//
// Params:
//   socket_handle     - socket whose processors are requested
//   processor_type    - type of processor to return
//   processor_handles - out, optional: array receiving the handles
//   processor_count   - in/out. With processor_handles == nullptr it receives
//                       the number of matching processors. Otherwise it gives
//                       the capacity of processor_handles on entry and the
//                       number of handles written on return.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if processor_count is null, the failure status of
// handle_to_socket() if the socket cannot be resolved, AMDSMI_STATUS_SUCCESS
// otherwise.
amdsmi_status_t amdsmi_get_processor_handles_by_type(amdsmi_socket_handle socket_handle,
                                    processor_type_t processor_type,
                                    amdsmi_processor_handle* processor_handles,
                                    uint32_t* processor_count) {
    AMDSMI_CHECK_INIT();

    if (processor_count == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Resolve the socket object behind the handle.
    amd::smi::AMDSmiSocket* owning_socket = nullptr;
    const amdsmi_status_t lookup_status =
        amd::smi::AMDSmiSystem::getInstance().handle_to_socket(socket_handle, &owning_socket);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    std::vector<amd::smi::AMDSmiProcessor*>& matching =
        owning_socket->get_processors(processor_type);
    const uint32_t available = static_cast<uint32_t>(matching.size());

    // Count-only query.
    if (processor_handles == nullptr) {
        *processor_count = available;
        return AMDSMI_STATUS_SUCCESS;
    }

    // Never write more handles than the caller's array can hold.
    *processor_count = std::min(*processor_count, available);
    for (uint32_t idx = 0; idx < *processor_count; ++idx) {
        processor_handles[idx] = reinterpret_cast<amdsmi_processor_handle>(matching[idx]);
    }
    return AMDSMI_STATUS_SUCCESS;
}
#endif
// Reports the type of the processor behind a handle.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if processor_type is null, the failure status of
// handle_to_processor() if the handle cannot be resolved, and
// AMDSMI_STATUS_SUCCESS with *processor_type set otherwise.
amdsmi_status_t amdsmi_get_processor_type(amdsmi_processor_handle processor_handle,
                                          processor_type_t* processor_type) {
    AMDSMI_CHECK_INIT();

    if (processor_type == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiProcessor* resolved = nullptr;
    const amdsmi_status_t lookup_status = amd::smi::AMDSmiSystem::getInstance()
                    .handle_to_processor(processor_handle, &resolved);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    *processor_type = resolved->get_processor_type();
    return AMDSMI_STATUS_SUCCESS;
}
// Copies the BDF stored on the GPU device object into *bdf.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if bdf is null, the failure status of
// get_gpu_device_from_handle() if the handle is not a usable GPU, and
// AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t
amdsmi_get_gpu_device_bdf(amdsmi_processor_handle processor_handle, amdsmi_bdf_t *bdf) {
    AMDSMI_CHECK_INIT();

    if (bdf == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiGPUDevice* gpu = nullptr;
    const amdsmi_status_t lookup_status = get_gpu_device_from_handle(processor_handle, &gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    *bdf = gpu->get_bdf();
    return AMDSMI_STATUS_SUCCESS;
}
// Builds the UUID string of a GPU from its unique id and device id.
//
// Params:
//   processor_handle - GPU to query
//   uuid_length      - size of `uuid`; must be at least AMDSMI_GPU_UUID_SIZE
//   uuid             - out: receives the UUID text produced by amdsmi_uuid_gen()
//
// A failure to read the device id is tolerated (the id then stays at the
// uint16_t maximum); a failure to read the unique id aborts with that status.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL for a null pointer or a too-small buffer, the failing
// status of rsmi_dev_unique_id_get, otherwise the status of amdsmi_uuid_gen().
amdsmi_status_t
amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle,
                           unsigned int *uuid_length,
                           char *uuid) {
    AMDSMI_CHECK_INIT();

    if (uuid_length == nullptr || uuid == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }
    if (*uuid_length < AMDSMI_GPU_UUID_SIZE) {
        return AMDSMI_STATUS_INVAL;
    }

    std::ostringstream ss;
    uint64_t device_uuid = 0;
    uint16_t device_id = std::numeric_limits<uint16_t>::max();

    // Device id: best effort, fall back to the sentinel on failure.
    amdsmi_status_t status = rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0, &device_id);
    if (status != AMDSMI_STATUS_SUCCESS) {
        ss << __PRETTY_FUNCTION__
           << " | rsmi_dev_id_get(): "
           << smi_amdgpu_get_status_string(status, false);
        LOG_INFO(ss);
        device_id = std::numeric_limits<uint16_t>::max();
    }
    ss << __PRETTY_FUNCTION__
       << " | device_id (dec): " << device_id << "\n"
       << "; device_id (hex): 0x" << std::hex << device_id << std::dec << "\n"
       << "; rsmi_dev_id_get() status: "
       << smi_amdgpu_get_status_string(status, false) << "\n";

    // Unique id: mandatory, without it no UUID can be produced.
    status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0,
                          &device_uuid);
    if (status != AMDSMI_STATUS_SUCCESS) {
        LOG_INFO(ss);
        return status;
    }
    ss << "; device_uuid (dec): " << device_uuid << "\n"
       << "; device_uuid (hex): 0x" << std::hex << device_uuid << std::dec << "\n"
       << "; rsmi_dev_unique_id_get() status: "
       << smi_amdgpu_get_status_string(status, false) << "\n";

    // Function number passed to the generator.
    const uint8_t fcn = 0xff;

    // Produce the UUID text from the unique id, device id and function number.
    status = amdsmi_uuid_gen(uuid, device_uuid, device_id, fcn);
    ss << "; uuid: " << uuid << "\n"
       << "; amdsmi_uuid_gen() status: "
       << smi_amdgpu_get_status_string(status, false) << "\n";
    LOG_INFO(ss);

    return status;
}
// Fills in the enumeration identifiers of a GPU: DRM card and render minor,
// HSA id, HIP id and HIP UUID.
//
// Params:
//   processor_handle - GPU to query
//   info             - out: structure to fill
//
// hsa_id and hip_id are set to 0xffffffff ("not supported") whenever they
// cannot be determined: KFD node discovery failed, the KFD info query failed,
// or (hip_id only) no node id could be read to compute the offset from.
// hip_uuid is "GPU-" followed by the 16-digit hex unique id; the status of the
// unique-id query is only logged, not returned.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if info is null, the failure status of
// get_gpu_device_from_handle() if the handle is not a usable GPU, and
// AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t
amdsmi_get_gpu_enumeration_info(amdsmi_processor_handle processor_handle,
                                amdsmi_enumeration_info_t *info){
    // Ensure library initialization
    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amdsmi_status_t status;
    std::ostringstream ss;

    // Retrieve GPU device from the processor handle
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    status = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    // Retrieve DRM Card ID
    info->drm_card = gpu_device->get_card_id();

    // Retrieve DRM Render ID
    info->drm_render = gpu_device->get_drm_render_minor();

    // Default to 0xffffffff as not supported. Set before node discovery so
    // the fields are never left uninitialized when discovery fails.
    info->hsa_id = std::numeric_limits<uint32_t>::max();
    info->hip_id = std::numeric_limits<uint32_t>::max();

    // Retrieve HIP ID (difference from the smallest node ID) and HSA ID
    std::map<uint64_t, std::shared_ptr<amd::smi::KFDNode>> nodes;
    if (amd::smi::DiscoverKFDNodes(&nodes) == 0) {
        uint32_t smallest_node_id = std::numeric_limits<uint32_t>::max();
        for (const auto& node_pair : nodes) {
            uint32_t node_id = 0;
            if (node_pair.second->get_node_id(&node_id) == 0) {
                smallest_node_id = std::min(smallest_node_id, node_id);
            }
        }

        amdsmi_kfd_info_t kfd_info;
        status = amdsmi_get_gpu_kfd_info(processor_handle, &kfd_info);
        if (status == AMDSMI_STATUS_SUCCESS) {
            info->hsa_id = kfd_info.node_id;
            // If no node id could be read, smallest_node_id is still the
            // sentinel and the subtraction would wrap around; keep the
            // "not supported" default in that case.
            if (smallest_node_id <= kfd_info.node_id) {
                info->hip_id = kfd_info.node_id - smallest_node_id;
            }
        }
    }

    // Retrieve HIP UUID
    std::ostringstream ss_uuid;
    uint64_t device_uuid = 0;
    std::string hip_uuid_str;
    status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, &device_uuid);
    ss_uuid << "GPU-" << std::hex << std::setw(16) << std::setfill('0') << device_uuid;
    hip_uuid_str = ss_uuid.str();
    smi_clear_char_and_reinitialize(info->hip_uuid, AMDSMI_MAX_STRING_LENGTH, hip_uuid_str);

    ss << "; device_uuid (dec): " << device_uuid << "\n"
       << "; device_uuid (hex): 0x" << std::hex << std::setw(16) << std::setfill('0') << device_uuid << std::dec << "\n"
       << "; rsmi_dev_unique_id_get() status: "
       << smi_amdgpu_get_status_string(status, false) << "\n";
    LOG_INFO(ss);

    return AMDSMI_STATUS_SUCCESS;
}
// Fills in board information (model number, serial, FRU id, manufacturer and
// product name) for a GPU.
//
// The fields are first populated by smi_amdgpu_get_board_info(). Any of
// product_serial, product_name or manufacturer_name that is still empty is
// then filled from ROCm SMI. A field whose fallback fails is zeroed, as is a
// product name that ROCm SMI reports as a hex value ("0x...").
//
// All lookups are best effort: once the GPU has been resolved the function
// always reports success, and fields that could not be obtained stay empty.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if board_info is null, the failure status of
// get_gpu_device_from_handle() if the handle is not a usable GPU, and
// AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t
amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_handle, amdsmi_board_info_t *board_info) {
    AMDSMI_CHECK_INIT();

    if (board_info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amdsmi_status_t status;
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;

    // Primary source. Its status is intentionally not propagated: missing
    // fields are handled by the ROCm SMI corrections below.
    status = smi_amdgpu_get_board_info(gpu_device, board_info);

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << "[Before rocm smi correction] "
       << "Returning status = AMDSMI_STATUS_SUCCESS"
       << "\n; info->model_number: |" << board_info->model_number << "|"
       << "\n; info->product_serial: |" << board_info->product_serial << "|"
       << "\n; info->fru_id: |" << board_info->fru_id << "|"
       << "\n; info->manufacturer_name: |" << board_info->manufacturer_name << "|"
       << "\n; info->product_name: |" << board_info->product_name << "|";
    LOG_INFO(ss);

    // The corrections run exactly once, after the "before" snapshot above, so
    // that the snapshot really shows the uncorrected values.
    if (board_info->product_serial[0] == '\0') {
        status = rsmi_wrapper(rsmi_dev_serial_number_get, processor_handle, 0,
                              board_info->product_serial, AMDSMI_MAX_STRING_LENGTH);
        if (status != AMDSMI_STATUS_SUCCESS) {
            memset(board_info->product_serial, 0,
                   AMDSMI_MAX_STRING_LENGTH * sizeof(board_info->product_serial[0]));
        }
        ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->product_serial= |"
           << board_info->product_serial << "|";
        LOG_INFO(ss);
    }

    if (board_info->product_name[0] == '\0') {
        status = rsmi_wrapper(rsmi_dev_name_get, processor_handle, 0,
                              board_info->product_name,
                              AMDSMI_MAX_STRING_LENGTH);
        // Discard the name when the lookup failed or when it came back in
        // hex format ("0x...").
        const bool name_is_hex = (status == AMDSMI_STATUS_SUCCESS)
                                 && board_info->product_name[0] == '0'
                                 && board_info->product_name[1] == 'x';
        if (status != AMDSMI_STATUS_SUCCESS || name_is_hex) {
            memset(board_info->product_name, 0,
                   AMDSMI_MAX_STRING_LENGTH * sizeof(board_info->product_name[0]));
        }
        ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->product_name= |"
           << board_info->product_name << "|";
        LOG_INFO(ss);
    }

    if (board_info->manufacturer_name[0] == '\0') {
        status = rsmi_wrapper(rsmi_dev_vendor_name_get, processor_handle, 0,
                              board_info->manufacturer_name,
                              AMDSMI_MAX_STRING_LENGTH);
        if (status != AMDSMI_STATUS_SUCCESS) {
            memset(board_info->manufacturer_name, 0,
                   AMDSMI_MAX_STRING_LENGTH * sizeof(board_info->manufacturer_name[0]));
        }
        ss << __PRETTY_FUNCTION__ << " | [rsmi_correction] board_info->manufacturer_name= |"
           << board_info->manufacturer_name << "|";
        LOG_INFO(ss);
    }

    ss << __PRETTY_FUNCTION__ << " | [After rocm smi correction] "
       << "Returning status = AMDSMI_STATUS_SUCCESS"
       << "\n; info->model_number: |" << board_info->model_number << "|"
       << "\n; info->product_serial: |" << board_info->product_serial << "|"
       << "\n; info->fru_id: |" << board_info->fru_id << "|"
       << "\n; info->manufacturer_name: |" << board_info->manufacturer_name << "|"
       << "\n; info->product_name: |" << board_info->product_name << "|";
    LOG_INFO(ss);

    return AMDSMI_STATUS_SUCCESS;
}
// Reports the cache hierarchy of a GPU.
//
// The data comes from rsmi_dev_cache_info_get; each entry's sysfs flag bits
// are translated into AMDSMI_CACHE_PROPERTY_* bits, the remaining fields are
// copied as-is (cache_size receives the value of cache_size_kb).
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if info is null, the failure status of the device lookup
// or of the ROCm SMI query, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_gpu_cache_info(
      amdsmi_processor_handle processor_handle, amdsmi_gpu_cache_info_t *info) {
    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiGPUDevice* gpu = nullptr;
    amdsmi_status_t status = get_gpu_device_from_handle(
                        processor_handle, &gpu);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    rsmi_gpu_cache_info_t rsmi_info;
    status = rsmi_wrapper(rsmi_dev_cache_info_get, processor_handle, 0,
                          &rsmi_info);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    // Sysfs cache type
    #define HSA_CACHE_TYPE_DATA     0x00000001
    #define HSA_CACHE_TYPE_INSTRUCTION  0x00000002
    #define HSA_CACHE_TYPE_CPU      0x00000004
    #define HSA_CACHE_TYPE_HSACU    0x00000008

    info->num_cache_types = rsmi_info.num_cache_types;
    for (unsigned int idx = 0; idx < rsmi_info.num_cache_types; ++idx) {
        const auto& source = rsmi_info.cache[idx];
        auto& target = info->cache[idx];

        // Translate the sysfs type bits into CRAT (HSA Cache Affinity) bits.
        target.cache_properties = 0;
        if (source.flags & HSA_CACHE_TYPE_DATA) {
            target.cache_properties |= AMDSMI_CACHE_PROPERTY_DATA_CACHE;
        }
        if (source.flags & HSA_CACHE_TYPE_INSTRUCTION) {
            target.cache_properties |= AMDSMI_CACHE_PROPERTY_INST_CACHE;
        }
        if (source.flags & HSA_CACHE_TYPE_CPU) {
            target.cache_properties |= AMDSMI_CACHE_PROPERTY_CPU_CACHE;
        }
        if (source.flags & HSA_CACHE_TYPE_HSACU) {
            target.cache_properties |= AMDSMI_CACHE_PROPERTY_SIMD_CACHE;
        }

        target.cache_size = source.cache_size_kb;
        target.cache_level = source.cache_level;
        target.max_num_cu_shared = source.max_num_cu_shared;
        target.num_cache_instance = source.num_cache_instance;
    }

    return AMDSMI_STATUS_SUCCESS;
}
// Reads a temperature metric for one sensor of a GPU.
//
// Params:
//   processor_handle - GPU to query
//   sensor_type      - which sensor to read
//   metric           - which metric of that sensor to read
//   temperature      - out: the value; for non-PLX sensors the ROCm SMI
//                      reading is divided by 1000 before being returned
//
// For AMDSMI_TEMPERATURE_TYPE_PLX the value is taken from the GPU metrics
// table and `metric` is not consulted.
// NOTE(review): the PLX path returns metric_info.temperature_vrsoc — confirm
// this is the intended gpu_metrics field for the PLX sensor.
//
// Returns AMDSMI_STATUS_NOT_INIT if the library is not initialized,
// AMDSMI_STATUS_INVAL if temperature is null, otherwise the status of the
// underlying query. On failure *temperature is not rescaled.
amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle,
                    amdsmi_temperature_type_t sensor_type,
                    amdsmi_temperature_metric_t metric, int64_t *temperature) {
    AMDSMI_CHECK_INIT();

    if (temperature == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Get the PLX temperature from the gpu_metrics
    if (sensor_type == AMDSMI_TEMPERATURE_TYPE_PLX) {
        amdsmi_gpu_metrics_t metric_info;
        auto r_status = amdsmi_get_gpu_metrics_info(
                processor_handle, &metric_info);
        if (r_status != AMDSMI_STATUS_SUCCESS)
            return r_status;
        *temperature = metric_info.temperature_vrsoc;
        return r_status;
    }

    amdsmi_status_t amdsmi_status = rsmi_wrapper(rsmi_dev_temp_metric_get, processor_handle, 0,
            static_cast<uint32_t>(sensor_type),
            static_cast<rsmi_temperature_metric_t>(metric), temperature);

    // Rescale only a value that was actually produced; on failure the
    // caller's variable may never have been written by the query.
    if (amdsmi_status == AMDSMI_STATUS_SUCCESS) {
        *temperature /= 1000;
    }
    return amdsmi_status;
}
// Retrieves NPM information for a node.
//
// Params:
//   node_handle - handle that points at a non-empty std::string board path
//                 (as produced by amdsmi_get_node_handle)
//   npm_info    - out: receives a byte-wise copy of the ROCm SMI NPM info
//
// Returns:
//   AMDSMI_STATUS_NOT_INIT        - library not initialized
//   AMDSMI_STATUS_INVAL           - null argument or empty board path
//   (translated rsmi status)      - rsmi_dev_npm_info_get failed
//   AMDSMI_STATUS_UNEXPECTED_SIZE - amdsmi_npm_info_t and rsmi_npm_info_t
//                                   differ in size, so a raw copy is unsafe
//   AMDSMI_STATUS_SUCCESS         - *npm_info has been filled
amdsmi_status_t amdsmi_get_npm_info(amdsmi_node_handle node_handle,
                                    amdsmi_npm_info_t *npm_info) {
    AMDSMI_CHECK_INIT();

    if (node_handle == nullptr || npm_info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // The handle is the address of the registered board path string; an empty
    // path means the handle is not usable.
    const std::string* board_path = reinterpret_cast<std::string*>(node_handle);
    if (board_path->empty()) {
        return AMDSMI_STATUS_INVAL;
    }

    rsmi_npm_info_t raw_info;
    const rsmi_status_t rsmi_result =
        rsmi_dev_npm_info_get(0, reinterpret_cast<uintptr_t>(node_handle), &raw_info);
    const amdsmi_status_t translated = amd::smi::rsmi_to_amdsmi_status(rsmi_result);
    if (translated != AMDSMI_STATUS_SUCCESS) {
        return translated;
    }

    // The structures are copied byte for byte, so they must be the same size.
    const bool same_size = sizeof(amdsmi_npm_info_t) == sizeof(rsmi_npm_info_t);
    if (!same_size) {
        return AMDSMI_STATUS_UNEXPECTED_SIZE;
    }

    std::memcpy(npm_info, &raw_info, sizeof(amdsmi_npm_info_t));
    return AMDSMI_STATUS_SUCCESS;
}
/**
 * @brief Report total and used VRAM (in MB) of a GPU.
 *
 * The total comes from rsmi_dev_memory_total_get(RSMI_MEM_TYPE_VRAM); the used
 * amount is queried from the amdgpu driver with an AMDGPU_INFO_VRAM_USAGE
 * request issued through libdrm's drmCommandWrite on the device's
 * /dev/dri/<gpu path> node. Both values are divided by 1024*1024 before being
 * stored.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[out] vram_info         Receives vram_total and vram_used. Must not be
 *                               nullptr. When the total cannot be read but the
 *                               usage query succeeds, vram_total is set to the
 *                               maximum uint32_t value ("undefined") instead of
 *                               being left unwritten.
 *
 * @return AMDSMI_STATUS_INVAL (null output), AMDSMI_STATUS_NOT_SUPPORTED (not
 *         an AMD GPU or no device path), AMDSMI_STATUS_FILE_ERROR (device node
 *         cannot be opened), the loader status (libdrm or symbol unavailable),
 *         AMDSMI_STATUS_DRM_ERROR (driver request failed) or
 *         AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle,
                                          amdsmi_vram_usage_t *vram_info) {
    AMDSMI_CHECK_INIT();

    if (vram_info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiProcessor* device = nullptr;
    amdsmi_status_t ret = amd::smi::AMDSmiSystem::getInstance()
                    .handle_to_processor(processor_handle, &device);
    if (ret != AMDSMI_STATUS_SUCCESS) {
        return ret;
    }

    if (device->get_processor_type() != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (r != AMDSMI_STATUS_SUCCESS) {
        return r;
    }

    std::ostringstream ss;
    SMIGPUDEVICE_MUTEX(gpu_device->get_mutex());

    const std::string render_name = gpu_device->get_gpu_path();
    if (render_name.empty()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    const std::string path = "/dev/dri/" + render_name;
    ScopedFD drm_fd(path.c_str(), O_RDWR | O_CLOEXEC);
    if (!drm_fd.valid()) {
        // Capture errno before any other call has a chance to overwrite it.
        const int open_errno = errno;
        ss << __PRETTY_FUNCTION__
           << " | Failed to open " << path << ": " << strerror(open_errno)
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_FILE_ERROR;
    }

    amd::smi::AMDSmiLibraryLoader libdrm;
    amdsmi_status_t status = libdrm.load(LIBDRM_AMDGPU_SONAME);
    if (status != AMDSMI_STATUS_SUCCESS) {
        // Read errno before unload() so the logged text refers to the load.
        const int load_errno = errno;
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load " LIBDRM_AMDGPU_SONAME ": " << strerror(load_errno)
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    ss << __PRETTY_FUNCTION__
       << " | about to load drmCommandWrite symbol";
    LOG_INFO(ss);

    // extern int drmCommandWrite(int fd, unsigned long drmCommandIndex,
    //                            void *data, unsigned long size);
    using drmCommandWrite_t = int (*)(int fd, unsigned long drmCommandIndex,
                                      void *data, unsigned long size);
    drmCommandWrite_t drmCommandWrite = nullptr;

    // load symbol from libdrm
    status = libdrm.load_symbol(&drmCommandWrite, "drmCommandWrite");
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load drmCommandWrite symbol"
           << " | Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    ss << __PRETTY_FUNCTION__
       << " | drmCommandWrite symbol loaded successfully";
    LOG_INFO(ss);

    // A failure to read the total is not fatal, but the field must still hold a
    // defined value: it is logged below and handed back with SUCCESS.
    uint64_t total = 0;
    r = rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, 0,
                     RSMI_MEM_TYPE_VRAM, &total);
    if (r == AMDSMI_STATUS_SUCCESS) {
        vram_info->vram_total = static_cast<uint32_t>(total / (1024 * 1024));
    } else {
        vram_info->vram_total = std::numeric_limits<uint32_t>::max();
    }

    uint64_t vram_used = 0;
    struct drm_amdgpu_info request = {};
    // Kept in addition to the initializer so that every byte of the request
    // handed to the driver is zero.
    memset(&request, 0, sizeof(request));
    request.return_pointer = reinterpret_cast<unsigned long long>(&vram_used);
    request.return_size = sizeof(vram_used);
    request.query = AMDGPU_INFO_VRAM_USAGE;

    auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
                                     sizeof(struct drm_amdgpu_info));
    if (drm_write != 0) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Issue - drm_write failed, drm_write (AMDGPU_INFO_VRAM_USAGE): "
           << std::dec << drm_write << "\n"
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_DRM_ERROR;
    }

    vram_info->vram_used = static_cast<uint32_t>(vram_used / (1024 * 1024));
    libdrm.unload();

    ss << __PRETTY_FUNCTION__
       << " | vram_info->vram_total (MB): " << std::dec << vram_info->vram_total << "\n"
       << " | vram_info->vram_used (MB): " << std::dec << vram_info->vram_used << "\n"
       << " | Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, false);
    LOG_INFO(ss);
    return AMDSMI_STATUS_SUCCESS;
}
/**
 * @brief Block the calling thread for (at least) the requested time and log
 *        how long the wait really took.
 *
 * @param milli_seconds  Requested wait in milliseconds. Negative values are
 *                       treated as 0; values whose microsecond equivalent would
 *                       not fit in an int are clamped to the largest value that
 *                       does.
 */
static void system_wait(int milli_seconds) {
    std::ostringstream ss;

    // 1 ms = 1000 us. Clamp first: a negative int handed to usleep() converts
    // to a huge unsigned duration, and a very large one overflows the multiply.
    constexpr int kMaxWaitMs = std::numeric_limits<int>::max() / 1000;
    int wait_ms = milli_seconds;
    if (wait_ms < 0) {
        wait_ms = 0;
    } else if (wait_ms > kMaxWaitMs) {
        wait_ms = kMaxWaitMs;
    }
    const int wait_us = wait_ms * 1000;

    const auto start = std::chrono::high_resolution_clock::now();

    ss << __PRETTY_FUNCTION__ << " | "
       << "** Waiting for " << std::dec << wait_us
       << " us (" << wait_ms << " milli-seconds) **";
    LOG_DEBUG(ss);

    usleep(wait_us);

    const auto stop = std::chrono::high_resolution_clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
    ss << __PRETTY_FUNCTION__ << " | "
       << "** Waiting took " << duration.count() / 1000
       << " milli-seconds **";
    LOG_DEBUG(ss);
}
/**
 * @brief Residency percentage between two samples of one accumulator.
 *
 * Helper of amdsmi_get_violation_status().
 *
 * @param[in]  acc_first      Accumulator value of the first sample.
 * @param[in]  acc_second     Accumulator value of the second sample.
 * @param[in]  counter_delta  Increase of the accumulation counter between the
 *                            two samples; 0 means "no usable delta".
 * @param[out] percent        Receives ((acc_second - acc_first) * 100) /
 *                            counter_delta when the samples are usable.
 *
 * @return true when @p percent was written; false when either sample is the
 *         "undefined" value (max uint64_t), the accumulator went backwards,
 *         @p counter_delta is 0 or @p percent is nullptr.
 */
static bool smi_violation_percent(uint64_t acc_first, uint64_t acc_second,
                                  uint64_t counter_delta, uint64_t *percent) {
    constexpr uint64_t kUndefined = std::numeric_limits<uint64_t>::max();
    if (percent == nullptr
        || acc_first == kUndefined
        || acc_second == kUndefined
        || counter_delta == 0
        || acc_second < acc_first) {
        return false;
    }
    *percent = ((acc_second - acc_first) * 100) / counter_delta;
    return true;
}

/**
 * @brief Sample the throttle-violation accumulators twice and report the
 *        accumulated values, the percentage of violation between the two
 *        samples and whether each violation is currently active.
 *
 * Every output field is first set to its "undefined" value (max of its type).
 * gpu_metrics is then read, the function waits kFASTEST_POLL_TIME_MS and reads
 * gpu_metrics again. Accumulators are copied from the second sample; a
 * percentage/active pair is only filled in when both samples of that
 * accumulator are defined, the accumulator did not go backwards and the
 * accumulation counter advanced.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[out] violation_status  Receives the result. Must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL for a null output, AMDSMI_STATUS_NOT_SUPPORTED
 *         for a non-GPU processor or when all checked accumulators of the
 *         first sample are undefined, the failing status of a gpu_metrics
 *         read, or AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_handle,
                                            amdsmi_violation_status_t *violation_status) {
    AMDSMI_CHECK_INIT();

    std::ostringstream ss;
    if (violation_status == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Name of this function for log prefixes, also usable inside lambdas.
    const char *const kFuncName = __PRETTY_FUNCTION__;

    // 1 sec = 1000 ms = 1000000 us
    // 0.1 sec = 100 ms = 100000 us
    constexpr uint64_t kFASTEST_POLL_TIME_MS = 100;  // fastest SMU FW sample time is 100 ms
    constexpr uint64_t kUndefined64 = std::numeric_limits<uint64_t>::max();
    constexpr uint8_t kUndefined8 = std::numeric_limits<uint8_t>::max();

    // ---- Default every output to "undefined" ----
    violation_status->reference_timestamp = kUndefined64;
    violation_status->violation_timestamp = kUndefined64;

    violation_status->acc_counter = kUndefined64;
    violation_status->acc_prochot_thrm = kUndefined64;
    violation_status->acc_ppt_pwr = kUndefined64;
    violation_status->acc_socket_thrm = kUndefined64;
    violation_status->acc_vr_thrm = kUndefined64;
    violation_status->acc_hbm_thrm = kUndefined64;
    violation_status->acc_gfx_clk_below_host_limit = kUndefined64;

    violation_status->per_prochot_thrm = kUndefined64;
    violation_status->per_ppt_pwr = kUndefined64;
    violation_status->per_socket_thrm = kUndefined64;
    violation_status->per_vr_thrm = kUndefined64;
    violation_status->per_hbm_thrm = kUndefined64;
    violation_status->per_gfx_clk_below_host_limit = kUndefined64;

    violation_status->active_prochot_thrm = kUndefined8;
    violation_status->active_ppt_pwr = kUndefined8;
    violation_status->active_socket_thrm = kUndefined8;
    violation_status->active_vr_thrm = kUndefined8;
    violation_status->active_hbm_thrm = kUndefined8;
    violation_status->active_gfx_clk_below_host_limit = kUndefined8;

    fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_pwr,
                  std::numeric_limits<uint64_t>::max());
    fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_thm,
                  std::numeric_limits<uint64_t>::max());
    fill_2d_array(violation_status->acc_low_utilization,
                  std::numeric_limits<uint64_t>::max());
    fill_2d_array(violation_status->acc_gfx_clk_below_host_limit_total,
                  std::numeric_limits<uint64_t>::max());

    fill_2d_array(violation_status->per_gfx_clk_below_host_limit_pwr,
                  std::numeric_limits<uint64_t>::max());
    fill_2d_array(violation_status->per_gfx_clk_below_host_limit_thm,
                  std::numeric_limits<uint64_t>::max());
    fill_2d_array(violation_status->per_low_utilization,
                  std::numeric_limits<uint64_t>::max());
    fill_2d_array(violation_status->per_gfx_clk_below_host_limit_total,
                  std::numeric_limits<uint64_t>::max());

    fill_2d_array(violation_status->active_gfx_clk_below_host_limit_pwr,
                  std::numeric_limits<uint8_t>::max());
    fill_2d_array(violation_status->active_gfx_clk_below_host_limit_thm,
                  std::numeric_limits<uint8_t>::max());
    fill_2d_array(violation_status->active_low_utilization,
                  std::numeric_limits<uint8_t>::max());
    fill_2d_array(violation_status->active_gfx_clk_below_host_limit_total,
                  std::numeric_limits<uint8_t>::max());

    // Host-side reference time: microseconds since the system_clock epoch.
    const auto p1 = std::chrono::system_clock::now();
    violation_status->reference_timestamp = static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::microseconds>(
            p1.time_since_epoch()).count());

    // ---- Validate the handle ----
    amd::smi::AMDSmiProcessor* device = nullptr;
    amdsmi_status_t ret = amd::smi::AMDSmiSystem::getInstance()
                    .handle_to_processor(processor_handle, &device);
    if (ret != AMDSMI_STATUS_SUCCESS) {
        return ret;
    }

    if (device->get_processor_type() != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (r != AMDSMI_STATUS_SUCCESS) {
        return r;
    }

    // Note: Both XCP and partition_id default to element 0.
    const uint32_t kFIRST_ELEMENT = 0;

    // Do not return early if the partition id cannot be read; continue to try
    // getting all info. The id is used to index xcp_stats[], so anything that
    // is not a valid index (including a failed lookup) falls back to the first
    // element instead of reading out of bounds.
    auto tmp_partition_id = uint32_t(0);
    amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0,
                                          &(tmp_partition_id));
    uint32_t xcp_index = kFIRST_ELEMENT;
    if (status == AMDSMI_STATUS_SUCCESS
        && tmp_partition_id < static_cast<uint32_t>(AMDSMI_MAX_NUM_XCP)) {
        xcp_index = tmp_partition_id;
    }

    // ---- First sample ----
    amdsmi_gpu_metrics_t metric_info_a = {};
    status = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info_a);
    if (status != AMDSMI_STATUS_SUCCESS) {
        ss << kFuncName << " | amdsmi_get_gpu_metrics_info failed with status = "
           << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    // Check if violation status is supported:
    // If all of these values are "undefined" then the feature is not supported on the ASIC
    const auto &first_xcp_a = metric_info_a.xcp_stats[kFIRST_ELEMENT];
    const bool nothing_supported =
        metric_info_a.accumulation_counter == kUndefined64
        && metric_info_a.prochot_residency_acc == kUndefined64
        && metric_info_a.ppt_residency_acc == kUndefined64
        && metric_info_a.socket_thm_residency_acc == kUndefined64
        && metric_info_a.vr_thm_residency_acc == kUndefined64
        && metric_info_a.hbm_thm_residency_acc == kUndefined64
        && first_xcp_a.gfx_below_host_limit_acc[kFIRST_ELEMENT] == kUndefined64
        && first_xcp_a.gfx_below_host_limit_ppt_acc[kFIRST_ELEMENT] == kUndefined64
        && first_xcp_a.gfx_below_host_limit_thm_acc[kFIRST_ELEMENT] == kUndefined64
        && first_xcp_a.gfx_low_utilization_acc[kFIRST_ELEMENT] == kUndefined64
        && first_xcp_a.gfx_below_host_limit_total_acc[kFIRST_ELEMENT] == kUndefined64;
    if (nothing_supported) {
        ss << kFuncName
           << " | ASIC does not support throttle violations!, "
           << "returning AMDSMI_STATUS_NOT_SUPPORTED";
        LOG_INFO(ss);
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // wait 100ms before reading again
    system_wait(static_cast<int>(kFASTEST_POLL_TIME_MS));

    // ---- Second sample ----
    amdsmi_gpu_metrics_t metric_info_b = {};
    status = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info_b);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    // Insert current accumulator counters into struct
    violation_status->violation_timestamp = metric_info_b.firmware_timestamp;
    violation_status->acc_counter = metric_info_b.accumulation_counter;
    violation_status->acc_prochot_thrm = metric_info_b.prochot_residency_acc;
    violation_status->acc_ppt_pwr = metric_info_b.ppt_residency_acc;
    violation_status->acc_socket_thrm = metric_info_b.socket_thm_residency_acc;
    violation_status->acc_vr_thrm = metric_info_b.vr_thm_residency_acc;
    violation_status->acc_hbm_thrm = metric_info_b.hbm_thm_residency_acc;
    violation_status->acc_gfx_clk_below_host_limit  // deprecated
        = metric_info_b.xcp_stats[xcp_index].gfx_below_host_limit_acc[kFIRST_ELEMENT];

    // Copy XCP accumulators into 2D array
    auto copy_xcp_metric = [](const auto& src, auto& dst, auto member_ptr) {
        for (size_t i = 0; i < AMDSMI_MAX_NUM_XCP; ++i) {
            std::copy(std::begin(src[i].*member_ptr),
                      std::end(src[i].*member_ptr),
                      dst[i]);
        }
    };
    copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_gfx_clk_below_host_limit_pwr,
                    &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_ppt_acc);
    copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_gfx_clk_below_host_limit_thm,
                    &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_thm_acc);
    copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_low_utilization,
                    &amdsmi_gpu_xcp_metrics_t::gfx_low_utilization_acc);
    copy_xcp_metric(metric_info_b.xcp_stats, violation_status->acc_gfx_clk_below_host_limit_total,
                    &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_total_acc);

    // Dump both samples to the debug log (one record).
    auto dump_sample = [&](const char *lead, const char *tag, const char *var,
                           const amdsmi_gpu_metrics_t &sample) {
        ss << lead << "[gpu_metrics " << tag << "] " << var << ".accumulation_counter: "
           << std::dec << sample.accumulation_counter << "\n"
           << "; " << var << ".prochot_residency_acc: " << std::dec
           << sample.prochot_residency_acc << "\n"
           << "; " << var << ".ppt_residency_acc (pviol): " << std::dec
           << sample.ppt_residency_acc << "\n"
           << "; " << var << ".socket_thm_residency_acc (tviol): " << std::dec
           << sample.socket_thm_residency_acc << "\n"
           << "; " << var << ".vr_thm_residency_acc: " << std::dec
           << sample.vr_thm_residency_acc << "\n"
           << "; " << var << ".hbm_thm_residency_acc: " << std::dec
           << sample.hbm_thm_residency_acc << "\n"
           << "; " << var << ".xcp_stats[" << xcp_index << "].gfx_below_host_limit_acc["
           << kFIRST_ELEMENT << "]: " << std::dec  // deprecated
           << sample.xcp_stats[xcp_index].gfx_below_host_limit_acc[kFIRST_ELEMENT] << "\n";
    };
    ss << kFuncName << " | ";
    dump_sample("", "A", "metric_info_a", metric_info_a);
    dump_sample(" ", "B", "metric_info_b", metric_info_b);
    LOG_DEBUG(ss);

    // Ticks of the accumulation counter between the samples. The counters are
    // unsigned, so a plain subtraction would wrap to a huge value if the
    // counter is undefined or went backwards; treat those cases as "no delta".
    const uint64_t counter_a = static_cast<uint64_t>(metric_info_a.accumulation_counter);
    const uint64_t counter_b = static_cast<uint64_t>(metric_info_b.accumulation_counter);
    uint64_t counter_delta = 0;
    if (counter_a != kUndefined64 && counter_b != kUndefined64 && counter_b > counter_a) {
        counter_delta = counter_b - counter_a;
    }

    // Computes the percentage of one device-level accumulator and logs it.
    // Returns false (leaving the outputs undefined) when the samples are not
    // usable.
    auto scalar_percent = [&](const char *acc_name, const char *per_name,
                              const char *active_name, uint64_t acc_a, uint64_t acc_b,
                              uint64_t *percent) -> bool {
        if (!smi_violation_percent(acc_a, acc_b, counter_delta, percent)) {
            return false;
        }
        ss << kFuncName << " | "
           << "ENTERED " << acc_name << " | " << per_name << ": " << std::dec
           << *percent
           << "%; " << active_name << " = " << std::dec
           << ((*percent > 0) ? 1 : 0) << "\n";
        LOG_DEBUG(ss);
        return true;
    };

    uint64_t percent = 0;

    if (scalar_percent("prochot_residency_acc", "per_prochot_thrm", "active_prochot_thrm",
                       metric_info_a.prochot_residency_acc,
                       metric_info_b.prochot_residency_acc, &percent)) {
        violation_status->per_prochot_thrm = percent;
        violation_status->active_prochot_thrm = (percent > 0) ? 1 : 0;
    }

    if (scalar_percent("ppt_residency_acc", "per_ppt_pwr", "active_ppt_pwr",
                       metric_info_a.ppt_residency_acc,
                       metric_info_b.ppt_residency_acc, &percent)) {
        violation_status->per_ppt_pwr = percent;
        violation_status->active_ppt_pwr = (percent > 0) ? 1 : 0;
    }

    if (scalar_percent("socket_thm_residency_acc", "per_socket_thrm", "active_socket_thrm",
                       metric_info_a.socket_thm_residency_acc,
                       metric_info_b.socket_thm_residency_acc, &percent)) {
        violation_status->per_socket_thrm = percent;
        violation_status->active_socket_thrm = (percent > 0) ? 1 : 0;
    }

    if (scalar_percent("vr_thm_residency_acc", "per_vr_thrm", "active_vr_thrm",
                       metric_info_a.vr_thm_residency_acc,
                       metric_info_b.vr_thm_residency_acc, &percent)) {
        violation_status->per_vr_thrm = percent;
        violation_status->active_vr_thrm = (percent > 0) ? 1 : 0;
    }

    if (scalar_percent("hbm_thm_residency_acc", "per_hbm_thrm", "active_hbm_thrm",
                       metric_info_a.hbm_thm_residency_acc,
                       metric_info_b.hbm_thm_residency_acc, &percent)) {
        violation_status->per_hbm_thrm = percent;
        violation_status->active_hbm_thrm = (percent > 0) ? 1 : 0;
    }

    // deprecated - design likely needs to include both [XCP][XCC], like the new metrics
    if (scalar_percent("gfx_below_host_limit_acc", "per_gfx_clk_below_host_limit",
                       "active_gfx_clk_below_host_limit",
                       metric_info_a.xcp_stats[xcp_index].gfx_below_host_limit_acc[kFIRST_ELEMENT],
                       metric_info_b.xcp_stats[xcp_index].gfx_below_host_limit_acc[kFIRST_ELEMENT],
                       &percent)) {
        violation_status->per_gfx_clk_below_host_limit = percent;
        violation_status->active_gfx_clk_below_host_limit = (percent > 0) ? 1 : 0;
    }

    // ---- Per-XCP / per-XCC violation metrics ----
    // One table row per accumulator: where to read it in gpu_metrics and which
    // percentage/active arrays of the output it feeds.
    using MetricArrayType = uint64_t[AMDSMI_MAX_NUM_XCC];
    using MetricMemberPtr = MetricArrayType amdsmi_gpu_xcp_metrics_t::*;
    struct XcpViolationMetric {
        const char *name;
        MetricMemberPtr accumulator;
        uint64_t (*per)[AMDSMI_MAX_NUM_XCC];
        uint8_t (*active)[AMDSMI_MAX_NUM_XCC];
    };
    const XcpViolationMetric xcp_metrics[] = {
        {"gfx_below_host_limit_ppt_acc",
         &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_ppt_acc,
         violation_status->per_gfx_clk_below_host_limit_pwr,
         violation_status->active_gfx_clk_below_host_limit_pwr},
        {"gfx_below_host_limit_thm_acc",
         &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_thm_acc,
         violation_status->per_gfx_clk_below_host_limit_thm,
         violation_status->active_gfx_clk_below_host_limit_thm},
        {"gfx_low_utilization_acc",
         &amdsmi_gpu_xcp_metrics_t::gfx_low_utilization_acc,
         violation_status->per_low_utilization,
         violation_status->active_low_utilization},
        {"gfx_below_host_limit_total_acc",
         &amdsmi_gpu_xcp_metrics_t::gfx_below_host_limit_total_acc,
         violation_status->per_gfx_clk_below_host_limit_total,
         violation_status->active_gfx_clk_below_host_limit_total},
    };

    ss << kFuncName << " | Processing all XCP metrics with counter_delta: "
       << std::dec << counter_delta << "\n";
    LOG_DEBUG(ss);

    for (const auto &xcp_metric : xcp_metrics) {
        ss << " [Metric] " << xcp_metric.name << "\n";
        for (uint32_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) {
            const MetricArrayType& arr_a = metric_info_a.xcp_stats[xcp].*(xcp_metric.accumulator);
            const MetricArrayType& arr_b = metric_info_b.xcp_stats[xcp].*(xcp_metric.accumulator);
            ss << " xcp: " << xcp << " (";
            for (uint32_t xcc = 0; xcc < AMDSMI_MAX_NUM_XCC; ++xcc) {
                const uint64_t val_a = arr_a[xcc];
                const uint64_t val_b = arr_b[xcc];
                uint64_t xcc_percent = 0;
                if (!smi_violation_percent(val_a, val_b, counter_delta, &xcc_percent)) {
                    xcp_metric.per[xcp][xcc] = std::numeric_limits<uint64_t>::max();
                    xcp_metric.active[xcp][xcc] = std::numeric_limits<uint8_t>::max();
                    ss << "[Invalid] (" << std::dec << xcp_metric.per[xcp][xcc]
                       << ", " << static_cast<int>(xcp_metric.active[xcp][xcc]) << ") ";
                    continue;
                }
                xcp_metric.per[xcp][xcc] = xcc_percent;
                xcp_metric.active[xcp][xcc] = (xcc_percent > 0) ? 1 : 0;
                ss << "[Valid] (" << std::dec << xcc_percent << "%, "
                   << std::boolalpha << static_cast<bool>(xcp_metric.active[xcp][xcc])
                   << ") | val_b: " << std::dec << val_b
                   << ", val_a: " << std::dec << val_a
                   << ", counter_delta: " << std::dec << counter_delta << " ";
            }
            ss << ")\n";
        }
    }
    LOG_DEBUG(ss);

    ss << kFuncName << " | "
       << "RETURNING AMDSMI_STATUS_SUCCESS | "
       << "violation_status->reference_timestamp (time since epoch): " << std::dec
       << violation_status->reference_timestamp
       << "; violation_status->violation_timestamp (ms): " << std::dec
       << violation_status->violation_timestamp
       << "; violation_status->per_prochot_thrm (%): " << std::dec
       << violation_status->per_prochot_thrm
       << "; violation_status->per_ppt_pwr (%): " << std::dec
       << violation_status->per_ppt_pwr
       << "; violation_status->per_socket_thrm (%): " << std::dec
       << violation_status->per_socket_thrm
       << "; violation_status->per_vr_thrm (%): " << std::dec
       << violation_status->per_vr_thrm
       << "; violation_status->per_hbm_thrm (%): " << std::dec
       << violation_status->per_hbm_thrm
       << "; violation_status->per_gfx_clk_below_host_limit (%): " << std::dec  // deprecated
       << violation_status->per_gfx_clk_below_host_limit
       << "; violation_status->active_prochot_thrm (bool): "
       << static_cast<int>(violation_status->active_prochot_thrm)
       << "; violation_status->active_ppt_pwr (bool): "
       << static_cast<int>(violation_status->active_ppt_pwr)
       << "; violation_status->active_socket_thrm (bool): "
       << static_cast<int>(violation_status->active_socket_thrm)
       << "; violation_status->active_vr_thrm (bool): "
       << static_cast<int>(violation_status->active_vr_thrm)
       << "; violation_status->active_hbm_thrm (bool): "
       << static_cast<int>(violation_status->active_hbm_thrm)
       << "; violation_status->active_gfx_clk_below_host_limit (bool): "  // deprecated
       << static_cast<int>(violation_status->active_gfx_clk_below_host_limit)
       << "\n";
    LOG_INFO(ss);

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * @brief Read the fan speed in RPM by forwarding to rsmi_dev_fan_rpms_get.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[in]  sensor_ind        Index of the fan sensor.
 * @param[out] speed             Receives the value reported by rocm-smi.
 *
 * @return Status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_fan_rpms(amdsmi_processor_handle processor_handle,
                                        uint32_t sensor_ind, int64_t *speed) {
    const amdsmi_status_t outcome =
        rsmi_wrapper(rsmi_dev_fan_rpms_get, processor_handle, 0, sensor_ind, speed);
    return outcome;
}
/**
 * @brief Read the current fan speed by forwarding to rsmi_dev_fan_speed_get.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[in]  sensor_ind        Index of the fan sensor.
 * @param[out] speed             Receives the value reported by rocm-smi.
 *
 * @return Status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_fan_speed(amdsmi_processor_handle processor_handle,
                                         uint32_t sensor_ind, int64_t *speed) {
    const amdsmi_status_t outcome =
        rsmi_wrapper(rsmi_dev_fan_speed_get, processor_handle, 0, sensor_ind, speed);
    return outcome;
}
/**
 * @brief Read the maximum fan speed by forwarding to
 *        rsmi_dev_fan_speed_max_get.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[in]  sensor_ind        Index of the fan sensor.
 * @param[out] max_speed         Receives the value reported by rocm-smi.
 *
 * @return Status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_fan_speed_max(amdsmi_processor_handle processor_handle,
                                             uint32_t sensor_ind, uint64_t *max_speed) {
    const amdsmi_status_t outcome =
        rsmi_wrapper(rsmi_dev_fan_speed_max_get, processor_handle, 0, sensor_ind, max_speed);
    return outcome;
}
/**
 * @brief Reset a fan by forwarding to rsmi_dev_fan_reset.
 *
 * @param[in] processor_handle  GPU whose fan is reset.
 * @param[in] sensor_ind        Index of the fan sensor.
 *
 * @return Status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_reset_gpu_fan(amdsmi_processor_handle processor_handle,
                                     uint32_t sensor_ind) {
    const amdsmi_status_t outcome =
        rsmi_wrapper(rsmi_dev_fan_reset, processor_handle, 0, sensor_ind);
    return outcome;
}
/**
 * @brief Set the fan speed by forwarding to rsmi_dev_fan_speed_set.
 *
 * Bare Metal and passthrough only feature: when the virtualization mode can
 * be read and it is AMDSMI_VIRTUALIZATION_MODE_GUEST the request is refused.
 * If the mode cannot be read the request is forwarded anyway.
 *
 * @param[in] processor_handle  GPU whose fan is configured.
 * @param[in] sensor_ind        Index of the fan sensor.
 * @param[in] speed             Requested speed, passed through unchanged.
 *
 * @return AMDSMI_STATUS_NOT_SUPPORTED for a guest, otherwise the status
 *         produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_set_gpu_fan_speed(amdsmi_processor_handle processor_handle,
                                         uint32_t sensor_ind, uint64_t speed) {
    amdsmi_virtualization_mode_t virt_mode;
    const bool mode_known =
        amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode)
            == AMDSMI_STATUS_SUCCESS;

    // virt_mode is only inspected when the query above filled it in.
    if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    return rsmi_wrapper(rsmi_dev_fan_speed_set, processor_handle, 0, sensor_ind, speed);
}
/**
 * @brief Read the device id by forwarding to rsmi_dev_id_get.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[out] id                Receives the value reported by rocm-smi.
 *
 * @return Status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_id(amdsmi_processor_handle processor_handle,
                                  uint16_t *id) {
    const amdsmi_status_t outcome =
        rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0, id);
    return outcome;
}
/**
 * @brief Read the device revision by forwarding to rsmi_dev_revision_get.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[out] revision          Receives the value reported by rocm-smi.
 *
 * @return Status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_revision(amdsmi_processor_handle processor_handle,
                                        uint16_t *revision) {
    const amdsmi_status_t outcome =
        rsmi_wrapper(rsmi_dev_revision_get, processor_handle, 0, revision);
    return outcome;
}
// TODO(bliu) : add fw info from libdrm
/**
 * @brief Collect the versions of all firmware blocks that rocm-smi can report.
 *
 * @p info is zeroed, then every firmware block in the amdsmi -> rsmi mapping
 * is queried with rsmi_dev_firmware_version_get. Blocks whose query succeeds
 * are appended to info->fw_info_list (in ascending amdsmi_fw_block_t order)
 * and counted in info->num_fw_info; blocks that fail are skipped.
 *
 * @param[in]  processor_handle  GPU to query.
 * @param[out] info              Receives the firmware list. Must not be
 *                               nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when @p info is nullptr, otherwise
 *         AMDSMI_STATUS_SUCCESS (even if no block could be read).
 */
amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle,
                                   amdsmi_fw_info_t *info) {
    // The mapping never changes, so build it once instead of on every call.
    // A std::map is kept so that iteration stays ordered by amdsmi fw id.
    static const std::map<amdsmi_fw_block_t, rsmi_fw_block_t> fw_in_rsmi = {
        { AMDSMI_FW_ID_ASD, RSMI_FW_BLOCK_ASD},
        { AMDSMI_FW_ID_CP_CE, RSMI_FW_BLOCK_CE},
        { AMDSMI_FW_ID_DMCU, RSMI_FW_BLOCK_DMCU},
        { AMDSMI_FW_ID_MC, RSMI_FW_BLOCK_MC},
        { AMDSMI_FW_ID_CP_ME, RSMI_FW_BLOCK_ME},
        { AMDSMI_FW_ID_CP_MEC1, RSMI_FW_BLOCK_MEC},
        { AMDSMI_FW_ID_CP_MEC2, RSMI_FW_BLOCK_MEC2},
        { AMDSMI_FW_ID_CP_PFP, RSMI_FW_BLOCK_PFP},
        { AMDSMI_FW_ID_RLC, RSMI_FW_BLOCK_RLC},
        { AMDSMI_FW_ID_RLC_RESTORE_LIST_CNTL, RSMI_FW_BLOCK_RLC_SRLC},
        { AMDSMI_FW_ID_RLC_RESTORE_LIST_GPM_MEM, RSMI_FW_BLOCK_RLC_SRLG},
        { AMDSMI_FW_ID_RLC_RESTORE_LIST_SRM_MEM, RSMI_FW_BLOCK_RLC_SRLS},
        { AMDSMI_FW_ID_SDMA0, RSMI_FW_BLOCK_SDMA},
        { AMDSMI_FW_ID_SDMA1, RSMI_FW_BLOCK_SDMA2},
        { AMDSMI_FW_ID_PM, RSMI_FW_BLOCK_SMC},
        { AMDSMI_FW_ID_PSP_SOSDRV, RSMI_FW_BLOCK_SOS},
        { AMDSMI_FW_ID_TA_RAS, RSMI_FW_BLOCK_TA_RAS},
        { AMDSMI_FW_ID_TA_XGMI, RSMI_FW_BLOCK_TA_XGMI},
        { AMDSMI_FW_ID_UVD, RSMI_FW_BLOCK_UVD},
        { AMDSMI_FW_ID_VCE, RSMI_FW_BLOCK_VCE},
        { AMDSMI_FW_ID_VCN, RSMI_FW_BLOCK_VCN},
        { AMDSMI_FW_ID_PLDM_BUNDLE, RSMI_FW_BLOCK_PLDM_BUNDLE},
    };

    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }
    memset(info, 0, sizeof(amdsmi_fw_info_t));

    // collect all rsmi supported fw block
    for (const auto &fw_pair : fw_in_rsmi) {
        // The version is written straight into the next free slot; the slot is
        // only claimed (num_fw_info incremented) when the query succeeded.
        auto &slot = info->fw_info_list[info->num_fw_info];
        const auto status = rsmi_wrapper(rsmi_dev_firmware_version_get, processor_handle, 0,
                                         fw_pair.second,
                                         &(slot.fw_version));
        if (status == AMDSMI_STATUS_SUCCESS) {
            slot.fw_id = fw_pair.first;
            info->num_fw_info++;
        }
    }
    return AMDSMI_STATUS_SUCCESS;
}
// If similar caches are implemented in the future, make this generic and move it
//
// File-local state backing the ASIC info cache used by
// amdsmi_get_gpu_asic_info().
namespace {
// One cached amdsmi_asic_info_t result together with its bookkeeping.
struct AsicInfoCache {
// Last ASIC info stored for this entry (value-initialized until filled).
amdsmi_asic_info_t info{};
// steady_clock time associated with `info`; compared against "now" to
// compute the entry's age.
std::chrono::steady_clock::time_point last_read;
// False until the entry holds usable data; a cached result is only returned
// when this is true.
bool valid = false;
// Guards the members of this entry.
std::mutex mtx;
};
// Cache entries, keyed by the string returned from the device's
// get_gpu_path(). Entries are created on first lookup via operator[].
std::unordered_map<std::string, AsicInfoCache> g_asic_info_cache_map;
// Guards lookups/insertions in g_asic_info_cache_map (not the entries
// themselves, which carry their own mutex).
std::mutex g_asic_info_cache_map_mu;
// Maximum age for which a cached entry is served. A value that is not greater
// than zero disables serving from the cache.
// NOTE(review): presumably read_env_ms() returns the value of the
// AMDSMI_ASIC_INFO_CACHE_MS environment variable in milliseconds, with 10000
// as the fallback -- confirm against its definition.
static const std::chrono::milliseconds kAsicInfoCacheDuration(
read_env_ms("AMDSMI_ASIC_INFO_CACHE_MS", 10000)
);
}
amdsmi_status_t
amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info) {
AMDSMI_CHECK_INIT();
if (info == nullptr) {
return AMDSMI_STATUS_INVAL;
}
struct drm_amdgpu_info_device dev_info = {};
uint16_t vendor_id = 0;
uint16_t subvendor_id = 0;
uint16_t device_id = 0;
uint16_t subsystem_id = 0;
char temp_market_name[AMDSMI_MAX_STRING_LENGTH] = {0};
smi_clear_char_and_reinitialize(info->market_name, AMDSMI_MAX_STRING_LENGTH, temp_market_name);
info->market_name[0] = '\0';
info->vendor_id = std::numeric_limits<uint32_t>::max();
info->vendor_name[0] = '\0';
info->subvendor_id = std::numeric_limits<uint32_t>::max();
info->device_id = std::numeric_limits<uint64_t>::max();
info->rev_id = std::numeric_limits<uint16_t>::max();
info->asic_serial[0] = '\0';
info->oam_id = std::numeric_limits<uint32_t>::max();
info->num_of_compute_units = std::numeric_limits<uint32_t>::max();
info->target_graphics_version = std::numeric_limits<uint64_t>::max();
info->subsystem_id = std::numeric_limits<uint32_t>::max();
std::ostringstream ss;
amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
if (r != AMDSMI_STATUS_SUCCESS) {
return r;
}
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
// ---- ASIC info cache ----
const std::string key = gpu_device->get_gpu_path();
AsicInfoCache* cache_ptr = nullptr;
{
std::lock_guard<std::mutex> map_lk(g_asic_info_cache_map_mu);
cache_ptr = &g_asic_info_cache_map[key];
}
{
std::lock_guard<std::mutex> lk(cache_ptr->mtx);
auto now = std::chrono::steady_clock::now();
auto last_read_delta = std::chrono::duration_cast<std::chrono::milliseconds>(now - cache_ptr->last_read);
if (cache_ptr->valid &&
kAsicInfoCacheDuration > std::chrono::milliseconds::zero() &&
last_read_delta < kAsicInfoCacheDuration) {
*info = cache_ptr->info;
ss << "Returned cached ASIC info for key=" << key
<< " (age=" << last_read_delta.count() << "ms)";
LOG_INFO(ss);
return AMDSMI_STATUS_SUCCESS;
}
}
/**
* For other sysfs related information, get from rocm-smi
*/
// Ensure asic_serial defaults to an unsupported value
std::string max_uint64_str = "ffffffffffffffff";
smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH, max_uint64_str);
uint64_t device_uuid = 0;
amdsmi_status_t status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0,
&device_uuid);
if (status == AMDSMI_STATUS_SUCCESS) {
ss.clear();
ss << std::hex << std::setw(16) << std::setfill('0') << device_uuid;
std::string asic_serial_str = ss.str();
ss.clear();
smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH,
asic_serial_str);
ss << __PRETTY_FUNCTION__
<< " | Retrieved unique_id from rsmi: " << processor_handle << "\n"
<< " ; Successfully fell back to KFD's unique_id... \n"
<< " ; info->asic_serial (hex): " << info->asic_serial << "\n"
<< " ; info->asic_serial (dec): " << std::dec
<< static_cast<uint64_t>(std::stoull(asic_serial_str, nullptr, 16));
LOG_INFO(ss);
}
status = rsmi_wrapper(rsmi_dev_subsystem_vendor_id_get, processor_handle, 0,
&subvendor_id);
if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = subvendor_id;
status = rsmi_wrapper(rsmi_dev_subsystem_id_get, processor_handle, 0,
&subsystem_id);
if (status == AMDSMI_STATUS_SUCCESS) info->subsystem_id = subsystem_id;
char temp_vendor_name[AMDSMI_MAX_STRING_LENGTH] = {0};
status = rsmi_wrapper(rsmi_dev_pcie_vendor_name_get, processor_handle, 0,
temp_vendor_name, AMDSMI_MAX_STRING_LENGTH);
if (status == AMDSMI_STATUS_SUCCESS) {
smi_clear_char_and_reinitialize(info->vendor_name, AMDSMI_MAX_STRING_LENGTH,
temp_vendor_name);
}
uint16_t tmp_oam_id = 0;
status = rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, 0,
&(tmp_oam_id));
if (status == AMDSMI_STATUS_SUCCESS) {
info->oam_id = tmp_oam_id;
}
auto tmp_num_of_compute_units = uint32_t(0);
status = rsmi_wrapper(amd::smi::rsmi_dev_number_of_computes_get, processor_handle, 0,
&(tmp_num_of_compute_units));
if (status == AMDSMI_STATUS_SUCCESS) {
info->num_of_compute_units = tmp_num_of_compute_units;
}
auto tmp_target_gfx_version = uint64_t(0);
status = rsmi_wrapper(rsmi_dev_target_graphics_version_get, processor_handle, 0,
&(tmp_target_gfx_version));
if (status == AMDSMI_STATUS_SUCCESS) {
info->target_graphics_version = tmp_target_gfx_version;
}
status = rsmi_wrapper(rsmi_dev_id_get, processor_handle, 0,
&device_id);
ss << __PRETTY_FUNCTION__ << " | rsmi_dev_id_get() returned: "
<< smi_amdgpu_get_status_string(status, true) << "\n"
<< " ; device_id (dec): " << std::dec << device_id << "\n"
<< " ; device_id (hex): 0x"
<< std::hex << std::setw(4) << std::setfill('0') << device_id << std::dec;
LOG_INFO(ss);
if (status == AMDSMI_STATUS_SUCCESS) {
info->device_id = static_cast<uint64_t>(device_id);
}
info->rev_id = dev_info.pci_rev;
status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, 0,
&vendor_id);
if (status == AMDSMI_STATUS_SUCCESS) {
info->vendor_id = vendor_id;
}
// If vendor name is empty and the vendor id is 0x1002, set vendor name to AMD vendor string
if ((info->vendor_name[0] == '\0') && info->vendor_id == 0x1002) {
std::string amd_name = "Advanced Micro Devices Inc. [AMD/ATI]";
smi_clear_char_and_reinitialize(info->vendor_name, AMDSMI_MAX_STRING_LENGTH, amd_name);
}
status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name);
if (status != AMDSMI_STATUS_SUCCESS) {
status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0,
temp_market_name, AMDSMI_MAX_STRING_LENGTH);
if (status == AMDSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | rsmi_dev_brand_get() returned: "
<< smi_amdgpu_get_status_string(status, false) << "\n"
<< " ; temp_market_name: " << temp_market_name << "\n";
LOG_INFO(ss);
smi_clear_char_and_reinitialize(info->market_name, AMDSMI_MAX_STRING_LENGTH,
temp_market_name);
} else {
ss << __PRETTY_FUNCTION__
<< " | rsmi_dev_brand_get() failed: "
<< smi_amdgpu_get_status_string(status, false) << "\n";
LOG_INFO(ss);
}
}
std::string render_name = gpu_device->get_gpu_path();
if (render_name.empty()) {
return AMDSMI_STATUS_NOT_SUPPORTED;
}
std::string path = "/dev/dri/" + render_name;
ScopedFD drm_fd(path.c_str(), O_RDWR | O_CLOEXEC);
if (!drm_fd.valid()) {
ss << __PRETTY_FUNCTION__
<< " | Failed to open " << path << ": " << strerror(errno)
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
LOG_ERROR(ss);
return AMDSMI_STATUS_FILE_ERROR;
}
amd::smi::AMDSmiLibraryLoader libdrm;
status = libdrm.load(LIBDRM_AMDGPU_SONAME);
if (status != AMDSMI_STATUS_SUCCESS) {
libdrm.unload();
ss << __PRETTY_FUNCTION__
<< " | Failed to load " LIBDRM_AMDGPU_SONAME ": " << strerror(errno)
<< "; Returning: " << smi_amdgpu_get_status_string(status, false);
LOG_ERROR(ss);
return status;
}
// extern int drmCommandWrite(int fd, unsigned long drmCommandIndex,
// void *data, unsigned long size);
typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex,
void *data, unsigned long size);
drmCommandWrite_t drmCommandWrite = nullptr;
// load symbol from libdrm
status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
"drmCommandWrite");
if (status != AMDSMI_STATUS_SUCCESS) {
libdrm.unload();
ss << __PRETTY_FUNCTION__
<< " | Failed to load drmCommandWrite symbol"
<< " | Returning: " << smi_amdgpu_get_status_string(status, false);
LOG_ERROR(ss);
return status;
}
// Get the device info
memset(&dev_info, 0, sizeof(struct drm_amdgpu_info_device));
struct drm_amdgpu_info request = {};
memset(&request, 0, sizeof(request));
request.return_pointer = reinterpret_cast<unsigned long long>(&dev_info);
request.return_size = sizeof(struct drm_amdgpu_info_device);
request.query = AMDGPU_INFO_DEV_INFO;
auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
sizeof(struct drm_amdgpu_info));
if (drm_write != 0) {
libdrm.unload();
ss << __PRETTY_FUNCTION__
<< " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n"
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false);
LOG_ERROR(ss);
return AMDSMI_STATUS_DRM_ERROR;
}
// TODO(cpoag): check if this is correct, might be able to go through KGD/KFD
info->rev_id = static_cast<uint32_t>(dev_info.pci_rev);
libdrm.unload();
ss << __PRETTY_FUNCTION__
<< " | info->market_name: " << info->market_name << "\n"
<< " | info->vendor_id (dec): " << std::dec << info->vendor_id << "\n"
<< " | info->vendor_id (hex): 0x"
<< std::hex << std::setw(4) << std::setfill('0') << info->vendor_id << "\n"
<< " | info->vendor_name: " << info->vendor_name << "\n"
<< " | info->subvendor_id (dec): " << std::dec << info->subvendor_id << "\n"
<< " | info->subvendor_id (hex): 0x"
<< std::hex << std::setw(4) << std::setfill('0') << info->subvendor_id << "\n"
<< " | info->device_id (dec): " << std::dec << info->device_id << "\n"
<< " | info->device_id (hex): 0x"
<< std::hex << std::setw(4) << std::setfill('0') << info->device_id << "\n"
<< " | info->rev_id (dec): " << std::dec << info->rev_id << "\n"
<< " | info->rev_id (hex): 0x"
<< std::hex << std::setw(4) << std::setfill('0') << info->rev_id << "\n"
<< " | info->asic_serial: 0x" << info->asic_serial << "\n"
<< " | info->oam_id (dec): " << std::dec << info->oam_id << "\n"
<< " | info->oam_id (hex): 0x"
<< std::hex << std::setw(4) << std::setfill('0') << info->oam_id << "\n"
<< " | info->num_of_compute_units (dec): " << std::dec
<< info->num_of_compute_units << "\n"
<< " | info->target_graphics_version: gfx"
<< std::hex << info->target_graphics_version << "\n"
<< " | Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, true);
LOG_INFO(ss);
// ---- Store cache success ----
if (status == AMDSMI_STATUS_SUCCESS &&
kAsicInfoCacheDuration > std::chrono::milliseconds::zero()) {
auto now = std::chrono::steady_clock::now();
std::lock_guard<std::mutex> lk(cache_ptr->mtx);
cache_ptr->info = *info;
cache_ptr->last_read = now;
cache_ptr->valid = true;
ss << "Successfully Cached ASIC info for key=" << key;
LOG_INFO(ss);
}
return AMDSMI_STATUS_SUCCESS;
}
/**
 * Report the state of each XGMI link of a GPU.
 *
 * The raw per-link values come from amdsmi_get_gpu_metrics_info() and are
 * translated as follows:
 *   - uint16_t max  -> AMDSMI_XGMI_LINK_DISABLE
 *   - 0             -> AMDSMI_XGMI_LINK_DOWN
 *   - 1             -> AMDSMI_XGMI_LINK_UP
 * Any other raw value stops the translation and yields
 * AMDSMI_STATUS_UNEXPECTED_DATA.
 *
 * @param processor_handle  GPU to query.
 * @param link_status       Output; must not be null.
 * @return AMDSMI_STATUS_INVAL for a null output pointer, the metrics query
 *         status if that query fails, otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t
amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle,
                                amdsmi_xgmi_link_status_t *link_status) {
    AMDSMI_CHECK_INIT();

    if (link_status == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amdsmi_gpu_metrics_t gpu_metrics = {};
    const amdsmi_status_t metrics_status =
        amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics);
    if (metrics_status != AMDSMI_STATUS_SUCCESS) {
        return metrics_status;
    }

    // NOTE(review): the device count is queried here but never consumed below.
    uint32_t monitor_count = 0;
    rsmi_num_monitor_devices(&monitor_count);

    link_status->total_links = AMDSMI_MAX_NUM_XGMI_LINKS;

    // Translate each raw metric value into the public link-state enum.
    for (unsigned int link = 0; link < link_status->total_links; ++link) {
        switch (gpu_metrics.xgmi_link_status[link]) {
            case std::numeric_limits<uint16_t>::max():
                link_status->status[link] = AMDSMI_XGMI_LINK_DISABLE;
                break;
            case 0:
                link_status->status[link] = AMDSMI_XGMI_LINK_DOWN;
                break;
            case 1:
                link_status->status[link] = AMDSMI_XGMI_LINK_UP;
                break;
            default:
                return AMDSMI_STATUS_UNEXPECTED_DATA;
        }
    }

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Collect the KFD id, node id and current partition id of a GPU.
 *
 * Each field is first set to the all-ones value of its type, which this
 * function uses as the "not supported" marker, and is only overwritten when
 * the matching rsmi query succeeds. A failing query never aborts the call.
 *
 * @param processor_handle  GPU to query.
 * @param info              Output; must not be null.
 * @return AMDSMI_STATUS_INVAL for a null output pointer, otherwise
 *         AMDSMI_STATUS_SUCCESS regardless of individual query results.
 */
amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle,
                                        amdsmi_kfd_info_t *info) {
    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Start from the "not supported" markers for every field.
    info->kfd_id = std::numeric_limits<uint64_t>::max();
    info->node_id = std::numeric_limits<uint32_t>::max();
    info->current_partition_id = std::numeric_limits<uint32_t>::max();

    // Each query is best-effort: keep going even if one of them fails.
    uint64_t queried_kfd_id = 0;
    if (rsmi_wrapper(rsmi_dev_guid_get, processor_handle, 0,
                     &queried_kfd_id) == AMDSMI_STATUS_SUCCESS) {
        info->kfd_id = queried_kfd_id;
    }

    uint32_t queried_node_id = 0;
    if (rsmi_wrapper(rsmi_dev_node_id_get, processor_handle, 0,
                     &queried_node_id) == AMDSMI_STATUS_SUCCESS) {
        info->node_id = queried_node_id;
    }

    uint32_t queried_partition_id = 0;
    if (rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0,
                     &queried_partition_id) == AMDSMI_STATUS_SUCCESS) {
        info->current_partition_id = queried_partition_id;
    }

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Read the subsystem id of a GPU.
 *
 * Forwards to rsmi_dev_subsystem_id_get through rsmi_wrapper and returns the
 * wrapper's status unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param id                Output location handed to the rsmi call.
 */
amdsmi_status_t amdsmi_get_gpu_subsystem_id(amdsmi_processor_handle processor_handle,
                                            uint16_t *id) {
    const amdsmi_status_t query_status =
        rsmi_wrapper(rsmi_dev_subsystem_id_get, processor_handle, 0, id);
    return query_status;
}
/**
 * Read the subsystem name of a GPU into a caller-provided buffer.
 *
 * Forwards to rsmi_dev_subsystem_name_get through rsmi_wrapper and returns the
 * wrapper's status unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param name              Destination buffer handed to the rsmi call.
 * @param len               Size value handed to the rsmi call alongside name.
 */
amdsmi_status_t amdsmi_get_gpu_subsystem_name(
        amdsmi_processor_handle processor_handle,
        char *name, size_t len) {
    const amdsmi_status_t query_status =
        rsmi_wrapper(rsmi_dev_subsystem_name_get, processor_handle, 0, name, len);
    return query_status;
}
/**
 * Read the vendor name of a GPU into a caller-provided buffer.
 *
 * Forwards to rsmi_dev_vendor_name_get through rsmi_wrapper and returns the
 * wrapper's status unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param name              Destination buffer handed to the rsmi call.
 * @param len               Size value handed to the rsmi call alongside name.
 */
amdsmi_status_t amdsmi_get_gpu_vendor_name(
        amdsmi_processor_handle processor_handle, char *name, size_t len) {
    const amdsmi_status_t query_status =
        rsmi_wrapper(rsmi_dev_vendor_name_get, processor_handle, 0, name, len);
    return query_status;
}
/**
 * Read the VRAM vendor string of a GPU into a caller-provided buffer.
 *
 * Forwards to rsmi_dev_vram_vendor_get through rsmi_wrapper and returns the
 * wrapper's status unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param brand             Destination buffer handed to the rsmi call.
 * @param len               Size value handed to the rsmi call alongside brand.
 */
amdsmi_status_t amdsmi_get_gpu_vram_vendor(amdsmi_processor_handle processor_handle,
                                           char *brand, uint32_t len) {
    const amdsmi_status_t query_status =
        rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, 0, brand, len);
    return query_status;
}
/**
 * Fill an amdsmi_vram_info_t for a GPU.
 *
 * The VRAM type and bit width come from an AMDGPU_INFO_DEV_INFO query issued
 * through libdrm's drmCommandWrite on the device's /dev/dri node. The maximum
 * bandwidth (GPU metrics), vendor string and total size (rsmi) are then added
 * on a best-effort basis: a failure of one of those three queries leaves the
 * corresponding default in place and does not fail the call.
 *
 * @param processor_handle  GPU to query.
 * @param info              Output; must not be null.
 * @return AMDSMI_STATUS_INVAL for a null output pointer, the handle lookup
 *         status if the handle cannot be resolved, AMDSMI_STATUS_NOT_SUPPORTED
 *         when the device has no render path, AMDSMI_STATUS_FILE_ERROR when
 *         the DRM node cannot be opened, the loader status when libdrm or the
 *         drmCommandWrite symbol cannot be loaded, AMDSMI_STATUS_DRM_ERROR
 *         when the DRM query fails, otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_gpu_vram_info(
        amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info) {
    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle,
                                                   &gpu_device);
    if (r != AMDSMI_STATUS_SUCCESS) {
        return r;
    }

    std::ostringstream ss;

    // init the info structure with default value
    info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN;
    info->vram_size = 0;
    strncpy(info->vram_vendor, "UNKNOWN", AMDSMI_MAX_STRING_LENGTH);
    info->vram_bit_width = std::numeric_limits<decltype(info->vram_bit_width)>::max();
    info->vram_max_bandwidth = std::numeric_limits<decltype(info->vram_max_bandwidth)>::max();

    SMIGPUDEVICE_MUTEX(gpu_device->get_mutex());

    std::string render_name = gpu_device->get_gpu_path();
    std::string path = "/dev/dri/" + render_name;
    if (render_name.empty()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    ScopedFD drm_fd(path.c_str(), O_RDWR | O_CLOEXEC);
    if (!drm_fd.valid()) {
        ss << __PRETTY_FUNCTION__
           << " | Failed to open " << path << ": " << strerror(errno)
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_FILE_ERROR;
    }

    amd::smi::AMDSmiLibraryLoader libdrm;
    amdsmi_status_t status = libdrm.load(LIBDRM_AMDGPU_SONAME);
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load " LIBDRM_AMDGPU_SONAME ": " << strerror(errno)
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    ss << __PRETTY_FUNCTION__
       << " | about to load drmCommandWrite symbol";
    LOG_INFO(ss);

    // extern int drmCommandWrite(int fd, unsigned long drmCommandIndex,
    //                            void *data, unsigned long size);
    typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex,
                                     void *data, unsigned long size);
    drmCommandWrite_t drmCommandWrite = nullptr;

    // load symbol from libdrm
    status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
                                "drmCommandWrite");
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load drmCommandWrite symbol"
           << " | Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    ss << __PRETTY_FUNCTION__
       << " | drmCommandWrite symbol loaded successfully";
    LOG_INFO(ss);

    // Ask the kernel driver for the device info block (holds VRAM type/width).
    struct drm_amdgpu_info_device dev_info = {};
    memset(&dev_info, 0, sizeof(struct drm_amdgpu_info_device));

    struct drm_amdgpu_info request = {};
    memset(&request, 0, sizeof(request));
    request.return_pointer = reinterpret_cast<unsigned long long>(&dev_info);
    request.return_size = sizeof(struct drm_amdgpu_info_device);
    request.query = AMDGPU_INFO_DEV_INFO;

    auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
                                     sizeof(struct drm_amdgpu_info));
    if (drm_write != 0) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n"
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_DRM_ERROR;
    }

    info->vram_type = amd::smi::vram_type_value(dev_info.vram_type);
    info->vram_bit_width = dev_info.vram_bit_width;
    libdrm.unload();

    // if vram type is greater than the max enum set it to unknown
    if (info->vram_type > AMDSMI_VRAM_TYPE__MAX) info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN;

    // set info->vram_max_bandwidth to gpu_metrics vram_max_bandwidth if it is not set
    amdsmi_gpu_metrics_t metric_info = {};
    r = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info);
    if (r == AMDSMI_STATUS_SUCCESS) {
        info->vram_max_bandwidth = metric_info.vram_max_bandwidth;
    }

    // Upper-case the vendor string reported by rsmi before publishing it.
    char brand[256] = {'\0'};
    r = rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, 0, brand, 255);
    if (r == AMDSMI_STATUS_SUCCESS) {
        for (auto &x : brand) {
            // toupper() is only defined for values representable as
            // unsigned char (or EOF); a negative plain char would be UB.
            x = static_cast<char>(toupper(static_cast<unsigned char>(x)));
        }
        strncpy(info->vram_vendor, brand, AMDSMI_MAX_STRING_LENGTH);
    }

    // rsmi reports bytes; the public struct carries the size in MB.
    uint64_t total = 0;
    r = rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, 0,
                     RSMI_MEM_TYPE_VRAM, &total);
    if (r == AMDSMI_STATUS_SUCCESS) {
        info->vram_size = total / (1024 * 1024);
    }

    // Compare against the max of the field's own type so that the "N/A"
    // branch matches the default assigned at the top of this function.
    ss << __PRETTY_FUNCTION__
       << " | info->vram_type: " << std::dec << info->vram_type << "\n"
       << "; info->vram_size (MB): " << std::dec << info->vram_size << "\n"
       << "; info->vram_vendor: " << std::dec << info->vram_vendor << "\n"
       << "; info->vram_bit_width: " << std::dec
       << (info->vram_bit_width == std::numeric_limits<decltype(info->vram_bit_width)>::max() ?
           "N/A" : std::to_string(info->vram_bit_width)) << "\n"
       << "; info->vram_max_bandwidth (GB/s): " << std::dec
       << info->vram_max_bandwidth << "\n"
       << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, false);
    LOG_INFO(ss);

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Set up event notification for a GPU.
 *
 * Forwards to rsmi_event_notification_init through rsmi_wrapper and returns
 * the wrapper's status unchanged.
 *
 * @param processor_handle  GPU to set up.
 */
amdsmi_status_t
amdsmi_init_gpu_event_notification(amdsmi_processor_handle processor_handle) {
    const amdsmi_status_t init_status =
        rsmi_wrapper(rsmi_event_notification_init, processor_handle, 0);
    return init_status;
}
/**
 * Set the event notification mask of a GPU.
 *
 * Forwards to rsmi_event_notification_mask_set through rsmi_wrapper and
 * returns the wrapper's status unchanged.
 *
 * @param processor_handle  GPU to configure.
 * @param mask              Mask value handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_set_gpu_event_notification_mask(amdsmi_processor_handle processor_handle,
                                       uint64_t mask) {
    const amdsmi_status_t mask_status =
        rsmi_wrapper(rsmi_event_notification_mask_set, processor_handle, 0, mask);
    return mask_status;
}
/**
 * Fetch pending GPU event notifications and convert them to amdsmi records.
 *
 * A scratch array of *num_elem rsmi records is passed to
 * rsmi_event_notification_get, which also receives num_elem itself. After a
 * successful call the first *num_elem records are converted into data[]:
 * the event type is cast, the message is copied, and the rsmi device index
 * is mapped back to a processor handle.
 *
 * @param timeout_ms  Timeout forwarded to rsmi_event_notification_get.
 * @param num_elem    In: number of entries available in data. Passed by
 *                    pointer to the rsmi call; its value after the call
 *                    bounds the conversion loop. Must not be null.
 * @param data        Output array; must not be null.
 * @return AMDSMI_STATUS_INVAL for null pointers, the converted rsmi status if
 *         the rsmi call fails, the index-to-handle status if that mapping
 *         fails, otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t
amdsmi_get_gpu_event_notification(int timeout_ms,
                                  uint32_t *num_elem, amdsmi_evt_notification_data_t *data) {
    AMDSMI_CHECK_INIT();

    if (num_elem == nullptr || data == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Get the rsmi data. data() (unlike &r_data[0]) is well defined even when
    // the caller passes *num_elem == 0 and the vector is therefore empty.
    std::vector<rsmi_evt_notification_data_t> r_data(*num_elem);
    const rsmi_status_t rsmi_status = rsmi_event_notification_get(
        timeout_ms, num_elem, r_data.data());
    if (rsmi_status != RSMI_STATUS_SUCCESS) {
        return amd::smi::rsmi_to_amdsmi_status(rsmi_status);
    }

    // convert output
    for (uint32_t i = 0; i < *num_elem; i++) {
        // Bind by reference: the record embeds its message buffer, so a
        // by-value copy per iteration is needless work.
        const rsmi_evt_notification_data_t &rsmi_data = r_data[i];

        data[i].event = static_cast<amdsmi_evt_notification_type_t>(
            rsmi_data.event);
        // Size is tied max event notification size
        snprintf(data[i].message,
                 AMDSMI_MAX_STRING_LENGTH,
                 "%s",
                 rsmi_data.message);

        const amdsmi_status_t handle_status = amd::smi::AMDSmiSystem::getInstance()
            .gpu_index_to_handle(rsmi_data.dv_ind, &(data[i].processor_handle));
        if (handle_status != AMDSMI_STATUS_SUCCESS) {
            return handle_status;
        }
    }

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Stop event notification for a GPU.
 *
 * Forwards to rsmi_event_notification_stop through rsmi_wrapper and returns
 * the wrapper's status unchanged.
 *
 * @param processor_handle  GPU to stop notifications for.
 */
amdsmi_status_t amdsmi_stop_gpu_event_notification(
        amdsmi_processor_handle processor_handle) {
    const amdsmi_status_t stop_status =
        rsmi_wrapper(rsmi_event_notification_stop, processor_handle, 0);
    return stop_status;
}
/**
 * Ask whether a GPU supports a given counter event group.
 *
 * The amdsmi group value is cast to rsmi_event_group_t and forwarded to
 * rsmi_dev_counter_group_supported through rsmi_wrapper; the wrapper's status
 * is returned unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param group             Event group to test.
 */
amdsmi_status_t amdsmi_gpu_counter_group_supported(
        amdsmi_processor_handle processor_handle, amdsmi_event_group_t group) {
    const auto rsmi_group = static_cast<rsmi_event_group_t>(group);
    return rsmi_wrapper(rsmi_dev_counter_group_supported, processor_handle, 0,
                        rsmi_group);
}
/**
 * Create a performance counter on a GPU.
 *
 * The event type and the handle pointer are cast to their rsmi counterparts
 * and forwarded to rsmi_dev_counter_create through rsmi_wrapper; the
 * wrapper's status is returned unchanged.
 *
 * @param processor_handle  GPU on which to create the counter.
 * @param type              Event type to count.
 * @param evnt_handle       Output location for the new counter handle.
 */
amdsmi_status_t amdsmi_gpu_create_counter(amdsmi_processor_handle processor_handle,
        amdsmi_event_type_t type, amdsmi_event_handle_t *evnt_handle) {
    const auto rsmi_type = static_cast<rsmi_event_type_t>(type);
    auto *rsmi_handle_out = static_cast<rsmi_event_handle_t*>(evnt_handle);
    return rsmi_wrapper(rsmi_dev_counter_create, processor_handle, 0,
                        rsmi_type, rsmi_handle_out);
}
/**
 * Destroy a previously created counter.
 *
 * Calls rsmi_dev_counter_destroy with the handle cast to rsmi_event_handle_t
 * and converts the rsmi status to an amdsmi status.
 *
 * @param evnt_handle  Counter handle to destroy.
 */
amdsmi_status_t amdsmi_gpu_destroy_counter(amdsmi_event_handle_t evnt_handle) {
    const auto rsmi_handle = static_cast<rsmi_event_handle_t>(evnt_handle);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_dev_counter_destroy(rsmi_handle));
}
/**
 * Issue a control command to a counter.
 *
 * Calls rsmi_counter_control with the handle and command cast to their rsmi
 * types and converts the rsmi status to an amdsmi status.
 *
 * @param evt_handle  Counter handle to control.
 * @param cmd         Command to issue.
 * @param cmd_args    Opaque argument pointer passed through to rsmi untouched.
 */
amdsmi_status_t amdsmi_gpu_control_counter(amdsmi_event_handle_t evt_handle,
        amdsmi_counter_command_t cmd, void *cmd_args) {
    const auto rsmi_handle = static_cast<rsmi_event_handle_t>(evt_handle);
    const auto rsmi_cmd = static_cast<rsmi_counter_command_t>(cmd);
    return amd::smi::rsmi_to_amdsmi_status(
        rsmi_counter_control(rsmi_handle, rsmi_cmd, cmd_args));
}
/**
 * Read the current value of a counter.
 *
 * Calls rsmi_counter_read with the handle cast to rsmi_event_handle_t and the
 * output pointer reinterpreted as rsmi_counter_value_t*, then converts the
 * rsmi status to an amdsmi status.
 *
 * @param evt_handle  Counter handle to read.
 * @param value       Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_gpu_read_counter(amdsmi_event_handle_t evt_handle,
                        amdsmi_counter_value_t *value) {
    const auto rsmi_handle = static_cast<rsmi_event_handle_t>(evt_handle);
    auto *rsmi_value_out = reinterpret_cast<rsmi_counter_value_t*>(value);
    return amd::smi::rsmi_to_amdsmi_status(
        rsmi_counter_read(rsmi_handle, rsmi_value_out));
}
/**
 * Query how many counters are available for an event group on a GPU.
 *
 * The group is cast to rsmi_event_group_t and forwarded to
 * rsmi_counter_available_counters_get through rsmi_wrapper; the wrapper's
 * status is returned unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param grp               Event group of interest.
 * @param available         Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_get_gpu_available_counters(amdsmi_processor_handle processor_handle,
                                  amdsmi_event_group_t grp, uint32_t *available) {
    const auto rsmi_group = static_cast<rsmi_event_group_t>(grp);
    return rsmi_wrapper(rsmi_counter_available_counters_get, processor_handle, 0,
                        rsmi_group, available);
}
/**
 * Read the NUMA node number associated with a GPU.
 *
 * Forwards to rsmi_topo_get_numa_node_number through rsmi_wrapper and returns
 * the wrapper's status unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param numa_node         Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_topo_get_numa_node_number(amdsmi_processor_handle processor_handle, uint32_t *numa_node) {
    const amdsmi_status_t query_status =
        rsmi_wrapper(rsmi_topo_get_numa_node_number, processor_handle, 0, numa_node);
    return query_status;
}
/**
 * Query the link weight between two GPUs.
 *
 * Both handles are resolved to GPU devices (source first); the first failing
 * lookup's status is returned. Otherwise rsmi_topo_get_link_weight is called
 * with the two GPU ids and its status is converted to an amdsmi status.
 *
 * @param processor_handle_src  Source GPU.
 * @param processor_handle_dst  Destination GPU.
 * @param weight                Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_topo_get_link_weight(amdsmi_processor_handle processor_handle_src, amdsmi_processor_handle processor_handle_dst,
                            uint64_t *weight) {
    AMDSMI_CHECK_INIT();

    amd::smi::AMDSmiGPUDevice* source_gpu = nullptr;
    amdsmi_status_t lookup_status =
        get_gpu_device_from_handle(processor_handle_src, &source_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    amd::smi::AMDSmiGPUDevice* target_gpu = nullptr;
    lookup_status = get_gpu_device_from_handle(processor_handle_dst, &target_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    const auto rsmi_result = rsmi_topo_get_link_weight(
        source_gpu->get_gpu_id(), target_gpu->get_gpu_id(), weight);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
/**
 * Query the minimum and maximum bandwidth between two GPUs.
 *
 * Both handles are resolved to GPU devices (source first); the first failing
 * lookup's status is returned. Otherwise rsmi_minmax_bandwidth_get is called
 * with the two GPU ids and its status is converted to an amdsmi status.
 *
 * @param processor_handle_src  Source GPU.
 * @param processor_handle_dst  Destination GPU.
 * @param min_bandwidth         Output location handed to the rsmi call.
 * @param max_bandwidth         Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_get_minmax_bandwidth_between_processors(amdsmi_processor_handle processor_handle_src, amdsmi_processor_handle processor_handle_dst,
                                               uint64_t *min_bandwidth, uint64_t *max_bandwidth) {
    AMDSMI_CHECK_INIT();

    amd::smi::AMDSmiGPUDevice* source_gpu = nullptr;
    amdsmi_status_t lookup_status =
        get_gpu_device_from_handle(processor_handle_src, &source_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    amd::smi::AMDSmiGPUDevice* target_gpu = nullptr;
    lookup_status = get_gpu_device_from_handle(processor_handle_dst, &target_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    const auto rsmi_result = rsmi_minmax_bandwidth_get(
        source_gpu->get_gpu_id(), target_gpu->get_gpu_id(),
        min_bandwidth, max_bandwidth);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
amdsmi_status_t amdsmi_get_link_metrics(amdsmi_processor_handle processor_handle,
amdsmi_link_metrics_t *link_metrics) {
AMDSMI_CHECK_INIT();
if (link_metrics == nullptr) return AMDSMI_STATUS_INVAL;
amdsmi_gpu_metrics_t metric_info = {};
for (unsigned int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
link_metrics->links[i].max_bandwidth = std::numeric_limits<uint32_t>::max();
}
amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
processor_handle, &metric_info);
if (status != AMDSMI_STATUS_SUCCESS)
return status;
link_metrics->num_links = AMDSMI_MAX_NUM_XGMI_LINKS;
uint16_t link_to_dst_node[AMDSMI_MAX_NUM_XGMI_LINKS];
std::fill_n(link_to_dst_node, AMDSMI_MAX_NUM_XGMI_LINKS, std::numeric_limits<uint16_t>::max());
status = rsmi_wrapper(rsmi_dev_xgmi_port_num_get, processor_handle, 0,
&link_metrics->num_links, link_to_dst_node);
for (unsigned int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; i++) {
memset(&link_metrics->links[i].bdf, 0xFF, sizeof(amdsmi_bdf_t));
if (link_to_dst_node[i] != std::numeric_limits<uint16_t>::max()) {
uint32_t node_id = link_to_dst_node[i];
std::string node_symlink = "node" + std::to_string(node_id);
std::string sysfs_base = "/sys/bus/pci/devices/";
DIR *dir = opendir(sysfs_base.c_str());
if (dir) {
struct dirent *entry;
while ((entry = readdir(dir)) != nullptr) {
if (entry->d_type != DT_DIR && entry->d_type != DT_LNK)
continue;
std::string bdf = entry->d_name;
if (bdf == "." || bdf == "..") continue;
std::string symlink_path = sysfs_base + bdf + "/xgmi_hive_info/" + node_symlink;
char buf[PATH_MAX] = {0};
ssize_t len = readlink(symlink_path.c_str(), buf, sizeof(buf)-1);
if (len > 0) {
buf[len] = '\0';
std::string target(buf);
size_t last_slash = target.find_last_of('/');
std::string bdf_str = (last_slash != std::string::npos) ? target.substr(last_slash + 1) : target;
// Parse BDF string: "dddd:bb:dd.f"
unsigned domain = 0, bus = 0, device = 0, function = 0;
if (sscanf(bdf_str.c_str(), "%4x:%2x:%2x.%1x", &domain, &bus, &device, &function) == 4) {
amdsmi_bdf_t dst_bdf = {};
dst_bdf.domain_number = static_cast<uint16_t>(domain);
dst_bdf.bus_number = static_cast<uint8_t>(bus);
dst_bdf.device_number = static_cast<uint8_t>(device);
dst_bdf.function_number = static_cast<uint8_t>(function);
link_metrics->links[i].bdf = dst_bdf;
}
break; // Found, stop searching
}
}
closedir(dir);
}
}
link_metrics->links[i].read = metric_info.xgmi_read_data_acc[i];
link_metrics->links[i].write = metric_info.xgmi_write_data_acc[i];
link_metrics->links[i].link_type = AMDSMI_LINK_TYPE_XGMI;
link_metrics->links[i].bit_rate = metric_info.xgmi_link_speed;
if ((metric_info.xgmi_link_speed != std::numeric_limits<uint16_t>::max()) &&
(metric_info.xgmi_link_width != std::numeric_limits<uint16_t>::max()))
link_metrics->links[i].max_bandwidth = metric_info.xgmi_link_speed * metric_info.xgmi_link_width;
}
return AMDSMI_STATUS_SUCCESS;
}
/**
 * Query the hop count and link type between two GPUs.
 *
 * Both handles are resolved to GPU devices (source first); the first failing
 * lookup's status is returned. Otherwise rsmi_topo_get_link_type is called
 * with the two GPU ids, with the type pointer reinterpreted as
 * RSMI_IO_LINK_TYPE*, and its status is converted to an amdsmi status.
 *
 * @param processor_handle_src  Source GPU.
 * @param processor_handle_dst  Destination GPU.
 * @param hops                  Output location handed to the rsmi call.
 * @param type                  Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_topo_get_link_type(amdsmi_processor_handle processor_handle_src, amdsmi_processor_handle processor_handle_dst,
                          uint64_t *hops, amdsmi_link_type_t *type) {
    AMDSMI_CHECK_INIT();

    amd::smi::AMDSmiGPUDevice* source_gpu = nullptr;
    amdsmi_status_t lookup_status =
        get_gpu_device_from_handle(processor_handle_src, &source_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    amd::smi::AMDSmiGPUDevice* target_gpu = nullptr;
    lookup_status = get_gpu_device_from_handle(processor_handle_dst, &target_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    auto *rsmi_type_out = reinterpret_cast<RSMI_IO_LINK_TYPE*>(type);
    const auto rsmi_result = rsmi_topo_get_link_type(
        source_gpu->get_gpu_id(), target_gpu->get_gpu_id(), hops, rsmi_type_out);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
/**
 * Query whether two GPUs are P2P accessible to each other.
 *
 * Both handles are resolved to GPU devices (source first); the first failing
 * lookup's status is returned. Otherwise rsmi_is_P2P_accessible is called
 * with the two GPU ids and its status is converted to an amdsmi status.
 *
 * @param processor_handle_src  Source GPU.
 * @param processor_handle_dst  Destination GPU.
 * @param accessible            Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_is_P2P_accessible(amdsmi_processor_handle processor_handle_src,
                         amdsmi_processor_handle processor_handle_dst,
                         bool *accessible) {
    AMDSMI_CHECK_INIT();

    amd::smi::AMDSmiGPUDevice* source_gpu = nullptr;
    amdsmi_status_t lookup_status =
        get_gpu_device_from_handle(processor_handle_src, &source_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    amd::smi::AMDSmiGPUDevice* target_gpu = nullptr;
    lookup_status = get_gpu_device_from_handle(processor_handle_dst, &target_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    const auto rsmi_result = rsmi_is_P2P_accessible(
        source_gpu->get_gpu_id(), target_gpu->get_gpu_id(), accessible);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
/**
 * Query the link type and P2P capabilities between two GPUs.
 *
 * Both handles are resolved to GPU devices (source first); the first failing
 * lookup's status is returned. Otherwise rsmi_topo_get_p2p_status is called
 * with the two GPU ids, with both output pointers reinterpreted as their rsmi
 * counterparts, and its status is converted to an amdsmi status.
 *
 * @param processor_handle_src  Source GPU.
 * @param processor_handle_dst  Destination GPU.
 * @param type                  Output location handed to the rsmi call.
 * @param cap                   Output location handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_topo_get_p2p_status(amdsmi_processor_handle processor_handle_src,
                           amdsmi_processor_handle processor_handle_dst,
                           amdsmi_link_type_t *type, amdsmi_p2p_capability_t *cap) {
    AMDSMI_CHECK_INIT();

    amd::smi::AMDSmiGPUDevice* source_gpu = nullptr;
    amdsmi_status_t lookup_status =
        get_gpu_device_from_handle(processor_handle_src, &source_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    amd::smi::AMDSmiGPUDevice* target_gpu = nullptr;
    lookup_status = get_gpu_device_from_handle(processor_handle_dst, &target_gpu);
    if (lookup_status != AMDSMI_STATUS_SUCCESS) {
        return lookup_status;
    }

    auto *rsmi_type_out = reinterpret_cast<RSMI_IO_LINK_TYPE*>(type);
    auto *rsmi_cap_out = reinterpret_cast<rsmi_p2p_capability_t*>(cap);
    const auto rsmi_result = rsmi_topo_get_p2p_status(
        source_gpu->get_gpu_id(), target_gpu->get_gpu_id(),
        rsmi_type_out, rsmi_cap_out);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
// Compute Partition functions
/**
 * Read the current compute partition of a GPU as a string.
 *
 * Forwards to rsmi_dev_compute_partition_get through rsmi_wrapper, logs the
 * resulting status at info level and returns it.
 *
 * @param processor_handle   GPU to query.
 * @param compute_partition  Destination buffer handed to the rsmi call.
 * @param len                Size value handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_get_gpu_compute_partition(amdsmi_processor_handle processor_handle,
                                 char *compute_partition, uint32_t len) {
    AMDSMI_CHECK_INIT();

    std::ostringstream log_line;
    const amdsmi_status_t query_status = rsmi_wrapper(
        rsmi_dev_compute_partition_get, processor_handle, 0, compute_partition, len);

    log_line << __PRETTY_FUNCTION__ << " | rsmi_dev_compute_partition_get() returned: "
             << smi_amdgpu_get_status_string(query_status, false);
    LOG_INFO(log_line);

    return query_status;
}
/**
 * Request a new compute partition mode for a GPU.
 *
 * The amdsmi partition type is cast to rsmi_compute_partition_type_t and
 * forwarded to rsmi_dev_compute_partition_set through rsmi_wrapper; the
 * wrapper's status is returned unchanged.
 *
 * @param processor_handle   GPU to configure.
 * @param compute_partition  Partition mode to request.
 */
amdsmi_status_t
amdsmi_set_gpu_compute_partition(amdsmi_processor_handle processor_handle,
                                 amdsmi_compute_partition_type_t compute_partition) {
    AMDSMI_CHECK_INIT();

    const auto rsmi_partition =
        static_cast<rsmi_compute_partition_type_t>(compute_partition);
    return rsmi_wrapper(rsmi_dev_compute_partition_set, processor_handle, 0,
                        rsmi_partition);
}
// Memory Partition functions
/**
 * Read the current memory partition of a GPU as a string.
 *
 * Forwards to rsmi_dev_memory_partition_get through rsmi_wrapper and returns
 * the wrapper's status unchanged.
 *
 * @param processor_handle  GPU to query.
 * @param memory_partition  Destination buffer handed to the rsmi call.
 * @param len               Size value handed to the rsmi call.
 */
amdsmi_status_t
amdsmi_get_gpu_memory_partition(amdsmi_processor_handle processor_handle,
                                char *memory_partition, uint32_t len) {
    AMDSMI_CHECK_INIT();

    return rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, 0,
                        memory_partition, len);
}
/**
 * Request a new memory partition (NPS) mode for a GPU.
 *
 * Only UNKNOWN, NPS1, NPS2, NPS4 and NPS8 pass validation, and the value must
 * also have an entry in nps_amdsmi_to_RSMI; anything else yields
 * AMDSMI_STATUS_INVAL. After the set call the current partition is read back
 * purely for the log line. The returned status is that of the set call.
 *
 * @param processor_handle  GPU to configure.
 * @param memory_partition  Partition mode to request.
 */
amdsmi_status_t
amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
                                amdsmi_memory_partition_type_t memory_partition) {
    AMDSMI_CHECK_INIT();

    // Validate the request and pick its log label in one pass.
    std::string requested_label;
    switch (memory_partition) {
        case AMDSMI_MEMORY_PARTITION_NPS1:
            requested_label = "NPS1";
            break;
        case AMDSMI_MEMORY_PARTITION_NPS2:
            requested_label = "NPS2";
            break;
        case AMDSMI_MEMORY_PARTITION_NPS4:
            requested_label = "NPS4";
            break;
        case AMDSMI_MEMORY_PARTITION_NPS8:
            requested_label = "NPS8";
            break;
        case AMDSMI_MEMORY_PARTITION_UNKNOWN:
            requested_label = "UNKNOWN";
            break;
        default:
            return AMDSMI_STATUS_INVAL;
    }

    std::ostringstream ss;
    std::lock_guard<std::mutex> g(myMutex);

    // Translate to the rsmi enum; a value without a mapping is rejected.
    const auto mapping = nps_amdsmi_to_RSMI.find(memory_partition);
    if (mapping == nps_amdsmi_to_RSMI.end()) {
        return AMDSMI_STATUS_INVAL;
    }
    rsmi_memory_partition_type_t rsmi_type = mapping->second;

    const amdsmi_status_t set_status = rsmi_wrapper(
        rsmi_dev_memory_partition_set, processor_handle, 0, rsmi_type);

    // Read the partition back so the log shows what is currently active.
    const uint32_t k256 = 256;
    char readback[k256];
    std::string active_label = "UNKNOWN";
    const amdsmi_status_t readback_status = rsmi_wrapper(
        rsmi_dev_memory_partition_get, processor_handle, 0, readback, k256);
    if (readback_status == AMDSMI_STATUS_SUCCESS) {
        active_label = readback;
    }

    ss << __PRETTY_FUNCTION__
       << " | After attepting to set memory partition to " << requested_label << "\n"
       << " | Current memory partition is " << active_label << "\n"
       << " | Returning: " << smi_amdgpu_get_status_string(set_status, false)
       << " | User will need to reload driver in order to see a NPS mode change";
    LOG_INFO(ss);

    return set_status;
}
/**
 * Report the current memory partition mode and the supported NPS modes.
 *
 * The config is first reset to "no capabilities, UNKNOWN mode, zero NUMA
 * ranges". The current mode is then read via amdsmi_get_gpu_memory_partition
 * and mapped to the enum when it is one of "NPS1"/"NPS2"/"NPS4"/"NPS8". The
 * capability flags are filled from rsmi_dev_memory_partition_capabilities_get
 * when that query succeeds; its failure is not reported to the caller.
 *
 * @param processor_handle  GPU to query.
 * @param config            Output; must not be null.
 * @return AMDSMI_STATUS_INVAL for a null output pointer, otherwise the status
 *         of the current-partition query.
 */
amdsmi_status_t
amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle,
                                       amdsmi_memory_partition_config_t *config) {
    AMDSMI_CHECK_INIT();
    std::ostringstream ss;

    if (config == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // initialization for devices which do not support partitions
    // (value-initialize so the bits not named below are zero as well)
    amdsmi_nps_caps_t flags = {};
    flags.nps_flags.nps1_cap = 0;
    flags.nps_flags.nps2_cap = 0;
    flags.nps_flags.nps4_cap = 0;
    flags.nps_flags.nps8_cap = 0;
    config->partition_caps = flags;
    config->mp_mode = AMDSMI_MEMORY_PARTITION_UNKNOWN;

    // TODO(amdsmi_team): Will BM/guest VMs have numa ranges?
    config->num_numa_ranges = 0;

    // current memory partition
    // The buffer is zeroed up front and re-terminated after the call because
    // it is logged and converted to std::string before the status is checked;
    // a failed query must not leave indeterminate bytes to be read.
    constexpr uint32_t kCurrentPartitionSize = 5;
    char current_mem_partition[kCurrentPartitionSize] = {'\0'};
    amdsmi_status_t status = amdsmi_get_gpu_memory_partition(processor_handle,
                                    current_mem_partition, kCurrentPartitionSize);
    current_mem_partition[kCurrentPartitionSize - 1] = '\0';

    ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_memory_partition() current_partition = |"
       << current_mem_partition << "|";
    LOG_DEBUG(ss);

    const std::string current_mem_partition_str = current_mem_partition;
    if (status == AMDSMI_STATUS_SUCCESS) {
        if (current_mem_partition_str == "NPS1") {
            config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS1;
        } else if (current_mem_partition_str == "NPS2") {
            config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS2;
        } else if (current_mem_partition_str == "NPS4") {
            config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS4;
        } else if (current_mem_partition_str == "NPS8") {
            config->mp_mode = AMDSMI_MEMORY_PARTITION_NPS8;
        }
    }

    // Add memory partition capabilities here
    // Same zero-init/terminate treatment: the buffer is logged before the
    // status of the capabilities query is examined.
    constexpr uint32_t kLenCapsSize = 30;
    char memory_caps[kLenCapsSize] = {'\0'};
    auto status_mem_caps = rsmi_wrapper(rsmi_dev_memory_partition_capabilities_get,
                                        processor_handle, 0,
                                        memory_caps, kLenCapsSize);
    memory_caps[kLenCapsSize - 1] = '\0';

    // Log the status of the capabilities query itself (not the status of the
    // earlier current-partition query).
    ss << __PRETTY_FUNCTION__
       << " | rsmi_dev_memory_partition_capabilities_get Returning: "
       << smi_amdgpu_get_status_string(status_mem_caps, false)
       << " | Type: memory_partition_capabilities"
       << " | Data: " << memory_caps;
    LOG_DEBUG(ss);

    if (status_mem_caps == AMDSMI_STATUS_SUCCESS) {  // older kernels may not support this
        const std::string memory_caps_str(memory_caps);
        if (memory_caps_str.find("NPS1") != std::string::npos) {
            flags.nps_flags.nps1_cap = 1;
        }
        if (memory_caps_str.find("NPS2") != std::string::npos) {
            flags.nps_flags.nps2_cap = 1;
        }
        if (memory_caps_str.find("NPS4") != std::string::npos) {
            flags.nps_flags.nps4_cap = 1;
        }
        if (memory_caps_str.find("NPS8") != std::string::npos) {
            flags.nps_flags.nps8_cap = 1;
        }
    }
    config->partition_caps = flags;

    return status;
}
/**
 * Request a new memory partition mode for a GPU.
 *
 * Delegates to amdsmi_set_gpu_memory_partition with the same arguments and
 * returns its status.
 *
 * @param processor_handle  GPU to configure.
 * @param mode              Partition mode to request.
 */
amdsmi_status_t
amdsmi_set_gpu_memory_partition_mode(amdsmi_processor_handle processor_handle,
                                     amdsmi_memory_partition_type_t mode) {
    AMDSMI_CHECK_INIT();

    const amdsmi_status_t set_status =
        amdsmi_set_gpu_memory_partition(processor_handle, mode);
    return set_status;
}
// Accelerator Partition functions
/**
 * @brief Collects every accelerator (compute) partition profile a device
 *        reports, together with the memory partition (NPS) capabilities and
 *        the resource profiles queried for each of them.
 *
 * The list of profile names is read with
 * rsmi_dev_compute_partition_supported_xcp_configs_get(); when that call fails
 * rsmi_dev_compute_partition_capabilities_get() is tried instead. One entry is
 * appended to profile_config->profiles for each of SPX, DPX, TPX, QPX and CPX
 * found in that list (always in this order). For every entry the function then
 * calls rsmi_dev_compute_partition_xcp_config_set(), reads the supported NPS
 * modes and queries one resource profile per RSMI resource type.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] profile_config    Receives the result; must not be nullptr.
 *
 * @return AMDSMI_STATUS_NO_PERM when amd::smi::is_sudo_user() is false,
 *         AMDSMI_STATUS_INVAL when profile_config is nullptr, the failing
 *         status when neither capability query succeeds, otherwise the status
 *         of the capability query that succeeded. Failures of the per-profile
 *         queries are only logged and do not change the return value.
 */
amdsmi_status_t
amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle processor_handle,
                              amdsmi_accelerator_partition_profile_config_t *profile_config) {
    AMDSMI_CHECK_INIT();

    if (!amd::smi::is_sudo_user()) {
        return AMDSMI_STATUS_NO_PERM;
    }

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__
       << " | START ";
    LOG_DEBUG(ss);

    if (profile_config == nullptr) {
        ss << __PRETTY_FUNCTION__ << " | profile_config is nullptr" << std::endl;
        LOG_ERROR(ss);
        return AMDSMI_STATUS_INVAL;
    }

    // Initialize values
    amdsmi_status_t return_status = AMDSMI_STATUS_NOT_SUPPORTED;
    amdsmi_status_t status = AMDSMI_STATUS_NOT_SUPPORTED;
    profile_config->default_profile_index = 0;
    profile_config->num_profiles = 0;
    profile_config->num_resource_profiles = 0;
    // Only the first resource profile slot is reset here; later slots are
    // written as the resource queries below succeed.
    profile_config->resource_profiles->profile_index = 0;
    profile_config->resource_profiles->resource_type = AMDSMI_ACCELERATOR_MAX;
    profile_config->resource_profiles->partition_resource = 0;
    profile_config->resource_profiles->num_partitions_share_resource = 0;

    // Template with every NPS capability cleared; copied into each new profile.
    amdsmi_nps_caps_t flags;
    flags.nps_flags.nps1_cap = 0;
    flags.nps_flags.nps2_cap = 0;
    flags.nps_flags.nps4_cap = 0;
    flags.nps_flags.nps8_cap = 0;

    // Renders NPS capability flags as "NPS1, NPS4" style text ("N/A" if none).
    const auto nps_caps_to_string = [](const amdsmi_nps_caps_t &caps) {
        std::string text;
        const auto add = [&text](const char *name) {
            if (!text.empty()) {
                text += ", ";
            }
            text += name;
        };
        if (caps.nps_flags.nps1_cap != 0) { add("NPS1"); }
        if (caps.nps_flags.nps2_cap != 0) { add("NPS2"); }
        if (caps.nps_flags.nps4_cap != 0) { add("NPS4"); }
        if (caps.nps_flags.nps8_cap != 0) { add("NPS8"); }
        return text.empty() ? std::string("N/A") : text;
    };

    // Appends one profile entry; its profile_index is its position in the array.
    const auto append_profile = [profile_config, &flags](auto type, uint32_t num_partitions) {
        auto &entry = profile_config->profiles[profile_config->num_profiles];
        entry.profile_type = type;
        entry.num_partitions = num_partitions;
        entry.profile_index = profile_config->num_profiles;
        // default all memory partition caps to 0
        entry.memory_caps = flags;
        profile_config->num_profiles++;
    };

    ss << __PRETTY_FUNCTION__
       << " | 1";
    LOG_DEBUG(ss);

    // get supported xcp_configs (this will tell us # of profiles/indexes)
    // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs
    // otherwise fall back to use /sys/class/drm/../device/available_compute_partition
    // ex. SPX, DPX, QPX, CPX
    std::string accelerator_caps_str = "N/A";
    constexpr uint32_t kLenXCPConfigSize = 30;
    char supported_xcp_configs[kLenXCPConfigSize];
    bool use_xcp_config = false;
    return_status
        = rsmi_wrapper(rsmi_dev_compute_partition_supported_xcp_configs_get, processor_handle, 0,
                       supported_xcp_configs, kLenXCPConfigSize);
    if (return_status == AMDSMI_STATUS_SUCCESS) {
        accelerator_caps_str = std::string(supported_xcp_configs);
        accelerator_caps_str = amd::smi::trimAllWhiteSpace(accelerator_caps_str);
        use_xcp_config = true;
    } else {  // initialize what we can
        ss << __PRETTY_FUNCTION__
           << "\n | rsmi_dev_compute_partition_supported_xcp_configs_get()"
           << " returned: " << smi_amdgpu_get_status_string(return_status, false)
           << "\n | Defaulting to use rsmi_dev_compute_partition_capabilities_get";
        LOG_DEBUG(ss);
        return_status = rsmi_wrapper(rsmi_dev_compute_partition_capabilities_get,
                                     processor_handle, 0,
                                     supported_xcp_configs, kLenXCPConfigSize);
        if (return_status == AMDSMI_STATUS_SUCCESS) {
            accelerator_caps_str = std::string(supported_xcp_configs);
            accelerator_caps_str = amd::smi::trimAllWhiteSpace(accelerator_caps_str);
        } else {
            ss << __PRETTY_FUNCTION__
               << "\n | rsmi_dev_compute_partition_capabilities_get() failed, "
               << "likely due to feature not supported"
               << "\n | Returning: " << smi_amdgpu_get_status_string(return_status, false);
            LOG_DEBUG(ss);
            return return_status;
        }
    }
    ss << __PRETTY_FUNCTION__
       << (use_xcp_config ? "\n | Used rsmi_dev_compute_partition_supported_xcp_configs_get()" :
                            "\n | Used rsmi_dev_compute_partition_capabilities_get()")
       << "\n | Returning: " << smi_amdgpu_get_status_string(return_status, false)
       << "\n | Type: "
       << (use_xcp_config ? amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs):
                            amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition))
       << "\n | Data: " << accelerator_caps_str;
    LOG_DEBUG(ss);

    // One profile per partition name present in the capability list.
    const auto caps_list_has = [&accelerator_caps_str](const char *name) {
        return accelerator_caps_str.find(name) != std::string::npos;
    };
    if (caps_list_has("SPX")) {
        append_profile(AMDSMI_ACCELERATOR_PARTITION_SPX, 1U);
    }
    if (caps_list_has("DPX")) {
        append_profile(AMDSMI_ACCELERATOR_PARTITION_DPX, 2U);
    }
    if (caps_list_has("TPX")) {
        append_profile(AMDSMI_ACCELERATOR_PARTITION_TPX, 3U);
    }
    if (caps_list_has("QPX")) {
        append_profile(AMDSMI_ACCELERATOR_PARTITION_QPX, 4U);
    }
    if (caps_list_has("CPX")) {
        // Note: # of XCDs is max # of partitions CPX supports
        uint16_t tmp_xcd_count = 0;
        status = rsmi_wrapper(rsmi_dev_metrics_xcd_counter_get,
                              processor_handle, 0, &tmp_xcd_count);
        // default to 0 partitions when the XCD count cannot be read
        const uint32_t cpx_partitions =
            (status == AMDSMI_STATUS_SUCCESS) ? static_cast<uint32_t>(tmp_xcd_count) : 0U;
        append_profile(AMDSMI_ACCELERATOR_PARTITION_CPX, cpx_partitions);
    }

    ss << __PRETTY_FUNCTION__
       << " | 2";
    LOG_DEBUG(ss);

    auto resource_index = 0;  // next slot to write in profile_config->resource_profiles
    // get resource info for each profile
    for (auto i = 0U; i < profile_config->num_profiles; i++) {
        profile_config->profiles[i].num_resources = 0;  // start at 0 resources and increment

        auto it = partition_types_map.find(profile_config->profiles[i].profile_type);
        std::string partition_type_str = "UNKNOWN";
        if (it != partition_types_map.end()) {
            partition_type_str = it->second;
        }

        auto rsmi_type_it = accelerator_to_RSMI.find(profile_config->profiles[i].profile_type);
        rsmi_compute_partition_type_t rsmi_partition_type = RSMI_COMPUTE_PARTITION_INVALID;
        if (rsmi_type_it == accelerator_to_RSMI.end()) {
            ss << __PRETTY_FUNCTION__ << " | reached end of map\n";
            LOG_DEBUG(ss);
            continue;
        }
        rsmi_partition_type = rsmi_type_it->second;

        // NOTE(review): presumably this selects the profile that the NPS and
        // resource queries below describe - confirm against the rocm_smi docs.
        status = rsmi_wrapper(rsmi_dev_compute_partition_xcp_config_set, processor_handle, 0,
                              rsmi_partition_type);
        ss << __PRETTY_FUNCTION__
           << "\n | profile_num: " << i
           << "\n | profile_type: " << partition_type_str
           << "\n | rsmi_dev_compute_partition_xcp_config_set(" << partition_type_str
           << ") Returning: "
           << smi_amdgpu_get_status_string(status, false)
           << "\n | Type: "
           << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs)
           << "\n | Data: " << "N/A";
        LOG_DEBUG(ss);

        // 1) get memory caps for each profile
        constexpr uint32_t kLenNPSConfigSize = 30;
        char supported_nps_configs[kLenNPSConfigSize];
        std::string supported_nps_caps_str = "N/A";
        status = rsmi_wrapper(rsmi_dev_compute_partition_supported_nps_configs_get,
                              processor_handle, 0,
                              supported_nps_configs, kLenNPSConfigSize);
        if (status == AMDSMI_STATUS_SUCCESS) {
            supported_nps_caps_str = std::string(supported_nps_configs);
        }
        auto &nps = profile_config->profiles[i].memory_caps.nps_flags;
        if (supported_nps_caps_str.find("NPS1") != std::string::npos) {
            nps.nps1_cap = 1;
        }
        if (supported_nps_caps_str.find("NPS2") != std::string::npos) {
            nps.nps2_cap = 1;
        }
        if (supported_nps_caps_str.find("NPS4") != std::string::npos) {
            nps.nps4_cap = 1;
        }
        if (supported_nps_caps_str.find("NPS8") != std::string::npos) {
            nps.nps8_cap = 1;
        }

        // 2) get resource profiles
        for (auto r = static_cast<int>(RSMI_ACCELERATOR_XCC);
             r < static_cast<int>(RSMI_ACCELERATOR_MAX); r++) {
            rsmi_accelerator_partition_resource_type_t type
                = static_cast<rsmi_accelerator_partition_resource_type_t>(r);
            // Value-initialized because its fields are logged below even when
            // the query fails and leaves the struct untouched.
            rsmi_accelerator_partition_resource_profile_t profile = {};
            status = rsmi_wrapper(
                rsmi_dev_compute_partition_resource_profile_get, processor_handle, 0,
                &type, &profile);

            // Printable name of the queried resource type, shared by all logs below.
            auto res_type_it = resource_types_map.find(
                static_cast<amdsmi_accelerator_partition_resource_type_t>(type));
            std::string resource_type_str = "UNKNOWN";
            if (res_type_it != resource_types_map.end()) {
                resource_type_str = res_type_it->second;
            }

            if (status == AMDSMI_STATUS_SUCCESS) {
                // The counter only advances while it stays below RSMI_ACCELERATOR_MAX.
                uint32_t inc_res_profile =
                    profile_config->num_resource_profiles + 1;
                if (inc_res_profile < static_cast<uint32_t>(RSMI_ACCELERATOR_MAX)) {
                    profile_config->num_resource_profiles = inc_res_profile;
                }

                auto &slot = profile_config->resource_profiles[resource_index];
                slot.profile_index = i;
                slot.resource_type
                    = static_cast<amdsmi_accelerator_partition_resource_type_t>(type);
                slot.partition_resource = profile.partition_resource;
                slot.num_partitions_share_resource = profile.num_partitions_share_resource;

                ss << __PRETTY_FUNCTION__ << " | profile_debug 1 "
                   << "\n profile type: " << partition_type_str
                   << "\n resource_index: " << resource_index
                   << "\n profile_index: " << i
                   << "\n resource_type: " << resource_type_str
                   << "\n partition_resource: " << profile.partition_resource
                   << "\n num_partitions_share_resource: " << profile.num_partitions_share_resource
                   << std::endl;
                LOG_DEBUG(ss);
                resource_index += 1;

                uint32_t inc_resources =
                    profile_config->profiles[i].num_resources + 1;
                if (inc_resources < static_cast<uint32_t>(RSMI_ACCELERATOR_MAX)) {
                    profile_config->profiles[i].num_resources = inc_resources;
                }
                ss << __PRETTY_FUNCTION__ << " | profile_debug 2 "
                   << "\n profile_config->profiles[i].num_resources: "
                   << profile_config->profiles[i].num_resources
                   << std::endl;
                LOG_DEBUG(ss);
            }

            // Index of the most recently written resource slot (0 if none yet).
            auto current_resource_idx = (resource_index >= 1) ? resource_index - 1 : 0;
            const std::string nps_caps
                = nps_caps_to_string(profile_config->profiles[i].memory_caps);
            ss << __PRETTY_FUNCTION__
               << " | Detailed output"
               << "\n | profile_config->num_profiles: " << profile_config->num_profiles
               << "\n | profile_num (i): " << i
               << "\n | resource_num (r): " << r
               << "\n | current_resource_idx: " << current_resource_idx
               << "\n | profile_config->resource_profiles[current_resource_idx].profile_index: "
               << profile_config->resource_profiles[current_resource_idx].profile_index
               << "\n | profile_config->profiles[i].memory_caps: "
               << nps_caps
               << "\n***********************************************"
               << "\n | profile_config->profiles[i].num_resources: "
               << profile_config->profiles[i].num_resources
               << "\n***********************************************"
               << "\n | profile_type: " << partition_type_str
               << "\n | resource_type: " << resource_type_str
               << "\n | partition_resource: " << profile.partition_resource
               << "\n | num_partitions_share_resource: "
               << profile.num_partitions_share_resource
               << "\n | profile_config->num_resource_profiles: "
               << profile_config->num_resource_profiles
               << "\n | rsmi_dev_compute_partition_resource_profile_get("
               << resource_type_str << ") Returning: "
               << smi_amdgpu_get_status_string(status, false)
               << "\n | Type: "
               << amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs)
               << "\n";
            LOG_DEBUG(ss);
        }  // END resources loop
    }  // END profile loop

    // Debug dump of everything that was gathered; does not modify profile_config.
    int res_ind = 0;
    for (uint32_t i = 0; i < profile_config->num_profiles; i++) {
        const auto &current_profile = profile_config->profiles[i];
        std::string profile_type_str = "N/A";
        if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_SPX) {
            profile_type_str = "SPX";
        } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_DPX) {
            profile_type_str = "DPX";
        } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_TPX) {
            profile_type_str = "TPX";
        } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_QPX) {
            profile_type_str = "QPX";
        } else if (current_profile.profile_type == AMDSMI_ACCELERATOR_PARTITION_CPX) {
            profile_type_str = "CPX";
        }
        const std::string nps_caps_str = nps_caps_to_string(current_profile.memory_caps);
        ss << __PRETTY_FUNCTION__ << " | profile_debug; after compiling info p1 "
           << "\n\t**profile_config.profiles[" << i << "]:\n"
           << "\t\tprofile_type: " << profile_type_str
           << "\n\t\tnum_partitions: " << current_profile.num_partitions
           << "\n\t\tmemory_caps: " << nps_caps_str
           << "\n\t\tcurrent_profile.num_resources: " << current_profile.num_resources
           << std::endl;
        LOG_DEBUG(ss);

        for (uint32_t j = 0; j < current_profile.num_resources; j++) {
            const auto &rp = profile_config->resource_profiles[res_ind];
            auto it2 = resource_types_map.find(rp.resource_type);
            std::string resource_type_str = "UNKNOWN";
            if (it2 != resource_types_map.end()) {
                resource_type_str = it2->second;
            }
            ss << __PRETTY_FUNCTION__ << " | profile_debug; after compiling info p2 "
               << "\n\t\t\tprofile_index: " << current_profile.profile_index
               << "\n\t\t\tres_ind: " << res_ind
               << "\n\t\t\tprofile_config.resource_profiles[" << res_ind
               << "].resource_type: "
               << resource_type_str
               << "\n\t\t\tprofile_config.resource_profiles[" << res_ind
               << "].partition_resource: "
               << rp.partition_resource
               << "\n\t\t\tprofile_config.resource_profiles[" << res_ind
               << "].num_partitions_share_resource: "
               << rp.num_partitions_share_resource
               << std::endl;
            LOG_DEBUG(ss);
            res_ind++;
        }
    }

    ss << __PRETTY_FUNCTION__
       << " | END returning " << smi_amdgpu_get_status_string(return_status, false);
    LOG_INFO(ss);
    return return_status;
}
/**
 * @brief Reports the accelerator (compute) partition profile a device is
 *        currently using, plus its partition id and memory partition (NPS)
 *        capabilities.
 *
 * All outputs are first set to "not supported" defaults (num_partitions and
 * profile_index to the uint32_t maximum, profile_type to INVALID,
 * partition_id[0] to 0, no NPS capability). They are then overwritten with
 * whatever the individual queries manage to read.
 *
 * profile_type and profile_index are only filled in when
 * rsmi_dev_compute_partition_supported_xcp_configs_get() succeeds;
 * profile_index is the position of the current partition name within that
 * comma separated list.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] profile           Receives the current profile; must not be nullptr.
 * @param[out] partition_id      Receives the partition id at index 0; must not
 *                               be nullptr. NOTE(review): kMaxPartitions (8)
 *                               elements are read from it for logging, so the
 *                               caller is presumably expected to pass an array
 *                               of at least that size - confirm with the API
 *                               header.
 *
 * @return AMDSMI_STATUS_INVAL when an output pointer is nullptr, otherwise the
 *         status of amdsmi_get_gpu_compute_partition(). The other queries never
 *         influence the return value.
 */
amdsmi_status_t
amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle,
                                             amdsmi_accelerator_partition_profile_t *profile,
                                             uint32_t *partition_id) {
    AMDSMI_CHECK_INIT();

    if (profile == nullptr || partition_id == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    std::ostringstream ss;

    // initialization for devices which do not support partitions
    profile->num_partitions = std::numeric_limits<uint32_t>::max();
    profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_INVALID;
    partition_id[0] = 0;
    profile->profile_index = std::numeric_limits<uint32_t>::max();
    profile->num_resources = 0;
    amdsmi_nps_caps_t flags;
    flags.nps_flags.nps1_cap = 0;
    flags.nps_flags.nps2_cap = 0;
    flags.nps_flags.nps4_cap = 0;
    flags.nps_flags.nps8_cap = 0;
    profile->memory_caps = flags;
    // TODO(amdsmi_team): add resources here ^

    uint32_t tmp_partition_id = 0;
    amdsmi_status_t status = AMDSMI_STATUS_NOT_SUPPORTED;

    // TODO(amdsmi_team): should we do fallback?
    // Info doesn't populate properly if missing other files - CLI FIX?
    // Reason: older kernels do not support xcp_configs
    // get supported xcp_configs (this will tell us # of profiles/indexes)
    // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs
    // otherwise fall back to use /sys/class/drm/../device/available_compute_partition
    // ex. SPX, DPX, QPX, CPX
    // Depending on what is available, we can determine the profile index
    // ex. SPX = 0, DPX = 1, QPX = 2, CPX = 3; other devices may have different values
    std::string accelerator_capabilities = "N/A";
    constexpr uint32_t kLenXCPConfigSize = 30;
    char supported_xcp_configs[kLenXCPConfigSize] = {0};
    bool use_xcp_config = false;
    status
        = rsmi_wrapper(rsmi_dev_compute_partition_supported_xcp_configs_get, processor_handle, 0,
                       supported_xcp_configs, kLenXCPConfigSize);
    if (status == AMDSMI_STATUS_SUCCESS) {
        accelerator_capabilities = std::string(supported_xcp_configs);
        use_xcp_config = true;
    }
    ss << __PRETTY_FUNCTION__
       << (use_xcp_config ? "\n | Used rsmi_dev_compute_partition_supported_xcp_configs_get()" :
                            "\n | Used rsmi_dev_compute_partition_capabilities_get()")
       << "\n | Returned: " << smi_amdgpu_get_status_string(status, false)
       << "\n | Type: "
       << (use_xcp_config ? amd::smi::Device::get_type_string(amd::smi::kDevSupportedXcpConfigs):
                            amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition))
       << "\n | Data: " << accelerator_capabilities;
    LOG_DEBUG(ss);

    // Split the capability list on commas; a token's position is its profile index.
    char delimiter = ',';
    std::stringstream ss_obj(accelerator_capabilities);
    std::string temp;
    std::vector<std::string> tokens;
    while (getline(ss_obj, temp, delimiter)) {
        temp = amd::smi::trimAllWhiteSpace(temp);
        tokens.push_back(temp);
    }

    // hold all current available compute partition values within tokens vector
    std::ostringstream ss_1;
    std::copy(std::begin(tokens),
              std::end(tokens),
              amd::smi::make_ostream_joiner(&ss_1, ", "));

    constexpr uint32_t kCurrentPartitionSize = 16;
    char current_partition[kCurrentPartitionSize] = {0};
    std::string current_partition_str = "N/A";
    amdsmi_status_t compute_status = amdsmi_get_gpu_compute_partition(processor_handle,
                                        current_partition, kCurrentPartitionSize);
    ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_compute_partition() current_partition = |"
       << current_partition << "|";
    LOG_DEBUG(ss);
    current_partition_str = current_partition;

    // `status` still holds the result of the supported_xcp_configs query here.
    if (status == AMDSMI_STATUS_SUCCESS) {
        // 1) get profile index from
        // /sys/class/drm/../device/compute_partition_config/supported_xcp_configs
        if (current_partition_str == "SPX" || current_partition_str == "DPX"
            || current_partition_str == "TPX" || current_partition_str == "QPX"
            || current_partition_str == "CPX") {
            // get index according to supported_xcp_configs, separated by commas
            if (accelerator_capabilities.find(current_partition_str) != std::string::npos) {
                auto it = std::find(tokens.begin(), tokens.end(), current_partition_str);
                if (it != tokens.end()) {
                    profile->profile_index = static_cast<uint32_t>(std::distance(
                        tokens.begin(), it));
                }
            }
        }
        // 2) get profile type from /sys/class/drm/../device/current_compute_partition
        if (current_partition_str == "SPX") {
            profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_SPX;
        } else if (current_partition_str == "DPX") {
            profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_DPX;
        } else if (current_partition_str == "TPX") {
            profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_TPX;
        } else if (current_partition_str == "QPX") {
            profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_QPX;
        } else if (current_partition_str == "CPX") {
            profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_CPX;
        } else {
            profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_INVALID;
        }
    } else {
        profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_INVALID;
        current_partition_str = "N/A";
    }

    // The uint16_t maximum in num_partition is treated as "no value reported".
    amdsmi_gpu_metrics_t metric_info = {};
    status = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info);
    if (status == AMDSMI_STATUS_SUCCESS
        && metric_info.num_partition != std::numeric_limits<uint16_t>::max()) {
        profile->num_partitions = metric_info.num_partition;
    }

    status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, 0,
                          &tmp_partition_id);
    const uint32_t partition_num = 0;  // Each partition should show its respective
                                       // partition_id at position 0 of the array.
                                       // We are no longer populating only the primary partition
                                       // for BM/Guest.
    if (status == AMDSMI_STATUS_SUCCESS) {
        partition_id[partition_num] = tmp_partition_id;
    }

    // Snapshot of the caller's id array, used only for the log output.
    std::ostringstream ss_2;
    const uint32_t kMaxPartitions = 8;
    uint32_t copy_partition_ids[kMaxPartitions] = {0};  // initialize all to 0s
    std::copy(partition_id, partition_id + kMaxPartitions, copy_partition_ids);
    std::copy(std::begin(copy_partition_ids),
              std::end(copy_partition_ids),
              amd::smi::make_ostream_joiner(&ss_2, ", "));

    auto it_profile_type = partition_types_map.find(profile->profile_type);
    std::string partition_type_str = "N/A";
    if (it_profile_type != partition_types_map.end()) {
        partition_type_str = it_profile_type->second;
    }
    ss << __PRETTY_FUNCTION__
       << " | Num_partitions: " << profile->num_partitions
       << "; profile->profile_type: " << profile->profile_type << " (" << partition_type_str << ")"
       << "; partition_id: " << ss_2.str() << "\n";
    LOG_DEBUG(ss);

    // Add memory partition capabilities here
    constexpr uint32_t kLenCapsSize = 30;
    // Zero-initialized: the buffer is streamed to the log below before the
    // status is checked, so it must hold a valid C string even on failure.
    char memory_caps[kLenCapsSize] = {0};
    status = rsmi_wrapper(rsmi_dev_memory_partition_capabilities_get, processor_handle, 0,
                          memory_caps, kLenCapsSize);
    ss << __PRETTY_FUNCTION__
       << " | rsmi_dev_memory_partition_capabilities_get Returning: "
       << smi_amdgpu_get_status_string(status, false)
       << " | Type: memory_partition_capabilities"
       << " | Data: " << memory_caps;
    LOG_DEBUG(ss);
    std::string memory_caps_str = "N/A";
    if (status == AMDSMI_STATUS_SUCCESS) {
        memory_caps_str = std::string(memory_caps);
        if (memory_caps_str.find("NPS1") != std::string::npos) {
            flags.nps_flags.nps1_cap = 1;
        }
        if (memory_caps_str.find("NPS2") != std::string::npos) {
            flags.nps_flags.nps2_cap = 1;
        }
        if (memory_caps_str.find("NPS4") != std::string::npos) {
            flags.nps_flags.nps4_cap = 1;
        }
        if (memory_caps_str.find("NPS8") != std::string::npos) {
            flags.nps_flags.nps8_cap = 1;
        }
    }
    profile->memory_caps = flags;

    ss << __PRETTY_FUNCTION__
       << " | END returning " << smi_amdgpu_get_status_string(compute_status, false) << "\n"
       << " | accelerator_capabilities: " << accelerator_capabilities << "\n"
       << " | current_partition_str: " << current_partition_str << "\n"
       << " | std::vector<std::string> tokens: " << ss_1.str() << "\n"
       << " | profile->num_partitions: " << profile->num_partitions << "\n"
       << " | profile->profile_type: " << partition_type_str << "\n"
       << " | profile->profile_index: " << profile->profile_index << "\n"
       << " | profile->num_resources: " << profile->num_resources << "\n"
       << " | profile->memory_caps: " << "\n"
       << " | nps1_cap: " << profile->memory_caps.nps_flags.nps1_cap << "\n"
       << " | nps2_cap: " << profile->memory_caps.nps_flags.nps2_cap << "\n"
       << " | nps4_cap: " << profile->memory_caps.nps_flags.nps4_cap << "\n"
       << " | nps8_cap: " << profile->memory_caps.nps_flags.nps8_cap << "\n"
       << " | partition_id: " << ss_2.str();
    LOG_INFO(ss);

    return compute_status;  // only return status from amdsmi_get_gpu_compute_partition
                            // as this is the only function that can fail
                            // if the device does not support partitions
}
/**
 * @brief Switches a device to the accelerator partition profile identified by
 *        profile_index.
 *
 * The profile configuration is fetched with
 * amdsmi_get_gpu_accelerator_partition_profile_config(); the partition type
 * whose profile_index matches the request is then applied through
 * amdsmi_set_gpu_compute_partition().
 *
 * @param[in] processor_handle  Device to reconfigure.
 * @param[in] profile_index     Index of the requested profile; must be smaller
 *                              than the number of profiles the device reports.
 *
 * @return The failing status of the configuration query, AMDSMI_STATUS_INVAL
 *         when profile_index does not name a reported profile, otherwise the
 *         status of amdsmi_set_gpu_compute_partition().
 */
amdsmi_status_t
amdsmi_set_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle,
                                             uint32_t profile_index) {
    AMDSMI_CHECK_INIT();

    std::ostringstream ss;
    amdsmi_accelerator_partition_profile_config_t config;
    amdsmi_status_t status = amdsmi_get_gpu_accelerator_partition_profile_config(
        processor_handle, &config);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    // Plain trace of the request; printing num_profiles rather than
    // "num_profiles - 1" avoids an unsigned wrap-around when there are none.
    ss << __PRETTY_FUNCTION__ << " | Requested profile_index: " << profile_index
       << "\n| config.num_profiles: " << config.num_profiles;
    LOG_DEBUG(ss);

    if (profile_index >= config.num_profiles) {
        ss << __PRETTY_FUNCTION__ << " | Invalid profile_index: " << profile_index
           << "\n| config.num_profiles: " << config.num_profiles
           << "\n| Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_INVAL, false);
        LOG_DEBUG(ss);
        return AMDSMI_STATUS_INVAL;
    }

    // profile_index -> partition type, as reported by the configuration.
    std::map<uint32_t, amdsmi_accelerator_partition_type_t> mp_prof_indx_to_accel_type;
    for (uint32_t i = 0; i < config.num_profiles; i++) {
        auto it = partition_types_map.find(config.profiles[i].profile_type);
        std::string partition_type_str = "N/A";
        if (it != partition_types_map.end()) {
            partition_type_str = it->second;
        }
        ss << __PRETTY_FUNCTION__ << " | "
           << "config.profiles[" << i << "].profile_type: "
           << static_cast<int>(config.profiles[i].profile_type) << "\n"
           << "| config.profiles[" << i << "].profile_type (str): "
           << partition_type_str << "\n"
           << "| config.profiles[" << i << "].profile_index: "
           << static_cast<int>(config.profiles[i].profile_index)
           << "\n";
        LOG_DEBUG(ss);
        mp_prof_indx_to_accel_type[config.profiles[i].profile_index]
            = config.profiles[i].profile_type;
    }

    // Look the request up without operator[], which would silently insert a
    // default entry for an index no profile reported.
    const auto requested = mp_prof_indx_to_accel_type.find(profile_index);
    if (requested == mp_prof_indx_to_accel_type.end()) {
        ss << __PRETTY_FUNCTION__ << " | No profile reports profile_index: " << profile_index
           << "\n| Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_INVAL, false);
        LOG_DEBUG(ss);
        return AMDSMI_STATUS_INVAL;
    }
    const amdsmi_accelerator_partition_type_t requested_type = requested->second;

    auto return_status = amdsmi_set_gpu_compute_partition(processor_handle,
        static_cast<amdsmi_compute_partition_type_t>(requested_type));

    // find() instead of at(): a missing name must not throw out of this API
    // after the partition change has already been attempted.
    auto name_it = partition_types_map.find(requested_type);
    std::string requested_type_str = "N/A";
    if (name_it != partition_types_map.end()) {
        requested_type_str = name_it->second;
    }
    ss << __PRETTY_FUNCTION__ << " | User requested profile_index: " << profile_index
       << "\n| Accelerator Type: "
       << requested_type_str
       << "\n| Returning: " << smi_amdgpu_get_status_string(return_status, false);
    LOG_INFO(ss);
    return return_status;
}
// TODO(bliu) : other xgmi related information
/**
 * @brief Retrieves XGMI information for a device.
 *
 * Only info->xgmi_hive_id is written, using rsmi_dev_xgmi_hive_id_get().
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] info              Destination structure; must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when info is nullptr, otherwise the status
 *         produced by rsmi_wrapper().
 */
amdsmi_status_t
amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info) {
    AMDSMI_CHECK_INIT();

    if (!info) {
        return AMDSMI_STATUS_INVAL;
    }

    auto *hive_id_out = &(info->xgmi_hive_id);
    return rsmi_wrapper(rsmi_dev_xgmi_hive_id_get, processor_handle, 0, hive_id_out);
}
/**
 * @brief Reads the XGMI error status of a device.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] status            Receives the XGMI status; handed to
 *                               rsmi_dev_xgmi_error_status() reinterpreted as
 *                               rsmi_xgmi_status_t*, without a null check here.
 *
 * @return The status produced by rsmi_wrapper().
 */
amdsmi_status_t
amdsmi_gpu_xgmi_error_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_status_t *status) {
    // Same up-front initialization check as the other public entry points.
    AMDSMI_CHECK_INIT();

    return rsmi_wrapper(rsmi_dev_xgmi_error_status, processor_handle, 0,
                        reinterpret_cast<rsmi_xgmi_status_t*>(status));
}
/**
 * @brief Resets the XGMI error status of a device.
 *
 * @param[in] processor_handle  Device whose XGMI error state is reset.
 *
 * @return The status produced by rsmi_wrapper() for rsmi_dev_xgmi_error_reset().
 */
amdsmi_status_t
amdsmi_reset_gpu_xgmi_error(amdsmi_processor_handle processor_handle) {
    // Same up-front initialization check as the other public entry points.
    AMDSMI_CHECK_INIT();

    return rsmi_wrapper(rsmi_dev_xgmi_error_reset, processor_handle, 0);
}
/**
 * @brief Retrieves information about processes using compute devices.
 *
 * @param[out]    procs      Process records; passed to
 *                           rsmi_compute_process_info_get() reinterpreted as
 *                           rsmi_process_info_t* and not null-checked here.
 * @param[in,out] num_items  Item count exchanged with
 *                           rsmi_compute_process_info_get(); must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when num_items is nullptr, otherwise the RSMI
 *         result converted with amd::smi::rsmi_to_amdsmi_status().
 */
amdsmi_status_t
amdsmi_get_gpu_compute_process_info(amdsmi_process_info_t *procs, uint32_t *num_items) {
    AMDSMI_CHECK_INIT();

    if (!num_items) {
        return AMDSMI_STATUS_INVAL;
    }

    auto *rsmi_procs = reinterpret_cast<rsmi_process_info_t*>(procs);
    const auto rsmi_result = rsmi_compute_process_info_get(rsmi_procs, num_items);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
/**
 * @brief Retrieves compute process information for a single process id.
 *
 * @param[in]  pid   Process id to look up.
 * @param[out] proc  Receives the process record; must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when proc is nullptr, otherwise the result of
 *         rsmi_compute_process_info_by_pid_get() converted with
 *         amd::smi::rsmi_to_amdsmi_status().
 */
amdsmi_status_t amdsmi_get_gpu_compute_process_info_by_pid(uint32_t pid,
                                                           amdsmi_process_info_t *proc) {
    AMDSMI_CHECK_INIT();

    if (!proc) {
        return AMDSMI_STATUS_INVAL;
    }

    auto *rsmi_proc = reinterpret_cast<rsmi_process_info_t*>(proc);
    const auto rsmi_result = rsmi_compute_process_info_by_pid_get(pid, rsmi_proc);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
/**
 * @brief Retrieves the device indices used by a process.
 *
 * @param[in]     pid          Process id to look up.
 * @param[out]    dv_indices   Receives the device indices; must not be nullptr.
 * @param[in,out] num_devices  Device count exchanged with
 *                             rsmi_compute_process_gpus_get(); must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when either pointer is nullptr, otherwise the
 *         RSMI result converted with amd::smi::rsmi_to_amdsmi_status().
 */
amdsmi_status_t
amdsmi_get_gpu_compute_process_gpus(uint32_t pid, uint32_t *dv_indices,
                                    uint32_t *num_devices) {
    AMDSMI_CHECK_INIT();

    const bool outputs_present = (dv_indices != nullptr) && (num_devices != nullptr);
    if (!outputs_present) {
        return AMDSMI_STATUS_INVAL;
    }

    const auto rsmi_result = rsmi_compute_process_gpus_get(pid, dv_indices, num_devices);
    return amd::smi::rsmi_to_amdsmi_status(rsmi_result);
}
/**
 * @brief Retrieves the ECC error counts of one GPU block.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[in]  block             GPU block of interest; converted to rsmi_gpu_block_t.
 * @param[out] ec                Receives the counts; forwarded to
 *                               rsmi_dev_ecc_count_get() as rsmi_error_count_t*
 *                               without a null check here.
 *
 * @return The status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_handle,
                                         amdsmi_gpu_block_t block, amdsmi_error_count_t *ec) {
    AMDSMI_CHECK_INIT();

    // nullptr api supported
    auto rsmi_block = static_cast<rsmi_gpu_block_t>(block);
    auto *rsmi_counts = reinterpret_cast<rsmi_error_count_t*>(ec);
    return rsmi_wrapper(rsmi_dev_ecc_count_get, processor_handle, 0,
                        rsmi_block, rsmi_counts);
}
/**
 * @brief Retrieves the ECC-enabled blocks value of a device.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] enabled_blocks    Destination passed straight to
 *                               rsmi_dev_ecc_enabled_get(); not null-checked here.
 *
 * @return The status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_ecc_enabled(amdsmi_processor_handle processor_handle,
                                           uint64_t *enabled_blocks) {
    AMDSMI_CHECK_INIT();

    // nullptr api supported
    const amdsmi_status_t query_status =
        rsmi_wrapper(rsmi_dev_ecc_enabled_get, processor_handle, 0, enabled_blocks);
    return query_status;
}
/**
 * @brief Retrieves the ECC (RAS error) state of one GPU block.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[in]  block             GPU block of interest; converted to rsmi_gpu_block_t.
 * @param[out] state             Receives the state; forwarded to
 *                               rsmi_dev_ecc_status_get() as rsmi_ras_err_state_t*
 *                               without a null check here.
 *
 * @return The status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_ecc_status(amdsmi_processor_handle processor_handle,
                                          amdsmi_gpu_block_t block,
                                          amdsmi_ras_err_state_t *state) {
    AMDSMI_CHECK_INIT();

    // nullptr api supported
    auto rsmi_block = static_cast<rsmi_gpu_block_t>(block);
    auto *rsmi_state = reinterpret_cast<rsmi_ras_err_state_t*>(state);
    return rsmi_wrapper(rsmi_dev_ecc_status_get, processor_handle, 0,
                        rsmi_block, rsmi_state);
}
/**
 * @brief Retrieves the header of a device's GPU metrics table.
 *
 * A non-null header_value is reset to a default-initialized struct before the
 * query; a null pointer is passed through to rsmi_dev_metrics_header_info_get()
 * unchanged.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] header_value      Receives the header; may be nullptr.
 *
 * @return The status produced by rsmi_wrapper().
 */
amdsmi_status_t
amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle,
                                   amd_metrics_table_header_t *header_value)
{
    AMDSMI_CHECK_INIT();

    // nullptr api supported
    const bool has_destination = (header_value != nullptr);
    if (has_destination) {
        // Start from a clean struct so stale caller data never survives
        *header_value = amd_metrics_table_header_t{};
    }

    auto *rsmi_header = reinterpret_cast<metrics_table_header_t*>(header_value);
    return rsmi_wrapper(rsmi_dev_metrics_header_info_get, processor_handle, 0,
                        rsmi_header);
}
/**
 * @brief Retrieves the partition metrics of a device.
 *
 * The output struct is reset to its default-initialized state before
 * rsmi_dev_gpu_partition_metrics_info_get() is called.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] pgpu_metrics      Receives the metrics; must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when pgpu_metrics is nullptr, otherwise the
 *         status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_partition_metrics_info(
        amdsmi_processor_handle processor_handle,
        amdsmi_gpu_metrics_t *pgpu_metrics) {
    AMDSMI_CHECK_INIT();

    // Reject a missing output buffer before touching anything
    if (pgpu_metrics == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Use a default initializer for the struct
    *pgpu_metrics = amdsmi_gpu_metrics_t{};

    auto *rsmi_metrics = reinterpret_cast<rsmi_gpu_metrics_t*>(pgpu_metrics);
    return rsmi_wrapper(rsmi_dev_gpu_partition_metrics_info_get, processor_handle, 0,
                        rsmi_metrics);
}
/**
 * @brief Retrieves the GPU metrics of a device.
 *
 * The output struct is reset to its default-initialized state before
 * rsmi_dev_gpu_metrics_info_get() is called.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] pgpu_metrics      Receives the metrics; must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when pgpu_metrics is nullptr, otherwise the
 *         status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_metrics_info(
        amdsmi_processor_handle processor_handle,
        amdsmi_gpu_metrics_t *pgpu_metrics) {
    AMDSMI_CHECK_INIT();

    // Reject a missing output buffer before touching anything
    if (pgpu_metrics == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Use a default initializer for the struct
    *pgpu_metrics = amdsmi_gpu_metrics_t{};

    auto *rsmi_metrics = reinterpret_cast<rsmi_gpu_metrics_t*>(pgpu_metrics);
    return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, 0,
                        rsmi_metrics);
}
/**
 * @brief Retrieves the PM metrics of a device as name/value pairs.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[out] pm_metrics        Receives the name/value array pointer; forwarded
 *                               to rsmi_dev_pm_metrics_info_get() as
 *                               rsmi_name_value_t** without a null check here.
 * @param[out] num_of_metrics    Forwarded to rsmi_dev_pm_metrics_info_get()
 *                               unchanged; not null-checked here.
 *
 * @return The status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_pm_metrics_info(
        amdsmi_processor_handle processor_handle,
        amdsmi_name_value_t** pm_metrics,
        uint32_t *num_of_metrics) {
    AMDSMI_CHECK_INIT();

    auto **rsmi_metrics = reinterpret_cast<rsmi_name_value_t**>(pm_metrics);
    return rsmi_wrapper(rsmi_dev_pm_metrics_info_get, processor_handle, 0,
                        rsmi_metrics, num_of_metrics);
}
/**
 * @brief Retrieves a register table of a device as name/value pairs.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[in]  reg_type          Register table to read; converted to rsmi_reg_type_t.
 * @param[out] reg_metrics       Receives the name/value array pointer; forwarded
 *                               to rsmi_dev_reg_table_info_get() as
 *                               rsmi_name_value_t** without a null check here.
 * @param[out] num_of_metrics    Forwarded to rsmi_dev_reg_table_info_get()
 *                               unchanged; not null-checked here.
 *
 * @return The status produced by rsmi_wrapper().
 */
amdsmi_status_t amdsmi_get_gpu_reg_table_info(
        amdsmi_processor_handle processor_handle,
        amdsmi_reg_type_t reg_type,
        amdsmi_name_value_t** reg_metrics,
        uint32_t *num_of_metrics) {
    AMDSMI_CHECK_INIT();

    auto rsmi_type = static_cast<rsmi_reg_type_t>(reg_type);
    auto **rsmi_metrics = reinterpret_cast<rsmi_name_value_t**>(reg_metrics);
    return rsmi_wrapper(rsmi_dev_reg_table_info_get, processor_handle, 0,
                        rsmi_type, rsmi_metrics, num_of_metrics);
}
/**
 * @brief Releases a buffer with free().
 *
 * @param[in] p  Pointer to release; a null pointer is accepted and ignored.
 */
void amdsmi_free_name_value_pairs(void *p) {
    // free() is defined to do nothing for a null pointer, so no guard is needed
    free(p);
}
/**
 * @brief Collects the power cap information of one power sensor of a device.
 *
 * info is zeroed first and then filled field by field: the current power cap,
 * the DPM value reported by smi_amdgpu_get_ranges() for the GFX clock, the
 * default power cap, and the maximum/minimum power cap. Only the outcome of the
 * current power cap query is reported to the caller; the other queries are
 * best effort and leave their fields at 0 when they fail.
 *
 * @param[in]  processor_handle  Device to query.
 * @param[in]  sensor_ind        Index of the power sensor.
 * @param[out] info              Receives the result; must not be nullptr.
 *
 * @return AMDSMI_STATUS_INVAL when info is nullptr, the failing status when the
 *         handle cannot be resolved to a GPU device, otherwise the status of
 *         the rsmi_dev_power_cap_get() query.
 */
amdsmi_status_t
amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle,
                          uint32_t sensor_ind,
                          amdsmi_power_cap_info_t *info) {
    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // Resolve the handle once; the device is needed for the DPM range query.
    amd::smi::AMDSmiGPUDevice* gpudevice = nullptr;
    const amdsmi_status_t device_status =
        get_gpu_device_from_handle(processor_handle, &gpudevice);
    if (device_status != AMDSMI_STATUS_SUCCESS) {
        return device_status;
    }

    // Ignore errors to get as much as possible info.
    memset(info, 0, sizeof(amdsmi_power_cap_info_t));

    auto smi_power_cap_status = rsmi_wrapper(rsmi_dev_power_cap_get, processor_handle, 0,
                                             sensor_ind, &(info->power_cap));

    // dpm stays 0 when the range query fails.
    int dpm = 0;
    (void)smi_amdgpu_get_ranges(gpudevice, AMDSMI_CLK_TYPE_GFX,
                                nullptr, nullptr, &dpm, nullptr);
    info->dpm_cap = dpm;

    // Get other information from rocm-smi
    (void)rsmi_wrapper(rsmi_dev_power_cap_default_get, processor_handle, 0,
                       sensor_ind, &(info->default_power_cap));
    (void)rsmi_wrapper(rsmi_dev_power_cap_range_get, processor_handle, 0,
                       sensor_ind, &(info->max_power_cap), &(info->min_power_cap));

    return smi_power_cap_status;
}
// Sets the power cap of the given sensor by delegating to
// rsmi_dev_power_cap_set; the wrapper's status is returned as is.
amdsmi_status_t
amdsmi_set_power_cap(amdsmi_processor_handle processor_handle,
                     uint32_t sensor_ind, uint64_t cap) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_power_cap_set, processor_handle, 0, sensor_ind, cap);
  return status;
}
// Queries the supported power-cap sensors by delegating to
// rsmi_dev_supported_power_cap_get.
//
// All three output pointers are required; AMDSMI_STATUS_INVAL is returned if
// any of them is null.
amdsmi_status_t
amdsmi_get_supported_power_cap(amdsmi_processor_handle processor_handle, uint32_t *sensor_count,
                               uint32_t *sensor_inds, amdsmi_power_cap_type_t *sensor_types) {
  AMDSMI_CHECK_INIT();

  const bool has_all_outputs =
      (sensor_count != nullptr) && (sensor_inds != nullptr) && (sensor_types != nullptr);
  if (!has_all_outputs) {
    return AMDSMI_STATUS_INVAL;
  }

  auto *rsmi_types = reinterpret_cast<rsmi_power_cap_type_t*>(sensor_types);
  return rsmi_wrapper(rsmi_dev_supported_power_cap_get, processor_handle, 0,
                      sensor_count, sensor_inds, rsmi_types);
}
// Reads the power profile presets by delegating to
// rsmi_dev_power_profile_presets_get.
//
// A null `status` is passed through on purpose (nullptr api supported).
// Bare metal and passthrough only: when the virtualization mode can be read
// and reports a guest, AMDSMI_STATUS_NOT_SUPPORTED is returned.
amdsmi_status_t
amdsmi_get_gpu_power_profile_presets(amdsmi_processor_handle processor_handle,
                                     uint32_t sensor_ind,
                                     amdsmi_power_profile_status_t *status) {
  AMDSMI_CHECK_INIT();

  amdsmi_virtualization_mode_t virt_mode;
  const bool mode_known =
      amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode) == AMDSMI_STATUS_SUCCESS;
  if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }

  auto *rsmi_status = reinterpret_cast<rsmi_power_profile_status_t*>(status);
  return rsmi_wrapper(rsmi_dev_power_profile_presets_get, processor_handle, 0,
                      sensor_ind, rsmi_status);
}
// Enables performance determinism mode with the given clock value by
// delegating to rsmi_perf_determinism_mode_set.
amdsmi_status_t amdsmi_set_gpu_perf_determinism_mode(
    amdsmi_processor_handle processor_handle, uint64_t clkvalue) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_perf_determinism_mode_set, processor_handle, 0, clkvalue);
  return status;
}
// Selects a power profile preset by delegating to rsmi_dev_power_profile_set.
//
// Bare metal and passthrough only: when the virtualization mode can be read
// and reports a guest, AMDSMI_STATUS_NOT_SUPPORTED is returned.
amdsmi_status_t
amdsmi_set_gpu_power_profile(amdsmi_processor_handle processor_handle,
                             uint32_t reserved, amdsmi_power_profile_preset_masks_t profile) {
  amdsmi_virtualization_mode_t virt_mode;
  const bool mode_known =
      amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode) == AMDSMI_STATUS_SUCCESS;
  if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }

  const auto rsmi_profile = static_cast<rsmi_power_profile_preset_masks_t>(profile);
  return rsmi_wrapper(rsmi_dev_power_profile_set, processor_handle, 0,
                      reserved, rsmi_profile);
}
// Reads the current performance level by delegating to
// rsmi_dev_perf_level_get. Returns AMDSMI_STATUS_INVAL when perf is null.
amdsmi_status_t amdsmi_get_gpu_perf_level(amdsmi_processor_handle processor_handle,
                                          amdsmi_dev_perf_level_t *perf) {
  AMDSMI_CHECK_INIT();

  if (perf == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  auto *rsmi_perf = reinterpret_cast<rsmi_dev_perf_level_t*>(perf);
  return rsmi_wrapper(rsmi_dev_perf_level_get, processor_handle, 0, rsmi_perf);
}
// Sets the performance level by delegating to rsmi_dev_perf_level_set_v1.
amdsmi_status_t
amdsmi_set_gpu_perf_level(amdsmi_processor_handle processor_handle,
                          amdsmi_dev_perf_level_t perf_lvl) {
  const auto rsmi_level = static_cast<rsmi_dev_perf_level_t>(perf_lvl);
  return rsmi_wrapper(rsmi_dev_perf_level_set_v1, processor_handle, 0, rsmi_level);
}
// Applies a PCIe bandwidth bitmask by delegating to
// rsmi_dev_pci_bandwidth_set.
//
// Bare metal and passthrough only: when the virtualization mode can be read
// and reports a guest, AMDSMI_STATUS_NOT_SUPPORTED is returned.
amdsmi_status_t amdsmi_set_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle,
                                             uint64_t bw_bitmask) {
  amdsmi_virtualization_mode_t virt_mode;
  const bool mode_known =
      amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode) == AMDSMI_STATUS_SUCCESS;
  if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_pci_bandwidth_set, processor_handle, 0, bw_bitmask);
  return status;
}
// Reads the PCIe bandwidth information by delegating to
// rsmi_dev_pci_bandwidth_get; the output pointer is passed through unchecked.
amdsmi_status_t amdsmi_get_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle,
                                             amdsmi_pcie_bandwidth_t *bandwidth) {
  auto *rsmi_bandwidth = reinterpret_cast<rsmi_pcie_bandwidth_t*>(bandwidth);
  return rsmi_wrapper(rsmi_dev_pci_bandwidth_get, processor_handle, 0, rsmi_bandwidth);
}
// TODO(bliu): other frequencies in amdsmi_clk_type_t
//
// Reports the frequencies of a clock domain.
//
// VCLK0/VCLK1/DCLK0/DCLK1 are read from the GPU metrics table: a single entry
// (frequency[0]) is reported with num_supported == 1, or num_supported == 0
// and frequency[0] == UINT64_MAX when the metric holds the uint16 "max"
// marker. Every other clock type is delegated to rsmi_dev_gpu_clk_freq_get.
//
// A null `f` is accepted (nullptr api supported). For the metrics-backed clock
// types a null `f` results in a null metrics pointer being passed to
// amdsmi_get_gpu_metrics_info, whose non-success status is returned before `f`
// is ever dereferenced.
amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle,
                                    amdsmi_clk_type_t clk_type, amdsmi_frequencies_t *f) {
  AMDSMI_CHECK_INIT();

  const bool from_gpu_metrics = (clk_type == AMDSMI_CLK_TYPE_VCLK0) ||
                                (clk_type == AMDSMI_CLK_TYPE_VCLK1) ||
                                (clk_type == AMDSMI_CLK_TYPE_DCLK0) ||
                                (clk_type == AMDSMI_CLK_TYPE_DCLK1);
  if (!from_gpu_metrics) {
    return rsmi_wrapper(rsmi_dev_gpu_clk_freq_get, processor_handle, 0,
                        static_cast<rsmi_clk_type_t>(clk_type),
                        reinterpret_cast<rsmi_frequencies_t*>(f));
  }

  // Default unit is MHz
  char unit = 'M';

  // When f == nullptr the metrics call is used only to check support; it is
  // handed a null pointer and will not return AMDSMI_STATUS_SUCCESS.
  amdsmi_gpu_metrics_t metric_info;
  amdsmi_gpu_metrics_t *metric_info_p = (f != nullptr) ? &metric_info : nullptr;

  const auto r_status = amdsmi_get_gpu_metrics_info(processor_handle, metric_info_p);
  if (r_status != AMDSMI_STATUS_SUCCESS) {
    return r_status;
  }

  // Pick the metric that backs the requested clock type.
  decltype(metric_info.current_vclk0) raw_clk = 0;
  switch (clk_type) {
    case AMDSMI_CLK_TYPE_VCLK0:
      raw_clk = metric_info_p->current_vclk0;
      break;
    case AMDSMI_CLK_TYPE_VCLK1:
      raw_clk = metric_info_p->current_vclk1;
      break;
    case AMDSMI_CLK_TYPE_DCLK0:
      raw_clk = metric_info_p->current_dclk0;
      break;
    case AMDSMI_CLK_TYPE_DCLK1:
      raw_clk = metric_info_p->current_dclk1;
      break;
    default:
      // Not reachable: other clock types returned above.
      break;
  }

  f->num_supported = 0;
  f->current = 0;
  f->frequency[0] = std::numeric_limits<uint64_t>::max();
  if (raw_clk != std::numeric_limits<uint16_t>::max()) {
    // Scale by the 'M' multiplier to match the MHz values ROCm SMI provides.
    f->frequency[0] = static_cast<uint64_t>(raw_clk)
                      * amd::smi::get_multiplier_from_char(unit);
    f->num_supported = 1;
  }

  return r_status;
}
// Applies a frequency bitmask to a clock domain by delegating to
// rsmi_dev_gpu_clk_freq_set.
//
// Returns AMDSMI_STATUS_NOT_SUPPORTED for VCLK0/VCLK1/DCLK0/DCLK1 (these clock
// types cannot be written through gpu_metrics) and when the virtualization
// mode can be read and reports a guest (bare metal and passthrough only).
amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle,
                                    amdsmi_clk_type_t clk_type, uint64_t freq_bitmask) {
  AMDSMI_CHECK_INIT();

  switch (clk_type) {
    case AMDSMI_CLK_TYPE_VCLK0:
    case AMDSMI_CLK_TYPE_VCLK1:
    case AMDSMI_CLK_TYPE_DCLK0:
    case AMDSMI_CLK_TYPE_DCLK1:
      return AMDSMI_STATUS_NOT_SUPPORTED;
    default:
      break;
  }

  amdsmi_virtualization_mode_t virt_mode;
  const bool mode_known =
      amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode) == AMDSMI_STATUS_SUCCESS;
  if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }

  const auto rsmi_type = static_cast<rsmi_clk_type_t>(clk_type);
  return rsmi_wrapper(rsmi_dev_gpu_clk_freq_set, processor_handle, 0,
                      rsmi_type, freq_bitmask);
}
// Sets the SoC pstate policy by delegating to rsmi_dev_soc_pstate_set.
amdsmi_status_t amdsmi_set_soc_pstate(amdsmi_processor_handle processor_handle,
                                      uint32_t policy) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_soc_pstate_set, processor_handle, 0, policy);
  return status;
}
// Reads the SoC pstate policy by delegating to rsmi_dev_soc_pstate_get; the
// output pointer is passed through unchecked.
amdsmi_status_t amdsmi_get_soc_pstate(amdsmi_processor_handle processor_handle,
                                      amdsmi_dpm_policy_t* policy) {
  AMDSMI_CHECK_INIT();

  auto *rsmi_policy = reinterpret_cast<rsmi_dpm_policy_t*>(policy);
  return rsmi_wrapper(rsmi_dev_soc_pstate_get, processor_handle, 0, rsmi_policy);
}
// Sets the XGMI PLPD policy by delegating to rsmi_dev_xgmi_plpd_set.
amdsmi_status_t amdsmi_set_xgmi_plpd(amdsmi_processor_handle processor_handle,
                                     uint32_t policy) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_xgmi_plpd_set, processor_handle, 0, policy);
  return status;
}
// Reads the XGMI PLPD policy by delegating to rsmi_dev_xgmi_plpd_get; the
// output pointer is passed through unchecked.
amdsmi_status_t amdsmi_get_xgmi_plpd(amdsmi_processor_handle processor_handle,
                                     amdsmi_dpm_policy_t* policy) {
  AMDSMI_CHECK_INIT();

  auto *rsmi_policy = reinterpret_cast<rsmi_dpm_policy_t*>(policy);
  return rsmi_wrapper(rsmi_dev_xgmi_plpd_get, processor_handle, 0, rsmi_policy);
}
// Reads the process isolation setting by delegating to
// rsmi_dev_process_isolation_get; the output pointer is passed through
// unchecked.
amdsmi_status_t amdsmi_get_gpu_process_isolation(amdsmi_processor_handle processor_handle,
                                                 uint32_t* pisolate) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_process_isolation_get, processor_handle, 0, pisolate);
  return status;
}
// Writes the process isolation setting by delegating to
// rsmi_dev_process_isolation_set.
amdsmi_status_t amdsmi_set_gpu_process_isolation(amdsmi_processor_handle processor_handle,
                                                 uint32_t pisolate) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_process_isolation_set, processor_handle, 0, pisolate);
  return status;
}
// Cleans GPU local data by delegating to rsmi_dev_gpu_run_cleaner_shader.
amdsmi_status_t amdsmi_clean_gpu_local_data(amdsmi_processor_handle processor_handle) {
  AMDSMI_CHECK_INIT();

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_gpu_run_cleaner_shader, processor_handle, 0);
  return status;
}
// Reads the reserved (retired) memory page records by delegating to
// rsmi_dev_memory_reserved_pages_get; both output pointers are passed through
// unchecked.
amdsmi_status_t
amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle,
                                     uint32_t *num_pages,
                                     amdsmi_retired_page_record_t *records) {
  auto *rsmi_records = reinterpret_cast<rsmi_retired_page_record_t*>(records);
  return rsmi_wrapper(rsmi_dev_memory_reserved_pages_get, processor_handle, 0,
                      num_pages, rsmi_records);
}
// Reads the total amount of memory of the given type by delegating to
// rsmi_dev_memory_total_get.
amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle,
                                            amdsmi_memory_type_t mem_type, uint64_t *total) {
  const auto rsmi_type = static_cast<rsmi_memory_type_t>(mem_type);
  return rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, 0, rsmi_type, total);
}
// Reads the amount of memory of the given type currently in use by delegating
// to rsmi_dev_memory_usage_get.
amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle,
                                            amdsmi_memory_type_t mem_type, uint64_t *used) {
  const auto rsmi_type = static_cast<rsmi_memory_type_t>(mem_type);
  return rsmi_wrapper(rsmi_dev_memory_usage_get, processor_handle, 0, rsmi_type, used);
}
// Reads the GPU overdrive level by delegating to
// rsmi_dev_overdrive_level_get.
//
// Bare metal and passthrough only: when the virtualization mode can be read
// and reports a guest, AMDSMI_STATUS_NOT_SUPPORTED is returned.
amdsmi_status_t amdsmi_get_gpu_overdrive_level(
    amdsmi_processor_handle processor_handle,
    uint32_t *od) {
  amdsmi_virtualization_mode_t virt_mode;
  const bool mode_known =
      amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode) == AMDSMI_STATUS_SUCCESS;
  if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_overdrive_level_get, processor_handle, 0, od);
  return status;
}
// Reads the memory overdrive level by delegating to
// rsmi_dev_mem_overdrive_level_get.
amdsmi_status_t amdsmi_get_gpu_mem_overdrive_level(
    amdsmi_processor_handle processor_handle,
    uint32_t *od) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_mem_overdrive_level_get, processor_handle, 0, od);
  return status;
}
// Sets the GPU overdrive level by delegating to
// rsmi_dev_overdrive_level_set_v1.
//
// Bare metal and passthrough only: when the virtualization mode can be read
// and reports a guest, AMDSMI_STATUS_NOT_SUPPORTED is returned.
amdsmi_status_t amdsmi_set_gpu_overdrive_level(
    amdsmi_processor_handle processor_handle, uint32_t od) {
  amdsmi_virtualization_mode_t virt_mode;
  const bool mode_known =
      amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode) == AMDSMI_STATUS_SUCCESS;
  if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_overdrive_level_set_v1, processor_handle, 0, od);
  return status;
}
// Reads the PCIe replay counter by delegating to
// rsmi_dev_pci_replay_counter_get.
amdsmi_status_t amdsmi_get_gpu_pci_replay_counter(
    amdsmi_processor_handle processor_handle, uint64_t *counter) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_pci_replay_counter_get, processor_handle, 0, counter);
  return status;
}
// Reads PCIe throughput figures (sent, received, max packet size) by
// delegating to rsmi_dev_pci_throughput_get.
amdsmi_status_t amdsmi_get_gpu_pci_throughput(
    amdsmi_processor_handle processor_handle,
    uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_pci_throughput_get, processor_handle, 0,
                   sent, received, max_pkt_sz);
  return status;
}
// Reads the overdrive voltage/frequency data by delegating to
// rsmi_dev_od_volt_info_get.
amdsmi_status_t amdsmi_get_gpu_od_volt_info(amdsmi_processor_handle processor_handle,
                                            amdsmi_od_volt_freq_data_t *odv) {
  auto *rsmi_odv = reinterpret_cast<rsmi_od_volt_freq_data_t*>(odv);
  return rsmi_wrapper(rsmi_dev_od_volt_info_get, processor_handle, 0, rsmi_odv);
}
// Reads the overdrive voltage curve regions by delegating to
// rsmi_dev_od_volt_curve_regions_get.
amdsmi_status_t amdsmi_get_gpu_od_volt_curve_regions(
    amdsmi_processor_handle processor_handle,
    uint32_t *num_regions, amdsmi_freq_volt_region_t *buffer) {
  auto *rsmi_regions = reinterpret_cast<rsmi_freq_volt_region_t*>(buffer);
  return rsmi_wrapper(rsmi_dev_od_volt_curve_regions_get, processor_handle, 0,
                      num_regions, rsmi_regions);
}
// Reads one voltage metric of one voltage sensor type by delegating to
// rsmi_dev_volt_metric_get.
amdsmi_status_t amdsmi_get_gpu_volt_metric(amdsmi_processor_handle processor_handle,
                                           amdsmi_voltage_type_t sensor_type,
                                           amdsmi_voltage_metric_t metric, int64_t *voltage) {
  const auto rsmi_sensor = static_cast<rsmi_voltage_type_t>(sensor_type);
  const auto rsmi_metric = static_cast<rsmi_voltage_metric_t>(metric);
  return rsmi_wrapper(rsmi_dev_volt_metric_get, processor_handle, 0,
                      rsmi_sensor, rsmi_metric, voltage);
}
// Sets an overdrive clock value for the given level and clock type by
// delegating to rsmi_dev_od_clk_info_set.
amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_handle,
                                           amdsmi_freq_ind_t level,
                                           uint64_t clkvalue,
                                           amdsmi_clk_type_t clkType) {
  const auto rsmi_level = static_cast<rsmi_freq_ind_t>(level);
  const auto rsmi_clk = static_cast<rsmi_clk_type_t>(clkType);
  return rsmi_wrapper(rsmi_dev_od_clk_info_set, processor_handle, 0,
                      rsmi_level, clkvalue, rsmi_clk);
}
// Sets an overdrive voltage point (clock and voltage values) by delegating to
// rsmi_dev_od_volt_info_set.
amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_handle,
                                            uint32_t vpoint, uint64_t clkvalue,
                                            uint64_t voltvalue) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_od_volt_info_set, processor_handle, 0,
                   vpoint, clkvalue, voltvalue);
  return status;
}
// Sets the min/max clock range of a clock type by delegating to
// rsmi_dev_clk_range_set.
//
// Bare metal and passthrough only: when the virtualization mode can be read
// and reports a guest, AMDSMI_STATUS_NOT_SUPPORTED is returned.
amdsmi_status_t amdsmi_set_gpu_clk_range(amdsmi_processor_handle processor_handle,
                                         uint64_t minclkvalue,
                                         uint64_t maxclkvalue,
                                         amdsmi_clk_type_t clkType) {
  amdsmi_virtualization_mode_t virt_mode;
  const bool mode_known =
      amdsmi_get_gpu_virtualization_mode(processor_handle, &virt_mode) == AMDSMI_STATUS_SUCCESS;
  if (mode_known && virt_mode == AMDSMI_VIRTUALIZATION_MODE_GUEST) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }

  const auto rsmi_clk = static_cast<rsmi_clk_type_t>(clkType);
  return rsmi_wrapper(rsmi_dev_clk_range_set, processor_handle, 0,
                      minclkvalue, maxclkvalue, rsmi_clk);
}
// Sets a clock limit by delegating to rsmi_dev_clk_extremum_set. The limit
// type is cast to rsmi_freq_ind_t and the clock type to rsmi_clk_type_t.
amdsmi_status_t amdsmi_set_gpu_clk_limit(amdsmi_processor_handle processor_handle,
                                         amdsmi_clk_type_t clk_type,
                                         amdsmi_clk_limit_type_t limit_type,
                                         uint64_t clk_value) {
  const auto rsmi_limit = static_cast<rsmi_freq_ind_t>(limit_type);
  const auto rsmi_clk = static_cast<rsmi_clk_type_t>(clk_type);
  return rsmi_wrapper(rsmi_dev_clk_extremum_set, processor_handle, 0,
                      rsmi_limit, clk_value, rsmi_clk);
}
// Resets the GPU by delegating to rsmi_dev_gpu_reset, logs the resulting
// status at INFO level and returns it.
amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle) {
  const amdsmi_status_t reset_status =
      rsmi_wrapper(rsmi_dev_gpu_reset, processor_handle, 0);

  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__
     << " | Returning: " << smi_amdgpu_get_status_string(reset_status, false);
  LOG_INFO(ss);

  return reset_status;
}
// Reloads the amdgpu driver via rsmi_dev_amdgpu_driver_reload and returns the
// result converted to an amdsmi status. Start and result messages are logged
// only when the logger is enabled.
amdsmi_status_t amdsmi_gpu_driver_reload(void) {
  std::ostringstream ss;
  AMDSMI_CHECK_INIT();

  // Query the logger state once so message formatting is skipped when the
  // logger is disabled (attempting to speed up processing time).
  const bool logging_on = ROCmLogging::Logger::getInstance()->isLoggerEnabled();

  if (logging_on) {
    ss << __PRETTY_FUNCTION__ << " | ======= start =======";
    LOG_INFO(ss);
  }

  const rsmi_status_t reload_result = rsmi_dev_amdgpu_driver_reload();
  const amdsmi_status_t amdsmi_status = amd::smi::rsmi_to_amdsmi_status(reload_result);

  if (logging_on) {
    ss << __PRETTY_FUNCTION__
       << " | Returning: " << smi_amdgpu_get_status_string(amdsmi_status, false);
    LOG_INFO(ss);
  }

  return amdsmi_status;
}
// Reads the GPU busy percentage by delegating to rsmi_dev_busy_percent_get.
amdsmi_status_t amdsmi_get_gpu_busy_percent(amdsmi_processor_handle processor_handle,
                                            uint32_t *gpu_busy_percent) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_busy_percent_get, processor_handle, 0, gpu_busy_percent);
  return status;
}
// Reads `count` utilization counters and a timestamp by delegating to
// rsmi_utilization_count_get.
amdsmi_status_t amdsmi_get_utilization_count(amdsmi_processor_handle processor_handle,
                                             amdsmi_utilization_counter_t utilization_counters[],
                                             uint32_t count,
                                             uint64_t *timestamp) {
  auto *rsmi_counters =
      reinterpret_cast<rsmi_utilization_counter_t*>(utilization_counters);
  return rsmi_wrapper(rsmi_utilization_count_get, processor_handle, 0,
                      rsmi_counters, count, timestamp);
}
// Reads the energy accumulator, its counter resolution and a timestamp by
// delegating to rsmi_dev_energy_count_get.
amdsmi_status_t amdsmi_get_energy_count(amdsmi_processor_handle processor_handle,
                                        uint64_t *energy_accumulator,
                                        float *counter_resolution,
                                        uint64_t *timestamp) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_energy_count_get, processor_handle, 0,
                   energy_accumulator, counter_resolution, timestamp);
  return status;
}
// Reads the PCI id (BDF id) of the device by delegating to
// rsmi_dev_pci_id_get.
amdsmi_status_t amdsmi_get_gpu_bdf_id(
    amdsmi_processor_handle processor_handle, uint64_t *bdfid) {
  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_dev_pci_id_get, processor_handle, 0, bdfid);
  return status;
}
// Reads the NUMA node affinity of the device by delegating to
// rsmi_topo_numa_affinity_get. Returns AMDSMI_STATUS_INVAL when numa_node is
// null.
amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity(
    amdsmi_processor_handle processor_handle, int32_t *numa_node) {
  if (numa_node == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  const amdsmi_status_t status =
      rsmi_wrapper(rsmi_topo_numa_affinity_get, processor_handle, 0, numa_node);
  return status;
}
// Fills `version` with the library's compile-time version constants
// (major, minor, release and the build string).
//
// Returns AMDSMI_STATUS_INVAL when version is null, AMDSMI_STATUS_SUCCESS
// otherwise.
amdsmi_status_t amdsmi_get_lib_version(amdsmi_version_t *version) {
  if (!version) {
    return AMDSMI_STATUS_INVAL;
  }

  version->build = AMDSMI_LIB_VERSION_STRING;
  version->release = AMDSMI_LIB_VERSION_RELEASE;
  version->minor = AMDSMI_LIB_VERSION_MINOR;
  version->major = AMDSMI_LIB_VERSION_MAJOR;

  return AMDSMI_STATUS_SUCCESS;
}
// Retrieves VBIOS information (name, build date, part number, version and
// boot firmware) for a GPU.
//
// Flow:
//   1. Resolve the handle to a GPU device and open /dev/dri/<gpu path>.
//   2. Load libdrm and resolve drmCommandWrite, then issue an
//      AMDGPU_INFO_VBIOS query.
//   3. If the query succeeds, copy name/date/part number/version from the
//      driver's reply. Otherwise fall back to rsmi_dev_vbios_version_get and
//      store its result as the part number.
//   4. If rsmi_dev_vbios_build_number_get succeeds, the previously stored
//      version is moved to boot_firmware and the build number becomes the
//      version.
//
// Every string copied into `info` is explicitly NUL-terminated, so a source
// string of AMDSMI_MAX_STRING_LENGTH or more characters is truncated instead
// of leaving the destination unterminated.
//
// Returns AMDSMI_STATUS_INVAL for a null `info`, AMDSMI_STATUS_NOT_SUPPORTED
// when the device has no GPU path, AMDSMI_STATUS_FILE_ERROR when the device
// node cannot be opened, the loader status when libdrm or the symbol cannot
// be loaded, and otherwise the status of the last primary query (the symbol
// load status on the DRM path, the sysfs query status on the fallback path).
// A failing build-number query does not affect the returned status.
amdsmi_status_t
amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios_info_t *info) {
  AMDSMI_CHECK_INIT();

  if (info == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // Bounded copy that always leaves dst NUL-terminated. The destination
  // fields are written with AMDSMI_MAX_STRING_LENGTH as their size limit.
  const auto copy_terminated = [](char *dst, const char *src) {
    strncpy(dst, src, AMDSMI_MAX_STRING_LENGTH - 1);
    dst[AMDSMI_MAX_STRING_LENGTH - 1] = '\0';
  };

  struct drm_amdgpu_info_vbios vbios = {};
  amdsmi_status_t status;
  std::ostringstream ss;

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  status = get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (status != AMDSMI_STATUS_SUCCESS) {
    return status;
  }

  SMIGPUDEVICE_MUTEX(gpu_device->get_mutex());

  // Check the GPU path before building the device node path from it.
  std::string render_name = gpu_device->get_gpu_path();
  if (render_name.empty()) {
    return AMDSMI_STATUS_NOT_SUPPORTED;
  }
  std::string path = "/dev/dri/" + render_name;

  ScopedFD drm_fd(path.c_str(), O_RDWR | O_CLOEXEC);
  if (!drm_fd.valid()) {
    ss << __PRETTY_FUNCTION__
       << " | Failed to open " << path << ": " << strerror(errno)
       << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
    LOG_ERROR(ss);
    return AMDSMI_STATUS_FILE_ERROR;
  }

  amd::smi::AMDSmiLibraryLoader libdrm;
  status = libdrm.load(LIBDRM_AMDGPU_SONAME);
  if (status != AMDSMI_STATUS_SUCCESS) {
    libdrm.unload();
    ss << __PRETTY_FUNCTION__
       << " | Failed to load " LIBDRM_AMDGPU_SONAME ": " << strerror(errno)
       << "; Returning: " << smi_amdgpu_get_status_string(status, false);
    LOG_ERROR(ss);
    return status;
  }

  ss << __PRETTY_FUNCTION__
     << " | about to load drmCommandWrite symbol";
  LOG_INFO(ss);

  // extern int drmCommandWrite(int fd, unsigned long drmCommandIndex,
  //                            void *data, unsigned long size);
  typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex,
                                   void *data, unsigned long size);
  drmCommandWrite_t drmCommandWrite = nullptr;

  // load symbol from libdrm
  status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
                              "drmCommandWrite");
  if (status != AMDSMI_STATUS_SUCCESS) {
    libdrm.unload();
    ss << __PRETTY_FUNCTION__
       << " | Failed to load drmCommandWrite symbol"
       << " | Returning: " << smi_amdgpu_get_status_string(status, false);
    LOG_ERROR(ss);
    return status;
  }

  ss << __PRETTY_FUNCTION__
     << " | drmCommandWrite symbol loaded successfully";
  LOG_INFO(ss);

  // Build the AMDGPU_INFO_VBIOS request; the driver writes into `vbios`.
  memset(&vbios, 0, sizeof(struct drm_amdgpu_info_vbios));
  struct drm_amdgpu_info request = {};
  memset(&request, 0, sizeof(request));
  request.return_pointer = reinterpret_cast<uint64_t>(&vbios);
  request.return_size = sizeof(drm_amdgpu_info_vbios);
  request.query = AMDGPU_INFO_VBIOS;
  request.vbios_info.type = AMDGPU_INFO_VBIOS_INFO;

  auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
                                   sizeof(struct drm_amdgpu_info));
  if (drm_write == 0) {
    copy_terminated(info->name, reinterpret_cast<char *>(vbios.name));
    copy_terminated(info->build_date, reinterpret_cast<char *>(vbios.date));
    copy_terminated(info->part_number, reinterpret_cast<char *>(vbios.vbios_pn));
    // Navi devices still interpret vbios version from drm vbios_ver_str
    copy_terminated(info->version, reinterpret_cast<char *>(vbios.vbios_ver_str));
  } else {
    // get sysfs vbios_version string which is known as the part number
    char vbios_version[AMDSMI_MAX_STRING_LENGTH];
    status = rsmi_wrapper(rsmi_dev_vbios_version_get, processor_handle, 0,
                          vbios_version, AMDSMI_MAX_STRING_LENGTH);
    // fail if cannot get vbios version from sysfs
    if (status == AMDSMI_STATUS_SUCCESS) {
      copy_terminated(info->part_number, vbios_version);
    }
  }
  libdrm.unload();

  // get vbios build string from rocm_smi which translates to ifwi version
  char vbios_build_number[AMDSMI_MAX_STRING_LENGTH];
  amdsmi_status_t build_status;
  build_status = rsmi_wrapper(rsmi_dev_vbios_build_number_get, processor_handle, 0,
                              vbios_build_number, AMDSMI_MAX_STRING_LENGTH);
  // Continue if sysfs doesn't exist
  if (build_status == AMDSMI_STATUS_SUCCESS) {
    // This device has an ifwi version so swap the version and boot_firmware
    copy_terminated(info->boot_firmware, info->version);
    copy_terminated(info->version, vbios_build_number);
  }

  ss << __PRETTY_FUNCTION__
     << " | drmCommandWrite returned: " << strerror(errno) << "\n"
     << " | vbios name: " << info->name << "\n"
     << " | vbios build date: " << info->build_date << "\n"
     << " | vbios part number: " << info->part_number << "\n"
     << " | vbios version: " << info->version << "\n"
     << " | vbios boot_firmware: " << info->boot_firmware<< "\n"
     << " | Returning: " << smi_amdgpu_get_status_string(status, false);
  LOG_INFO(ss);

  return status;
}
// Reports engine activity (gfx, mm, umc) taken from the average activity
// fields of the GPU metrics table.
//
// Returns AMDSMI_STATUS_INVAL when info is null, the lookup status when the
// handle does not resolve to a GPU device, the metrics query status when that
// query fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t
amdsmi_get_gpu_activity(amdsmi_processor_handle processor_handle, amdsmi_engine_usage_t *info) {
  AMDSMI_CHECK_INIT();

  if (!info) {
    return AMDSMI_STATUS_INVAL;
  }

  // The device object itself is not used; the lookup validates the handle.
  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  amdsmi_gpu_metrics_t metrics = {};
  const amdsmi_status_t metrics_status =
      amdsmi_get_gpu_metrics_info(processor_handle, &metrics);
  if (metrics_status != AMDSMI_STATUS_SUCCESS) {
    return metrics_status;
  }

  info->gfx_activity = metrics.average_gfx_activity;
  info->mm_activity = metrics.average_mm_activity;
  info->umc_activity = metrics.average_umc_activity;
  return AMDSMI_STATUS_SUCCESS;
}
// Reports whether power management is enabled for a GPU.
//
// *enabled is set to false up front, so it reads false whenever the handle
// lookup fails. Returns AMDSMI_STATUS_INVAL when enabled is null, the lookup
// status on lookup failure, otherwise the status of
// smi_amdgpu_is_gpu_power_management_enabled.
amdsmi_status_t amdsmi_is_gpu_power_management_enabled(amdsmi_processor_handle processor_handle, bool *enabled) {
  if (!enabled) {
    return AMDSMI_STATUS_INVAL;
  }
  *enabled = false;

  amd::smi::AMDSmiGPUDevice *gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  return smi_amdgpu_is_gpu_power_management_enabled(gpu_device, enabled);
}
// Reports the current clock, min/max clock and deep-sleep clock for one clock
// type.
//
// The min/max/deep-sleep values come from smi_amdgpu_get_ranges; the current
// clock comes from the GPU metrics table. AMDSMI_CLK_TYPE_DF is not backed by
// gpu metrics, so its current clock is reported as UINT32_MAX.
//
// Returns AMDSMI_STATUS_INVAL when info is null, when clk_type is greater than
// AMDSMI_CLK_TYPE__MAX, or when clk_type has no case below (in that last case
// max_clk, min_clk and clk_deep_sleep have already been written). Failures of
// the handle lookup, the metrics query or the range query are returned as is.
amdsmi_status_t
amdsmi_get_clock_info(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, amdsmi_clk_info_t *info) {
  AMDSMI_CHECK_INIT();

  if (info == nullptr || clk_type > AMDSMI_CLK_TYPE__MAX) {
    return AMDSMI_STATUS_INVAL;
  }

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  amdsmi_gpu_metrics_t metrics = {};
  const amdsmi_status_t metrics_status =
      amdsmi_get_gpu_metrics_info(processor_handle, &metrics);
  if (metrics_status != AMDSMI_STATUS_SUCCESS) {
    return metrics_status;
  }

  // Filled in by smi_amdgpu_get_ranges.
  int max_freq;
  int min_freq;
  int sleep_state_freq;
  const amdsmi_status_t ranges_status =
      smi_amdgpu_get_ranges(gpu_device, clk_type,
                            &max_freq, &min_freq, NULL, &sleep_state_freq);
  if (ranges_status != AMDSMI_STATUS_SUCCESS) {
    return ranges_status;
  }

  info->max_clk = max_freq;
  info->min_clk = min_freq;
  info->clk_deep_sleep = static_cast<uint8_t>(sleep_state_freq);

  switch (clk_type) {
    case AMDSMI_CLK_TYPE_GFX:
      info->clk = metrics.current_gfxclk;
      return AMDSMI_STATUS_SUCCESS;
    case AMDSMI_CLK_TYPE_MEM:
      info->clk = metrics.current_uclk;
      return AMDSMI_STATUS_SUCCESS;
    case AMDSMI_CLK_TYPE_VCLK0:
      info->clk = metrics.current_vclk0;
      return AMDSMI_STATUS_SUCCESS;
    case AMDSMI_CLK_TYPE_VCLK1:
      info->clk = metrics.current_vclk1;
      return AMDSMI_STATUS_SUCCESS;
    case AMDSMI_CLK_TYPE_DCLK0:
      info->clk = metrics.current_dclk0;
      return AMDSMI_STATUS_SUCCESS;
    case AMDSMI_CLK_TYPE_DCLK1:
      info->clk = metrics.current_dclk1;
      return AMDSMI_STATUS_SUCCESS;
    case AMDSMI_CLK_TYPE_SOC:
      info->clk = metrics.current_socclk;
      return AMDSMI_STATUS_SUCCESS;
    // fclk/df is not supported by gpu metrics, so provide a default value
    // which cannot be construed to be valid.
    case AMDSMI_CLK_TYPE_DF:
      info->clk = UINT32_MAX;
      return AMDSMI_STATUS_SUCCESS;
    default:
      return AMDSMI_STATUS_INVAL;
  }
}
// Reports whether RAS features are enabled for one GPU block.
//
// The enabled-blocks mask is read via smi_amdgpu_get_enabled_blocks and tested
// against `block` with a bitwise AND.
//
// Returns AMDSMI_STATUS_INVAL when state is null or block is greater than
// AMDSMI_GPU_BLOCK_LAST; lookup or mask-query failures are returned as is.
amdsmi_status_t
amdsmi_get_gpu_ras_block_features_enabled(amdsmi_processor_handle processor_handle, amdsmi_gpu_block_t block, amdsmi_ras_err_state_t *state) {
  AMDSMI_CHECK_INIT();

  if (state == nullptr || block > AMDSMI_GPU_BLOCK_LAST) {
    return AMDSMI_STATUS_INVAL;
  }

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  uint64_t features_mask = 0;
  const amdsmi_status_t mask_status =
      smi_amdgpu_get_enabled_blocks(gpu_device, &features_mask);
  if (mask_status != AMDSMI_STATUS_SUCCESS) {
    return mask_status;
  }

  if (features_mask & block) {
    *state = AMDSMI_RAS_ERR_STATE_ENABLED;
  } else {
    *state = AMDSMI_RAS_ERR_STATE_DISABLED;
  }
  return AMDSMI_STATUS_SUCCESS;
}
// Reads bad page records via smi_amdgpu_get_bad_page_info.
//
// num_pages is required (AMDSMI_STATUS_INVAL when null); `info` is passed
// through to the helper unchecked. Lookup failures and the helper's status
// are returned as is.
amdsmi_status_t
amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info) {
  AMDSMI_CHECK_INIT();

  if (!num_pages) {
    return AMDSMI_STATUS_INVAL;
  }

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  // The helper's status is the final result, success or failure.
  return smi_amdgpu_get_bad_page_info(gpu_device, num_pages, info);
}
// Reads the bad page threshold via smi_amdgpu_get_bad_page_threshold.
//
// Returns AMDSMI_STATUS_INVAL when threshold is null; lookup failures and the
// helper's status are returned as is.
amdsmi_status_t
amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold) {
  AMDSMI_CHECK_INIT();

  if (!threshold) {
    return AMDSMI_STATUS_INVAL;
  }

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  // The helper's status is the final result, success or failure.
  return smi_amdgpu_get_bad_page_threshold(gpu_device, threshold);
}
// Validates the RAS EEPROM of a GPU via smi_amdgpu_validate_ras_eeprom.
// A failed handle lookup is returned as is.
amdsmi_status_t
amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle) {
  AMDSMI_CHECK_INIT();

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  const amdsmi_status_t validate_status = smi_amdgpu_validate_ras_eeprom(gpu_device);
  return validate_status;
}
// Reads RAS feature information via rsmi_ras_feature_info_get and copies the
// ECC correction schema flag and the RAS EEPROM version into `ras_feature`.
//
// Returns AMDSMI_STATUS_INVAL when ras_feature is null; lookup or query
// failures are returned as is and leave `ras_feature` untouched.
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
    amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
  AMDSMI_CHECK_INIT();

  if (!ras_feature) {
    return AMDSMI_STATUS_INVAL;
  }

  // The device object itself is not used; the lookup validates the handle.
  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  rsmi_ras_feature_info_t rsmi_ras_feature;
  const amdsmi_status_t query_status =
      rsmi_wrapper(rsmi_ras_feature_info_get, processor_handle, 0, &rsmi_ras_feature);
  if (query_status != AMDSMI_STATUS_SUCCESS) {
    return query_status;
  }

  ras_feature->ras_eeprom_version = rsmi_ras_feature.ras_eeprom_version;
  ras_feature->ecc_correction_schema_flag = rsmi_ras_feature.ecc_correction_schema_flag;
  return AMDSMI_STATUS_SUCCESS;
}
// Sums the ECC error counts of every RAS-enabled GPU block.
//
// The block ids are walked by doubling from AMDSMI_GPU_BLOCK_FIRST up to
// AMDSMI_GPU_BLOCK_LAST. For each block that reports
// AMDSMI_RAS_ERR_STATE_ENABLED, its correctable, uncorrectable and deferred
// counts are added to `ec`. Blocks whose state or count query fails are
// skipped, so the result is a best-effort total.
//
// `ec` is reset to zero before accumulating, so the result does not depend on
// what the caller's struct contained on entry.
//
// Returns AMDSMI_STATUS_INVAL when ec is null, the lookup status when the
// handle does not resolve to a GPU device, AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t
amdsmi_get_gpu_total_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_error_count_t *ec) {
  AMDSMI_CHECK_INIT();

  if (ec == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  amdsmi_status_t status = get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (status != AMDSMI_STATUS_SUCCESS) {
    return status;
  }

  // Start the totals from zero; the loop below only ever adds to them.
  *ec = amdsmi_error_count_t{};

  amdsmi_ras_err_state_t state = {};

  // Iterate through the ecc blocks
  for (auto block = AMDSMI_GPU_BLOCK_FIRST; block <= AMDSMI_GPU_BLOCK_LAST;
       block = static_cast<amdsmi_gpu_block_t>(block * 2)) {
    // Check if the current ecc block is enabled
    status = amdsmi_get_gpu_ras_block_features_enabled(processor_handle, block, &state);
    if (status != AMDSMI_STATUS_SUCCESS || state != AMDSMI_RAS_ERR_STATE_ENABLED) {
      continue;
    }

    // Fresh per-block counts for every iteration
    amdsmi_error_count_t block_ec = {};
    status = amdsmi_get_gpu_ecc_count(processor_handle, block, &block_ec);
    if (status != AMDSMI_STATUS_SUCCESS) {
      continue;
    }

    // Increase the total ecc counts by the ecc block counts
    ec->correctable_count += block_ec.correctable_count;
    ec->uncorrectable_count += block_ec.uncorrectable_count;
    ec->deferred_count += block_ec.deferred_count;
  }

  return AMDSMI_STATUS_SUCCESS;
}
// Reads CPER entries for a GPU from its debugfs CPER ring file.
//
// The path is /sys/kernel/debug/dri/<card id>/amdgpu_ring_cper; the actual
// work is done by amdsmi_get_gpu_cper_entries_by_path, which also receives the
// product serial number obtained for this handle.
//
// Returns AMDSMI_STATUS_NO_PERM when amd::smi::is_sudo_user() is false, the
// lookup status when the handle does not resolve to a GPU device, otherwise
// the status of amdsmi_get_gpu_cper_entries_by_path.
amdsmi_status_t
amdsmi_get_gpu_cper_entries(
    amdsmi_processor_handle processor_handle,
    uint32_t severity_mask,
    char *cper_data,
    uint64_t *buf_size,
    amdsmi_cper_hdr_t **cper_hdrs,
    uint64_t *entry_count,
    uint64_t *cursor) {
  AMDSMI_CHECK_INIT();

  const bool privileged = amd::smi::is_sudo_user();
  if (!privileged) {
    return AMDSMI_STATUS_NO_PERM;
  }

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  std::string path("/sys/kernel/debug/dri/");
  path += std::to_string(gpu_device->get_card_id());
  path += "/amdgpu_ring_cper";

  return amdsmi_get_gpu_cper_entries_by_path(
      path.c_str(),
      severity_mask,
      cper_data,
      buf_size,
      cper_hdrs,
      entry_count,
      cursor,
      get_product_serial_number(processor_handle));
}
// Decodes the AFIDs contained in a single CPER record.
//
// cper_buffer: buffer holding the CPER record (header first).
// buf_size:    number of valid bytes in cper_buffer.
// afids:       output array for the decoded AFIDs.
// num_afids:   in: capacity of `afids` (must be > 0); out: number of AFIDs
//              found by cper_decode. This can exceed the input capacity, in
//              which case only the first `capacity` entries were stored.
//
// Returns:
//   AMDSMI_STATUS_INVAL            any pointer is null, or buf_size or
//                                  *num_afids is 0.
//   AMDSMI_STATUS_UNEXPECTED_SIZE  the buffer is smaller than a CPER header or
//                                  smaller than the header's record_length.
//   AMDSMI_STATUS_UNEXPECTED_DATA  the header signature is not "CPER".
//   AMDSMI_STATUS_SUCCESS          otherwise.
amdsmi_status_t amdsmi_get_afids_from_cper(
    char* cper_buffer, uint32_t buf_size, uint64_t* afids, uint32_t* num_afids) {
  AMDSMI_CHECK_INIT();

  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] begin\n";
  LOG_DEBUG(ss);

  if (!cper_buffer) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper_buffer should be a valid memory address\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_INVAL;
  }
  if (!buf_size) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] buf_size should be greater than 0\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_INVAL;
  }
  if (!afids) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] afids should be a valid memory address\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_INVAL;
  }
  if (!num_afids) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be a valid memory address\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_INVAL;
  }
  if (!*num_afids) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] num_afids should be greater than 0\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_INVAL;
  }

  // The header fields below may only be read if the buffer actually holds a
  // complete header; otherwise the reads would run past the caller's buffer.
  if (buf_size < sizeof(amdsmi_cper_hdr_t)) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer size " << std::dec << buf_size << " is smaller than the cper header size " << std::dec << sizeof(amdsmi_cper_hdr_t) << "\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_UNEXPECTED_SIZE;
  }

  const amdsmi_cper_hdr_t *cper = reinterpret_cast<const amdsmi_cper_hdr_t *>(cper_buffer);
  if (cper->record_length > buf_size) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer size " << std::dec << buf_size << " is smaller than cper record length " << std::dec << cper->record_length << "\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_UNEXPECTED_SIZE;
  }
  if (strncmp(cper->signature, "CPER", 4) != 0) {
    ss << __PRETTY_FUNCTION__ << "\n:" << __LINE__ << "[AFIDS] cper buffer does not have the correct signature\n";
    LOG_ERROR(ss);
    return AMDSMI_STATUS_UNEXPECTED_DATA;
  }

  // Store as many AFIDs as fit, but keep counting so the caller learns the
  // total number found.
  uint32_t i = 0;
  for (int afid : cper_decode(cper)) {
    if (i < *num_afids) {
      afids[i] = afid;
    }
    ++i;
  }
  *num_afids = i;

  return AMDSMI_STATUS_SUCCESS;
}
// Lists the compute processes running on a GPU.
//
// max_processes: in: capacity of `list`; out: number of processes currently
//                running. Passing 0 only queries the count.
// list:          output array; required only when *max_processes is non-zero
//                and at least one process is running.
//
// Returns AMDSMI_STATUS_INVAL when max_processes is null or `list` is null
// while entries need to be written, AMDSMI_STATUS_OUT_OF_RESOURCES when the
// provided capacity is smaller than the number of running processes (the
// first `capacity` entries are still written), AMDSMI_STATUS_SUCCESS
// otherwise. A failed handle lookup is returned as is.
amdsmi_status_t
amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_proc_info_t *list) {
  AMDSMI_CHECK_INIT();

  if (max_processes == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
  const amdsmi_status_t handle_status =
      get_gpu_device_from_handle(processor_handle, &gpu_device);
  if (handle_status != AMDSMI_STATUS_SUCCESS) {
    return handle_status;
  }

  auto compute_process_list = gpu_device->amdgpu_get_compute_process_list();
  const auto running_count = static_cast<uint32_t>(compute_process_list.size());

  // Count-only query, or nothing to report: publish the count and stop.
  if (*max_processes == 0 || compute_process_list.empty()) {
    *max_processes = running_count;
    return AMDSMI_STATUS_SUCCESS;
  }

  if (list == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // Remember the caller's capacity before max_processes is overwritten.
  const uint32_t capacity = *max_processes;

  // Copy entries until either the processes or the capacity run out.
  uint32_t written = 0;
  for (auto it = compute_process_list.begin();
       it != compute_process_list.end() && written < capacity; ++it) {
    list[written] = static_cast<amdsmi_proc_info_t>(it->second);
    ++written;
  }

  // Report the actual number of running processes back to the caller.
  *max_processes = running_count;

  if (capacity >= running_count) {
    return AMDSMI_STATUS_SUCCESS;
  }
  return AMDSMI_STATUS_OUT_OF_RESOURCES;
}
/**
 * Collects power and voltage readings for a GPU.
 *
 * Every field of @p info is first set to the 0xFFFF "not available" sentinel and is
 * only overwritten by values that were actually retrieved.
 *
 * @param processor_handle  Handle of the GPU to query.
 * @param info              Output structure; must not be null.
 *
 * @return AMDSMI_STATUS_INVAL for a null @p info, the handle-lookup error if the
 *         handle is not a GPU, otherwise the status of amdsmi_get_gpu_metrics_info().
 *         A failure to read the power cap is deliberately not reported.
 */
amdsmi_status_t
amdsmi_get_power_info(amdsmi_processor_handle processor_handle, amdsmi_power_info_t *info) {
    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t status = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    // Start from "not available" for every field.
    info->socket_power = 0xFFFF;
    info->current_socket_power = 0xFFFF;
    info->average_socket_power = 0xFFFF;
    info->gfx_voltage = 0xFFFF;
    info->soc_voltage = 0xFFFF;
    info->mem_voltage = 0xFFFF;
    info->power_limit = 0xFFFF;

    amdsmi_gpu_metrics_t metrics = {};
    status = amdsmi_get_gpu_metrics_info(processor_handle, &metrics);
    if (status == AMDSMI_STATUS_SUCCESS) {
        info->current_socket_power = metrics.current_socket_power;
        info->average_socket_power = metrics.average_socket_power;
        info->gfx_voltage = metrics.voltage_gfx;
        info->soc_voltage = metrics.voltage_soc;
        info->mem_voltage = metrics.voltage_mem;

        // socket_power prefers the instantaneous reading and falls back to the average.
        // This is only evaluated on success: after a failed query the zero-initialised
        // metrics struct would otherwise be mistaken for a valid 0 reading.
        if (metrics.current_socket_power != 0xFFFF) {
            info->socket_power = metrics.current_socket_power;
        } else if (metrics.average_socket_power != 0xFFFF) {
            info->socket_power = metrics.average_socket_power;
        }
    }

    int power_limit = 0;
    // default the sensor_ind here to 0
    const amdsmi_status_t cap_status = smi_amdgpu_get_power_cap(gpu_device, 0, &power_limit);
    if (cap_status == AMDSMI_STATUS_SUCCESS) {
        info->power_limit = power_limit;
    }

    // Returning status from amdsmi_get_gpu_metrics_info() which should return SUCCESS
    // Getting power cap values may not be supported on all virtualized systems and should
    // not return a failure when the metrics values are ascertainable.
    return status;
}
/**
 * Retrieves the kernel driver version, date and name for a GPU.
 *
 * The version string comes from smi_amdgpu_get_driver_version(); the date and name
 * are read through libdrm's drmGetVersion() on the device's /dev/dri node. libdrm is
 * loaded dynamically for the duration of the call.
 *
 * @param processor_handle  Handle of the GPU to query.
 * @param info              Output structure; must not be null.
 *
 * @return AMDSMI_STATUS_INVAL for a null @p info, the handle-lookup error,
 *         AMDSMI_STATUS_NOT_SUPPORTED when the device has no DRM path,
 *         AMDSMI_STATUS_FILE_ERROR when the DRM node cannot be opened,
 *         the loader status when libdrm or one of its symbols cannot be loaded,
 *         AMDSMI_STATUS_DRM_ERROR when drmGetVersion() fails, otherwise the status of
 *         the last symbol load (success).
 */
amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_handle,
                                           amdsmi_driver_info_t *info) {
    AMDSMI_CHECK_INIT();

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }
    std::ostringstream ss;
    amdsmi_status_t status = AMDSMI_STATUS_SUCCESS;
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;

    int length = AMDSMI_MAX_STRING_LENGTH;

    // Get the driver version
    status = smi_amdgpu_get_driver_version(gpu_device,
                                           &length, info->driver_version);

    SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())

    std::string render_name = gpu_device->get_gpu_path();
    std::string path = "/dev/dri/" + render_name;
    if (render_name.empty()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    ScopedFD drm_fd(path.c_str(), O_RDWR | O_CLOEXEC);
    if (!drm_fd.valid()) {
        ss << __PRETTY_FUNCTION__
           << " | Failed to open " << path << ": " << strerror(errno)
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_FILE_ERROR;
    }

    amd::smi::AMDSmiLibraryLoader libdrm;
    status = libdrm.load(LIBDRM_AMDGPU_SONAME);
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load " LIBDRM_AMDGPU_SONAME ": " << strerror(errno)
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    // Function pointer types for the two libdrm entry points used below.
    typedef struct _drmVersion* (*drmGetVersion_t)(int fd);   // drmGetVersion
    drmGetVersion_t drm_get_version = nullptr;
    typedef void (*drmFreeVersion_t)(drmVersionPtr version);  // drmFreeVersion
    drmFreeVersion_t drm_free_version = nullptr;

    status = libdrm.load_symbol(
        reinterpret_cast<drmGetVersion_t *>(&drm_get_version), "drmGetVersion");
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load drmGetVersion symbol"
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    // The destination is a drmFreeVersion_t, so cast to that pointer type rather than
    // to the unrelated drmGetVersion_t.
    status = libdrm.load_symbol(
        reinterpret_cast<drmFreeVersion_t *>(&drm_free_version), "drmFreeVersion");
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load drmFreeVersion symbol"
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    // Get the driver date
    std::string driver_date;
    auto version = drm_get_version(drm_fd);
    if (version == nullptr) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to get driver version"
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_DRM_ERROR;
    }
    driver_date = version->date;

    // Reformat the driver date from 20150101 to 2015/01/01 00:00
    if (driver_date.length() == 8) {
        driver_date = driver_date.substr(0, 4) + "/" + driver_date.substr(4, 2)
                      + "/" + driver_date.substr(6, 2) + " 00:00";
    }
    // strncpy() does not terminate the destination when the source fills it and the
    // caller's struct is not cleared here, so terminate explicitly.
    strncpy(info->driver_date, driver_date.c_str(), AMDSMI_MAX_STRING_LENGTH-1);
    info->driver_date[AMDSMI_MAX_STRING_LENGTH-1] = '\0';

    // Get the driver name
    std::string driver_name = version->name;
    strncpy(info->driver_name, driver_name.c_str(), AMDSMI_MAX_STRING_LENGTH-1);
    info->driver_name[AMDSMI_MAX_STRING_LENGTH-1] = '\0';

    drm_free_version(version);
    libdrm.unload();

    ss << __PRETTY_FUNCTION__
       << " | Driver version: " << info->driver_version << "\n"
       << " | Driver date: " << info->driver_date << "\n"
       << " | Driver name: " << info->driver_name << "\n"
       << " | Returning: " << smi_amdgpu_get_status_string(status, false);
    LOG_INFO(ss);
    return status;
}
/**
 * Fills in static PCIe capabilities and current PCIe metrics for a GPU.
 *
 * Static data (maximum link width/speed) is read from the device's sysfs nodes under
 * /sys/class/drm/<gpu path>/device/, the slot type comes from
 * rsmi_dev_pcie_slot_type_get, and the live counters come from the GPU metrics table.
 *
 * @param processor_handle  Handle of the GPU to query.
 * @param info              Output structure; zeroed before being populated. Must not
 *                          be null.
 *
 * @return AMDSMI_STATUS_INVAL for a null @p info, the handle-lookup error,
 *         AMDSMI_STATUS_NOT_SUPPORTED when max_link_width cannot be opened,
 *         AMDSMI_STATUS_API_FAILED when max_link_speed cannot be opened,
 *         the error from amdsmi_get_gpu_metrics_info(), or AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, amdsmi_pcie_info_t *info) {
    AMDSMI_CHECK_INIT();

    std::ostringstream ss;

    if (info == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    amdsmi_status_t status = AMDSMI_STATUS_SUCCESS;
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;

    SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())

    FILE* fp;
    double pcie_speed = 0;
    unsigned pcie_width = 0;

    memset((void *)info, 0, sizeof(*info));

    std::string path_max_link_width = "/sys/class/drm/" +
        gpu_device->get_gpu_path() + "/device/max_link_width";
    fp = fopen(path_max_link_width.c_str(), "r");
    if (fp) {
        // pcie_width is unsigned, so it must be scanned with %u; an unparsable file
        // leaves the width at 0.
        if (fscanf(fp, "%u", &pcie_width) != 1) {
            pcie_width = 0;
        }
        fclose(fp);
    } else {
        ss << __PRETTY_FUNCTION__
           << " | Failed to open file: " << path_max_link_width
           << " | returning AMDSMI_STATUS_NOT_SUPPORTED";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    info->pcie_static.max_pcie_width = (uint16_t)pcie_width;

    std::string path_max_link_speed = "/sys/class/drm/" +
        gpu_device->get_gpu_path() + "/device/max_link_speed";
    fp = fopen(path_max_link_speed.c_str(), "r");
    if (fp) {
        // Only the leading number is needed; the trailing unit text is not consumed,
        // which also avoids an unbounded %s write into a fixed-size buffer.
        if (fscanf(fp, "%lf", &pcie_speed) != 1) {
            pcie_speed = 0;
        }
        fclose(fp);
    } else {
        // Report through the library logger like the max_link_width path above rather
        // than writing to the host application's stdout.
        ss << __PRETTY_FUNCTION__
           << " | Failed to open file: " << path_max_link_speed
           << " | returning AMDSMI_STATUS_API_FAILED";
        LOG_ERROR(ss);
        return AMDSMI_STATUS_API_FAILED;
    }

    // pcie speed in sysfs returns in GT/s
    info->pcie_static.max_pcie_speed = static_cast<uint32_t>(pcie_speed * 1000);

    // Derive the PCIe generation from the maximum link speed; unknown speeds map to 0.
    switch (info->pcie_static.max_pcie_speed) {
        case 2500:
            info->pcie_static.pcie_interface_version = 1;
            break;
        case 5000:
            info->pcie_static.pcie_interface_version = 2;
            break;
        case 8000:
            info->pcie_static.pcie_interface_version = 3;
            break;
        case 16000:
            info->pcie_static.pcie_interface_version = 4;
            break;
        case 32000:
            info->pcie_static.pcie_interface_version = 5;
            break;
        case 64000:
            info->pcie_static.pcie_interface_version = 6;
            break;
        default:
            info->pcie_static.pcie_interface_version = 0;
    }

    // default to PCIe
    info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_PCIE;
    rsmi_pcie_slot_type_t slot_type;
    status = rsmi_wrapper(rsmi_dev_pcie_slot_type_get, processor_handle, 0,
                          &slot_type);
    // A failed slot-type query is tolerated: the PCIe default above is kept.
    if (status == AMDSMI_STATUS_SUCCESS) {
        switch (slot_type) {
            case RSMI_PCIE_SLOT_PCIE:
                info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_PCIE;
                break;
            case RSMI_PCIE_SLOT_OAM:
                info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_OAM;
                break;
            case RSMI_PCIE_SLOT_CEM:
                info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_CEM;
                break;
            default:
                info->pcie_static.slot_type = AMDSMI_CARD_FORM_FACTOR_UNKNOWN;
        }
    }

    // metrics
    amdsmi_gpu_metrics_t metric_info = {};
    status = amdsmi_get_gpu_metrics_info(
        processor_handle, &metric_info);
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    info->pcie_metric.pcie_width = metric_info.pcie_link_width;
    // gpu metrics is inconsistent with pcie_speed values, if 0-6 then it needs to be translated
    if (metric_info.pcie_link_speed <= 6) {
        status = smi_amdgpu_get_pcie_speed_from_pcie_type(metric_info.pcie_link_speed, &info->pcie_metric.pcie_speed);  // mapping to MT/s
    } else {
        // gpu metrics returns pcie link speed in .1 GT/s ex. 160 vs 16
        info->pcie_metric.pcie_speed = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_speed)>
            (metric_info.pcie_link_speed, (metric_info.pcie_link_speed * 100));
    }

    // additional pcie related metrics
    /**
     * pcie_metric.pcie_bandwidth: MB/s (uint32_t)
     * metric_info.pcie_bandwidth_inst: GB/s (uint64_t)
     */
    info->pcie_metric.pcie_bandwidth = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_bandwidth)>
        (metric_info.pcie_bandwidth_inst, metric_info.pcie_bandwidth_inst);
    info->pcie_metric.pcie_replay_count = metric_info.pcie_replay_count_acc;
    info->pcie_metric.pcie_l0_to_recovery_count = metric_info.pcie_l0_to_recov_count_acc;
    info->pcie_metric.pcie_replay_roll_over_count = metric_info.pcie_replay_rover_count_acc;
    /**
     * pcie_metric.pcie_nak_received_count: (uint64_t)
     * metric_info.pcie_nak_rcvd_count_acc: (uint32_t)
     */
    info->pcie_metric.pcie_nak_received_count = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_nak_received_count)>
        (metric_info.pcie_nak_rcvd_count_acc, (metric_info.pcie_nak_rcvd_count_acc));
    /**
     * pcie_metric.pcie_nak_sent_count: (uint64_t)
     * metric_info.pcie_nak_sent_count_acc: (uint32_t)
     */
    info->pcie_metric.pcie_nak_sent_count = translate_umax_or_assign_value<decltype(info->pcie_metric.pcie_nak_sent_count)>
        (metric_info.pcie_nak_sent_count_acc, (metric_info.pcie_nak_sent_count_acc));
    /**
     * pcie_metric.pcie_lc_perf_other_end_recovery: (uint32_t)
     */
    info->pcie_metric.pcie_lc_perf_other_end_recovery_count =
        translate_umax_or_assign_value<decltype(
            info->pcie_metric.pcie_lc_perf_other_end_recovery_count)> (
                metric_info.pcie_lc_perf_other_end_recovery,
                (metric_info.pcie_lc_perf_other_end_recovery));

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Queries the XCD counter of a GPU by forwarding to rsmi_dev_metrics_xcd_counter_get
 * through rsmi_wrapper.
 *
 * @param processor_handle  Handle of the GPU to query.
 * @param xcd_count         Output location handed to the wrapped call unchanged.
 *
 * @return The status produced by rsmi_wrapper.
 */
amdsmi_status_t amdsmi_get_gpu_xcd_counter(amdsmi_processor_handle processor_handle,
                                           uint16_t *xcd_count) {
    const amdsmi_status_t wrapper_status =
        rsmi_wrapper(rsmi_dev_metrics_xcd_counter_get, processor_handle, 0, xcd_count);
    return wrapper_status;
}
/**
 * Finds the processor handle whose PCI address matches @p bdf.
 *
 * Walks every socket and every processor on it, comparing domain, bus, device and
 * function numbers against the requested address.
 *
 * @param bdf               PCI address (domain:bus:device.function) to look for.
 * @param processor_handle  Receives the matching handle; must not be null.
 *
 * @return AMDSMI_STATUS_SUCCESS when a match was found, AMDSMI_STATUS_INVAL for a null
 *         output pointer, the first error returned while enumerating sockets or
 *         processors or resolving a handle, or AMDSMI_STATUS_API_FAILED when no
 *         processor has the requested address.
 */
amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf,
                                                     amdsmi_processor_handle* processor_handle)
{
    amdsmi_status_t status;
    uint32_t socket_count = 0;

    AMDSMI_CHECK_INIT();

    if (processor_handle == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    // First call only retrieves the number of sockets.
    status = amdsmi_get_socket_handles(&socket_count, nullptr);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    // data() is valid on an empty vector, unlike &sockets[0].
    std::vector<amdsmi_socket_handle> sockets(socket_count);
    status = amdsmi_get_socket_handles(&socket_count, sockets.data());
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    std::ostringstream bdf_sstream;
    bdf_sstream << __PRETTY_FUNCTION__
                << " | [bdf] domain_number:" << "bus_number:" << "device_number."
                << "function_number = ";
    bdf_sstream << std::hex << std::setfill('0') << std::setw(4) << bdf.domain_number << ":";
    bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << bdf.bus_number << ":";
    bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << bdf.device_number << ".";
    bdf_sstream << std::hex << std::setfill('0') << +bdf.function_number;
    LOG_DEBUG(bdf_sstream);

    for (unsigned int i = 0; i < socket_count; i++) {
        // Get the processor count available for the socket.
        uint32_t processor_count = 0;
        status = amdsmi_get_processor_handles(sockets[i], &processor_count, nullptr);
        if (status != AMDSMI_STATUS_SUCCESS) {
            return status;
        }

        // Allocate the memory for the device handlers on the socket
        std::vector<amdsmi_processor_handle> processor_handles(processor_count);

        // Get all processors of the socket
        status = amdsmi_get_processor_handles(sockets[i], &processor_count,
                                              processor_handles.data());
        if (status != AMDSMI_STATUS_SUCCESS) {
            return status;
        }

        for (uint32_t idx = 0; idx < processor_count; idx++) {
            amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
            status = get_gpu_device_from_handle(processor_handles[idx], &gpu_device);
            if (status != AMDSMI_STATUS_SUCCESS) {
                return status;
            }

            amdsmi_bdf_t found_bdf = gpu_device->get_bdf();
            bdf_sstream << __PRETTY_FUNCTION__
                        << " | [found_bdf] domain_number:" << "bus_number:" << "device_number."
                        << "function_number = ";
            bdf_sstream << std::hex << std::setfill('0') << std::setw(4)
                        << found_bdf.domain_number << ":";
            bdf_sstream << std::hex << std::setfill('0') << std::setw(2)
                        << found_bdf.bus_number << ":";
            bdf_sstream << std::hex << std::setfill('0') << std::setw(2)
                        << found_bdf.device_number << ".";
            bdf_sstream << std::hex << std::setfill('0')
                        << +found_bdf.function_number;
            LOG_DEBUG(bdf_sstream);

            if ((bdf.bus_number == found_bdf.bus_number) &&
                (bdf.device_number == found_bdf.device_number) &&
                (bdf.domain_number == found_bdf.domain_number) &&
                (bdf.function_number == found_bdf.function_number)) {
                *processor_handle = processor_handles[idx];
                return AMDSMI_STATUS_SUCCESS;
            }
        }
    }

    // No processor on any socket carries the requested address.
    return AMDSMI_STATUS_API_FAILED;
}
/**
 * Lists the processors reachable from @p processor_handle over links of the requested
 * type, nearest first.
 *
 * A peer is included when it is P2P-accessible, its link type equals @p link_type and
 * its link weight can be read. Peers are ordered by ascending hop count, and by
 * ascending link weight among peers with the same hop count.
 *
 * @param processor_handle       Processor whose neighbours are being queried.
 * @param link_type              Link type to filter on; must lie within
 *                               [AMDSMI_LINK_TYPE_INTERNAL, AMDSMI_LINK_TYPE_UNKNOWN].
 * @param topology_nearest_info  Output: `count` peers and their handles in
 *                               `processor_list`; unused entries are set to null.
 *
 * @return AMDSMI_STATUS_INVAL for a null output pointer or an out-of-range link type,
 *         the error from socket/processor enumeration, or AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t
amdsmi_get_link_topology_nearest(amdsmi_processor_handle processor_handle,
                                 amdsmi_link_type_t link_type,
                                 amdsmi_topology_nearest_t* topology_nearest_info)
{
    if (topology_nearest_info == nullptr) {
        return amdsmi_status_t::AMDSMI_STATUS_INVAL;
    }

    if (link_type < amdsmi_link_type_t::AMDSMI_LINK_TYPE_INTERNAL ||
        link_type > amdsmi_link_type_t::AMDSMI_LINK_TYPE_UNKNOWN) {
        return amdsmi_status_t::AMDSMI_STATUS_INVAL;
    }

    auto status(amdsmi_status_t::AMDSMI_STATUS_SUCCESS);

    // One candidate peer together with the keys it is ranked by.
    struct LinkTopologyInfo_t
    {
        amdsmi_processor_handle target_processor_handle;
        amdsmi_link_type_t link_type;
        bool is_accessible;
        uint64_t num_hops;
        uint64_t link_weight;
    };

    /*
     *  Note: The link topology table is sorted by the number of hops and link weight.
     *  The comparator is a "greater than" on (num_hops, link_weight), which turns the
     *  priority queue into a min-heap so the nearest peer is popped first. It must be
     *  a strict weak ordering: equal elements compare false in both directions.
     */
    struct LinkTopologyOrderCmp_t {
        constexpr bool operator()(const LinkTopologyInfo_t& left,
                                  const LinkTopologyInfo_t& right) const noexcept
        {
            if (left.num_hops != right.num_hops) {
                return (left.num_hops > right.num_hops);
            }
            return (left.link_weight > right.link_weight);
        }
    };
    std::priority_queue<LinkTopologyInfo_t,
                        std::vector<LinkTopologyInfo_t>,
                        LinkTopologyOrderCmp_t> link_topology_order{};

    AMDSMI_CHECK_INIT();

    auto socket_counter = uint32_t(0);
    if (auto api_status = amdsmi_get_socket_handles(&socket_counter, nullptr);
        (api_status != amdsmi_status_t::AMDSMI_STATUS_SUCCESS)) {
        return api_status;
    }

    // std::vector instead of a variable-length array: standard C++ and valid for a
    // socket count of zero.
    std::vector<amdsmi_socket_handle> socket_list(socket_counter);
    if (auto api_status = amdsmi_get_socket_handles(&socket_counter, socket_list.data());
        (api_status != amdsmi_status_t::AMDSMI_STATUS_SUCCESS)) {
        return api_status;
    }

    const uint32_t max_entries = AMDSMI_MAX_DEVICES * AMDSMI_MAX_NUM_XCP;
    amdsmi_processor_handle device_list[AMDSMI_MAX_DEVICES * AMDSMI_MAX_NUM_XCP];
    for (auto socket_idx = uint32_t(0);
         (socket_idx < socket_counter) && (socket_idx < socket_list.size());
         ++socket_idx) {
        // device_counter is an in/out parameter (capacity in, count out), so it has to
        // be reset to the full capacity for every socket.
        uint32_t device_counter = max_entries;
        if (auto api_status = amdsmi_get_processor_handles(socket_list[socket_idx], &device_counter, device_list);
            (api_status != amdsmi_status_t::AMDSMI_STATUS_SUCCESS)) {
            return api_status;
        }

        for (auto device_idx = uint32_t(0); device_idx < device_counter; ++device_idx) {
            /*  Note: Skip the processor handle that is being queried. */
            if (processor_handle == device_list[device_idx]) {
                continue;
            }

            // Accessibility?
            auto is_accessible(false);
            if (auto api_status = amdsmi_is_P2P_accessible(processor_handle, device_list[device_idx], &is_accessible);
                (api_status != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) || !is_accessible) {
                continue;
            }

            // Link type matches what we are searching for?
            auto link_type_new = link_type;
            auto num_hops = uint64_t(0);
            if (auto api_status = amdsmi_topo_get_link_type(processor_handle, device_list[device_idx], &num_hops, &link_type_new);
                (api_status != amdsmi_status_t::AMDSMI_STATUS_SUCCESS) || (link_type_new != link_type)) {
                continue;
            }

            // Link weights
            auto link_weight = uint64_t(0);
            if (auto api_status = amdsmi_topo_get_link_weight(processor_handle, device_list[device_idx], &link_weight);
                (api_status != amdsmi_status_t::AMDSMI_STATUS_SUCCESS)) {
                continue;
            }

            // Topology nearest info
            LinkTopologyInfo_t link_info = {
                .target_processor_handle = device_list[device_idx],
                .link_type = link_type,
                .is_accessible = is_accessible,
                .num_hops = num_hops,
                .link_weight = link_weight
            };
            link_topology_order.push(link_info);
        }
    }

    // Clear every slot of the output list. (Assigning to index max_entries, as a
    // single statement, would write one element past the end of the array.)
    for (uint32_t slot = 0; slot < max_entries; ++slot) {
        topology_nearest_info->processor_list[slot] = nullptr;
    }
    topology_nearest_info->count = static_cast<uint32_t>(link_topology_order.size());

    // Drain the queue: peers come out nearest first.
    auto topology_nearest_counter = uint32_t(0);
    while (!link_topology_order.empty()) {
        auto link_info = link_topology_order.top();
        link_topology_order.pop();
        if (topology_nearest_counter < max_entries) {
            topology_nearest_info->processor_list[topology_nearest_counter++] = link_info.target_processor_handle;
        }
    }

    return status;
}
// Printable name for each amdsmi_virtualization_mode_t value; used when logging the
// detected virtualization mode.
static const std::map<amdsmi_virtualization_mode_t, std::string>
virtualization_mode_map{
    {AMDSMI_VIRTUALIZATION_MODE_UNKNOWN,     "UNKNOWN"},
    {AMDSMI_VIRTUALIZATION_MODE_BAREMETAL,   "BAREMETAL"},
    {AMDSMI_VIRTUALIZATION_MODE_HOST,        "HOST"},
    {AMDSMI_VIRTUALIZATION_MODE_GUEST,       "GUEST"},
    {AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH, "PASSTHROUGH"},
};
/**
 * Determines the virtualization mode (bare metal, guest, passthrough) of a GPU.
 *
 * libdrm is loaded dynamically; the DRM driver version is checked against the minimum
 * that exposes the mode (3.62.0), then AMDGPU_INFO_DEV_INFO is queried and the mode
 * bits are extracted from `ids_flags`.
 *
 * @param processor_handle  Handle of the GPU to query.
 * @param mode              Receives the detected mode; set to
 *                          AMDSMI_VIRTUALIZATION_MODE_UNKNOWN on failure. Must not be
 *                          null.
 *
 * @return AMDSMI_STATUS_INVAL for a null @p mode, the handle-lookup error,
 *         AMDSMI_STATUS_NOT_SUPPORTED when the device has no DRM path or the driver is
 *         older than 3.62.0, AMDSMI_STATUS_FILE_ERROR when the DRM node cannot be
 *         opened, the loader status when libdrm or a symbol cannot be loaded,
 *         AMDSMI_STATUS_DRM_ERROR when drmGetVersion() or the info ioctl fails,
 *         otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t
amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
                                   amdsmi_virtualization_mode_t *mode) {
    AMDSMI_CHECK_INIT();

    std::ostringstream ss;
    ss << __PRETTY_FUNCTION__ << " | start";
    LOG_INFO(ss);
    if (mode == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    struct drm_amdgpu_info_device dev_info = {};
    *mode = AMDSMI_VIRTUALIZATION_MODE_UNKNOWN;

    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t r = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (r != AMDSMI_STATUS_SUCCESS) {
        return r;
    }

    amdsmi_status_t status;
    SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())

    std::string render_name = gpu_device->get_gpu_path();
    std::string path = "/dev/dri/" + render_name;
    if (render_name.empty()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    ScopedFD drm_fd(path.c_str(), O_RDWR | O_CLOEXEC);
    if (!drm_fd.valid()) {
        ss << __PRETTY_FUNCTION__
           << " | Failed to open " << path << ": " << strerror(errno)
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_FILE_ERROR;
    }

    amd::smi::AMDSmiLibraryLoader libdrm;
    status = libdrm.load(LIBDRM_AMDGPU_SONAME);
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load " LIBDRM_AMDGPU_SONAME ": " << strerror(errno)
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    typedef drmVersionPtr (*drmGetVersion_t)(int fd);
    typedef void (*drmFreeVersion_t)(drmVersionPtr version);
    drmGetVersion_t drm_get_version = nullptr;
    drmFreeVersion_t drm_free_version = nullptr;

    // Load the drmGetVersion symbol
    status = libdrm.load_symbol(reinterpret_cast<drmGetVersion_t *>(&drm_get_version),
                                "drmGetVersion");
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load drmGetVersion symbol"
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    // Load the drmFreeVersion symbol
    status = libdrm.load_symbol(reinterpret_cast<drmFreeVersion_t *>(&drm_free_version),
                                "drmFreeVersion");
    if (status != AMDSMI_STATUS_SUCCESS) {
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load drmFreeVersion symbol"
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    // get drm version. If it's older than 3.62.0, then say not supported and exit.
    auto drm_version = drm_get_version(drm_fd);
    if (drm_version == nullptr) {
        // drmGetVersion() reports failure with a null pointer; bail out before the
        // version fields are read.
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to get driver version"
           << "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false);
        LOG_ERROR(ss);
        return AMDSMI_STATUS_DRM_ERROR;
    }

    // minimum version that supports getting of virtualization mode
    const int major_version = 3;
    const int minor_version = 62;
    const int patch_version = 0;

    // Compare (major, minor, patch) lexicographically. Checking each component
    // independently would reject newer releases such as 4.0.0 or 3.63.0 whose lower
    // components are smaller than the minimum's.
    bool isDRMVersionSupported = false;
    if (drm_version->version_major != major_version) {
        isDRMVersionSupported = (drm_version->version_major > major_version);
    } else if (drm_version->version_minor != minor_version) {
        isDRMVersionSupported = (drm_version->version_minor > minor_version);
    } else {
        isDRMVersionSupported = (drm_version->version_patchlevel >= patch_version);
    }

    ss << __PRETTY_FUNCTION__ << " | drm_version: "
       << std::dec << drm_version->version_major << "." << drm_version->version_minor
       << "." << drm_version->version_patchlevel << "\n"
       << " | isDRMVersionSupported: " << (isDRMVersionSupported ? "TRUE" : "FALSE") << "\n"
       << " | Expecting version >= " << major_version << "." << minor_version
       << "." << patch_version << "\n"
       << "; Returning: " << (isDRMVersionSupported ?
              smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, false):
              smi_amdgpu_get_status_string(AMDSMI_STATUS_NOT_SUPPORTED, false));
    LOG_INFO(ss);

    // Check if the version is supported
    // If not, then return not supported
    if (!isDRMVersionSupported) {
        drm_free_version(drm_version);
        libdrm.unload();
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    // Get the device info
    typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex,
                                     void *data, unsigned long size);
    drmCommandWrite_t drmCommandWrite = nullptr;

    // load symbol from libdrm
    status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
                                "drmCommandWrite");
    if (status != AMDSMI_STATUS_SUCCESS) {
        drm_free_version(drm_version);
        libdrm.unload();
        ss << __PRETTY_FUNCTION__
           << " | Failed to load drmCommandWrite symbol: " << strerror(errno)
           << "; Returning: " << smi_amdgpu_get_status_string(status, false);
        LOG_ERROR(ss);
        return status;
    }

    // Get the device info
    memset(&dev_info, 0, sizeof(struct drm_amdgpu_info_device));
    struct drm_amdgpu_info request = {};
    memset(&request, 0, sizeof(request));
    request.return_pointer = reinterpret_cast<unsigned long long>(&dev_info);
    request.return_size = sizeof(struct drm_amdgpu_info_device);
    request.query = AMDGPU_INFO_DEV_INFO;
    auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
                                     sizeof(struct drm_amdgpu_info));

    ss << __PRETTY_FUNCTION__
       << " | drm_fd: " << std::dec << drm_fd << "\n"
       << " | path: " << path << "\n"
       << " | drmCommandWrite: " << drm_write << "\n"
       << " | drmCommandWrite returned: " << strerror(errno) << "\n"
       << " | dev_info.ids_flags: " << dev_info.ids_flags << "\n"
       << " | dev_info.ids_flags size: " << sizeof(dev_info.ids_flags) << "\n"
       << " | dev_info.pci_rev: 0x"
       << std::setw(4) << std::setfill('0') << std::hex << dev_info.pci_rev << "\n"
       << " | dev_info.device_id: 0x"
       << std::setw(4) << std::setfill('0') << std::hex << dev_info.device_id;
    LOG_INFO(ss);

    if (drm_write == 0) {
        // The mode lives in a bit-field of ids_flags.
        uint32_t ids_flag = ((dev_info.ids_flags & AMDGPU_IDS_FLAGS_MODE_MASK)
                             >> AMDGPU_IDS_FLAGS_MODE_SHIFT);
        switch (ids_flag) {
            case 0:  *mode = AMDSMI_VIRTUALIZATION_MODE_BAREMETAL; break;
            case 1:  *mode = AMDSMI_VIRTUALIZATION_MODE_GUEST; break;
            case 2:  *mode = AMDSMI_VIRTUALIZATION_MODE_PASSTHROUGH; break;
            default: *mode = AMDSMI_VIRTUALIZATION_MODE_UNKNOWN; break;
        }

        // Resolve the printable name with a single lookup.
        std::string mode_str = "UNKNOWN";
        const auto mode_entry = virtualization_mode_map.find(*mode);
        if (mode_entry != virtualization_mode_map.end()) {
            mode_str = mode_entry->second;
        }
        ss << __PRETTY_FUNCTION__
           << " | ids_flag: " << std::dec << ids_flag << "\n"
           << " | dev_info.ids_flags: 0x"
           << std::hex << std::setw(8) << std::setfill('0') << dev_info.ids_flags << "\n"
           << " | *mode: " << mode_str << "\n"
           << " | Returning: " << smi_amdgpu_get_status_string(status, false)
           << std::endl;
        LOG_INFO(ss);
    } else {
        ss << __PRETTY_FUNCTION__
           << " | Failed to get device info: " << strerror(errno)
           << " | returning AMDSMI_STATUS_DRM_ERROR";
        LOG_ERROR(ss);
        *mode = AMDSMI_VIRTUALIZATION_MODE_UNKNOWN;
        status = AMDSMI_STATUS_DRM_ERROR;
    }

    drm_free_version(drm_version);
    libdrm.unload();
    return status;
}
/**
 * Returns the CPU affinity bitmask of a GPU for the requested scope.
 *
 * @param processor_handle  Handle of the GPU to query; must not be null.
 * @param cpu_set_size      Number of uint64_t words available in @p cpu_set; must be
 *                          non-zero.
 * @param cpu_set           Output bitmask; zeroed first, then filled with the words
 *                          produced for the device. Must not be null.
 * @param scope             AMDSMI_AFFINITY_SCOPE_NODE uses the device's NUMA node,
 *                          AMDSMI_AFFINITY_SCOPE_SOCKET uses the local CPU list of the
 *                          device's DRM card.
 *
 * @return AMDSMI_STATUS_INVAL for invalid arguments, the error of the handle or NUMA
 *         lookups, AMDSMI_STATUS_NOT_FOUND when the NUMA affinity is negative,
 *         AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS for an unknown scope,
 *         AMDSMI_STATUS_UNEXPECTED_SIZE when the device produced no bitmask words,
 *         AMDSMI_STATUS_REFCOUNT_OVERFLOW when the bitmask's first word equals the
 *         int32 maximum (treated as a failure marker by the existing code), otherwise
 *         AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_cpu_affinity_with_scope(amdsmi_processor_handle processor_handle,
            uint32_t cpu_set_size, uint64_t *cpu_set, amdsmi_affinity_scope_t scope)
{
    AMDSMI_CHECK_INIT();

    if (processor_handle == nullptr || cpu_set == nullptr || cpu_set_size == 0) {
        return AMDSMI_STATUS_INVAL;
    }

    // Retrieve GPU device from the processor handle
    amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
    amdsmi_status_t status = get_gpu_device_from_handle(processor_handle, &gpu_device);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    uint32_t numa_node;
    status = amdsmi_topo_get_numa_node_number(processor_handle, &numa_node);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    int32_t node_id = static_cast<int32_t>(numa_node);
    status = amdsmi_get_gpu_topo_numa_affinity(processor_handle, &node_id);
    if (status != AMDSMI_STATUS_SUCCESS) {
        return status;
    }

    if (node_id < 0) {
        return AMDSMI_STATUS_NOT_FOUND;
    }

    std::memset(cpu_set, 0, cpu_set_size * sizeof(uint64_t));

    // Shared tail of both scopes: validate the produced bitmask and copy it out.
    // The copy is bounded by the vector's real size so a shorter-than-requested
    // result can never be over-read; an empty result is rejected before bitmask[0]
    // is inspected.
    const auto export_bitmask = [cpu_set, cpu_set_size](
            const std::vector<uint64_t>& bitmask) -> amdsmi_status_t {
        if (bitmask.empty()) {
            return AMDSMI_STATUS_UNEXPECTED_SIZE;
        }
        if (bitmask[0] == std::numeric_limits<int32_t>::max()) {
            return AMDSMI_STATUS_REFCOUNT_OVERFLOW;
        }
        const size_t words = std::min<size_t>(bitmask.size(), cpu_set_size);
        std::memcpy(cpu_set, bitmask.data(), words * sizeof(uint64_t));
        return AMDSMI_STATUS_SUCCESS;
    };

    switch (scope) {
        case AMDSMI_AFFINITY_SCOPE_NODE:
            return export_bitmask(
                gpu_device->get_bitmask_from_numa_node(node_id, cpu_set_size));

        case AMDSMI_AFFINITY_SCOPE_SOCKET:
        {
            uint32_t drm_card = gpu_device->get_card_id();
            return export_bitmask(
                gpu_device->get_bitmask_from_local_cpulist(drm_card, cpu_set_size));
        }

        default:
            return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
    }
}
#ifdef ENABLE_ESMI_LIB
/**
 * Translates a status value that actually carries an esmi_status_t code into the
 * corresponding amdsmi_status_t by scanning amd::smi::esmi_status_map.
 *
 * @param status  ESMI result code that was cast to amdsmi_status_t by the caller.
 *
 * @return The mapped AMD SMI status, or AMDSMI_STATUS_SUCCESS when the code has no
 *         entry in the map.
 *
 * NOTE(review): an unmapped error code is reported as success — confirm this
 * fallback is intended.
 */
static amdsmi_status_t amdsmi_errno_to_esmi_status(amdsmi_status_t status)
{
    const auto wanted = static_cast<esmi_status_t>(status);
    const auto match = std::find_if(std::begin(amd::smi::esmi_status_map),
                                    std::end(amd::smi::esmi_status_map),
                                    [wanted](const auto& entry) {
                                        return entry.first == wanted;
                                    });
    if (match != std::end(amd::smi::esmi_status_map)) {
        return match->second;
    }
    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Reads the number of threads per CPU core through esmi_threads_per_core_get().
 *
 * @param threads_per_core  Receives the thread count; must not be null.
 *
 * @return AMDSMI_STATUS_INVAL for a null output pointer, the translated ESMI error on
 *         failure, otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_threads_per_core(uint32_t *threads_per_core)
{
    amdsmi_status_t status;
    uint32_t esmi_threads_per_core;

    AMDSMI_CHECK_INIT();

    // Reject a null output pointer up front instead of dereferencing it below.
    if (threads_per_core == nullptr)
        return AMDSMI_STATUS_INVAL;

    status = static_cast<amdsmi_status_t>(esmi_threads_per_core_get(&esmi_threads_per_core));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *threads_per_core = esmi_threads_per_core;

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Reads the HSMP protocol version through esmi_hsmp_proto_ver_get().
 *
 * @param processor_handle  CPU processor handle; only checked for null here.
 * @param proto_ver         Receives the protocol version; must not be null.
 *
 * @return AMDSMI_STATUS_INVAL for a null handle or output pointer, the translated ESMI
 *         error on failure, otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_cpu_hsmp_proto_ver(amdsmi_processor_handle processor_handle,
                                              uint32_t *proto_ver)
{
    amdsmi_status_t status;
    uint32_t hsmp_proto_ver;

    AMDSMI_CHECK_INIT();

    // The output pointer is validated together with the handle so it is never
    // dereferenced when null.
    if (processor_handle == nullptr || proto_ver == nullptr)
        return AMDSMI_STATUS_INVAL;

    status = static_cast<amdsmi_status_t>(esmi_hsmp_proto_ver_get(&hsmp_proto_ver));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *proto_ver = hsmp_proto_ver;

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Reads the HSMP driver version through esmi_hsmp_driver_version_get().
 *
 * @param processor_handle        CPU processor handle; only checked for null here.
 * @param amdsmi_hsmp_driver_ver  Receives the major and minor version numbers; must
 *                                not be null.
 *
 * @return AMDSMI_STATUS_INVAL for a null handle or output pointer, the translated ESMI
 *         error on failure, otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_cpu_hsmp_driver_version(amdsmi_processor_handle processor_handle,
                                                   amdsmi_hsmp_driver_version_t *amdsmi_hsmp_driver_ver)
{
    amdsmi_status_t status;
    struct hsmp_driver_version hsmp_driver_ver;

    AMDSMI_CHECK_INIT();

    // The output pointer is validated together with the handle so it is never
    // dereferenced when null.
    if (processor_handle == nullptr || amdsmi_hsmp_driver_ver == nullptr)
        return AMDSMI_STATUS_INVAL;

    status = static_cast<amdsmi_status_t>(esmi_hsmp_driver_version_get(&hsmp_driver_ver));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    amdsmi_hsmp_driver_ver->major = hsmp_driver_ver.major;
    amdsmi_hsmp_driver_ver->minor = hsmp_driver_ver.minor;

    return AMDSMI_STATUS_SUCCESS;
}
/**
 * Reads the SMU firmware version through esmi_smu_fw_version_get().
 *
 * @param processor_handle  CPU processor handle; only checked for null here.
 * @param amdsmi_smu_fw     Receives the major, minor and debug version fields; must
 *                          not be null.
 *
 * @return AMDSMI_STATUS_INVAL for a null handle or output pointer, the translated ESMI
 *         error on failure, otherwise AMDSMI_STATUS_SUCCESS.
 */
amdsmi_status_t amdsmi_get_cpu_smu_fw_version(amdsmi_processor_handle processor_handle,
                                              amdsmi_smu_fw_version_t *amdsmi_smu_fw)
{
    amdsmi_status_t status;
    struct smu_fw_version smu_fw;

    AMDSMI_CHECK_INIT();

    // The output pointer is validated together with the handle so it is never
    // dereferenced when null.
    if (processor_handle == nullptr || amdsmi_smu_fw == nullptr)
        return AMDSMI_STATUS_INVAL;

    status = static_cast<amdsmi_status_t>(esmi_smu_fw_version_get(&smu_fw));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    amdsmi_smu_fw->major = smu_fw.major;
    amdsmi_smu_fw->minor = smu_fw.minor;
    amdsmi_smu_fw->debug = smu_fw.debug;

    return AMDSMI_STATUS_SUCCESS;
}
// Read the energy counter of one CPU core through esmi_core_energy_get().
//
// The core index is parsed from the id string that
// amdsmi_get_processor_info() writes into proc_id.
//
// processor_handle: handle whose id is used as the core index; must not be null.
// penergy:          out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_core_energy(amdsmi_processor_handle processor_handle,
                uint64_t *penergy)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || penergy == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    // Keep the full 32-bit index: narrowing through uint8_t would wrap any
    // core index above 255 onto a different core.
    const uint32_t core_ind = static_cast<uint32_t>(std::stoi(proc_id, nullptr, 0));

    uint64_t core_input = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_core_energy_get(core_ind, &core_input));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *penergy = core_input;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the energy counter of one CPU socket through esmi_socket_energy_get().
//
// The socket index is parsed from the id string that
// amdsmi_get_processor_info() writes into proc_id.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// penergy:          out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_energy(amdsmi_processor_handle processor_handle,
                uint64_t *penergy)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || penergy == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint64_t pkg_input = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_socket_energy_get(sock_ind, &pkg_input));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *penergy = pkg_input;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the PROCHOT status of one CPU socket through esmi_prochot_status_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// prochot:          out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_prochot_status(amdsmi_processor_handle processor_handle,
                uint32_t *prochot)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || prochot == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t phot = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_prochot_status_get(sock_ind, &phot));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *prochot = phot;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the two clock values reported by esmi_fclk_mclk_get() for one socket.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// fclk, mclk:       out parameters; neither may be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_fclk_mclk(amdsmi_processor_handle processor_handle,
                uint32_t *fclk, uint32_t *mclk)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || fclk == nullptr || mclk == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t f_clk = 0;
    uint32_t m_clk = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_fclk_mclk_get(sock_ind, &f_clk, &m_clk));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *fclk = f_clk;
    *mclk = m_clk;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the core-clock limit of one socket through esmi_cclk_limit_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// cclk:             out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_cclk_limit(amdsmi_processor_handle processor_handle,
                uint32_t *cclk)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || cclk == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t c_clk = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_cclk_limit_get(sock_ind, &c_clk));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *cclk = c_clk;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the current active frequency limit of one socket through
// esmi_socket_current_active_freq_limit_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// freq:             out parameter for the limit; must not be null.
// src_type:         forwarded unchanged to the ESMI call, which fills it in.
// Returns AMDSMI_STATUS_INVAL for a null handle or null freq, the lookup
// status if amdsmi_get_processor_info() fails, the translated ESMI status if
// the read fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_current_active_freq_limit(amdsmi_processor_handle processor_handle,
                uint16_t *freq, char **src_type)
{
    AMDSMI_CHECK_INIT();
    // freq is written by this wrapper, so it is validated here; src_type is
    // handed straight to ESMI.
    if (processor_handle == nullptr || freq == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint16_t limit = 0;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_socket_current_active_freq_limit_get(sock_ind, &limit, src_type));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *freq = limit;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the maximum and minimum frequency of one socket through
// esmi_socket_freq_range_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// fmax, fmin:       out parameters; neither may be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_freq_range(amdsmi_processor_handle processor_handle,
                uint16_t *fmax, uint16_t *fmin)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || fmax == nullptr || fmin == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint16_t f_max = 0;
    uint16_t f_min = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_socket_freq_range_get(sock_ind, &f_max, &f_min));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *fmax = f_max;
    *fmin = f_min;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the current frequency limit of one core through
// esmi_current_freq_limit_core_get().
//
// processor_handle: handle whose id is used as the core index; must not be null.
// freq:             out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_core_current_freq_limit(amdsmi_processor_handle processor_handle,
                uint32_t *freq)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || freq == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    // Keep the full 32-bit index: narrowing through uint8_t would wrap any
    // core index above 255 onto a different core.
    const uint32_t core_ind = static_cast<uint32_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t c_clk = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_current_freq_limit_core_get(core_ind, &c_clk));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *freq = c_clk;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the power value of one socket through esmi_socket_power_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// ppower:           out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_power(amdsmi_processor_handle processor_handle,
                uint32_t *ppower)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || ppower == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t avg_power = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_socket_power_get(sock_ind, &avg_power));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *ppower = avg_power;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the power cap of one socket through esmi_socket_power_cap_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// pcap:             out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_power_cap(amdsmi_processor_handle processor_handle,
                uint32_t *pcap)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || pcap == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t p_cap = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_socket_power_cap_get(sock_ind, &p_cap));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *pcap = p_cap;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the maximum power cap of one socket through
// esmi_socket_power_cap_max_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// pmax:             out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_power_cap_max(amdsmi_processor_handle processor_handle,
                uint32_t *pmax)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || pmax == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t p_max = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_socket_power_cap_max_get(sock_ind, &p_max));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *pmax = p_max;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the value reported by esmi_pwr_svi_telemetry_all_rails_get() for one
// socket.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// power:            out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_pwr_svi_telemetry_all_rails(amdsmi_processor_handle processor_handle,
                uint32_t *power)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || power == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t pow = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_pwr_svi_telemetry_all_rails_get(sock_ind, &pow));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *power = pow;
    return AMDSMI_STATUS_SUCCESS;
}
// Apply a power cap to one socket through esmi_socket_power_cap_set().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// pcap:             value forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_socket_power_cap(amdsmi_processor_handle processor_handle,
                uint32_t pcap)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc =
        static_cast<amdsmi_status_t>(esmi_socket_power_cap_set(socket, pcap));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Select a power-efficiency mode for one socket through
// esmi_pwr_efficiency_mode_set().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// mode:             value forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_pwr_efficiency_mode(amdsmi_processor_handle processor_handle,
                uint8_t mode)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc =
        static_cast<amdsmi_status_t>(esmi_pwr_efficiency_mode_set(socket, mode));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Read the boost limit of one core through esmi_core_boostlimit_get().
//
// processor_handle: handle whose id is used as the core index; must not be null.
// pboostlimit:      out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_core_boostlimit(amdsmi_processor_handle processor_handle,
                uint32_t *pboostlimit)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || pboostlimit == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    // Keep the full 32-bit index: narrowing through uint8_t would wrap any
    // core index above 255 onto a different core.
    const uint32_t core_ind = static_cast<uint32_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t boostlimit = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_core_boostlimit_get(core_ind, &boostlimit));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *pboostlimit = boostlimit;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the C0 residency of one socket through esmi_socket_c0_residency_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// pc0_residency:    out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_c0_residency(amdsmi_processor_handle processor_handle,
                uint32_t *pc0_residency)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || pc0_residency == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t res = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_socket_c0_residency_get(sock_ind, &res));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *pc0_residency = res;
    return AMDSMI_STATUS_SUCCESS;
}
// Apply a boost limit to one core through esmi_core_boostlimit_set().
//
// processor_handle: handle whose id is used as the core index; must not be null.
// boostlimit:       value forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_core_boostlimit(amdsmi_processor_handle processor_handle,
                uint32_t boostlimit)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    // Keep the full 32-bit index: narrowing through uint8_t would wrap any
    // core index above 255 and apply the limit to a different core.
    const uint32_t core_ind = static_cast<uint32_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_core_boostlimit_set(core_ind, boostlimit));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);
    return AMDSMI_STATUS_SUCCESS;
}
// Apply a boost limit to one socket through esmi_socket_boostlimit_set().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// boostlimit:       value forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_socket_boostlimit(amdsmi_processor_handle processor_handle,
                uint32_t boostlimit)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc =
        static_cast<amdsmi_status_t>(esmi_socket_boostlimit_set(socket, boostlimit));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Copy the max_bw / utilized_bw / utilized_pct fields returned by
// esmi_ddr_bw_get() for one socket into the caller's structure.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// ddr_bw:           out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_ddr_bw(amdsmi_processor_handle processor_handle,
                amdsmi_ddr_bw_metrics_t *ddr_bw)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || ddr_bw == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct ddr_bw_metrics ddr;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_ddr_bw_get(sock_ind, &ddr));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    ddr_bw->max_bw = ddr.max_bw;
    ddr_bw->utilized_bw = ddr.utilized_bw;
    ddr_bw->utilized_pct = ddr.utilized_pct;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the temperature of one socket through esmi_socket_temperature_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// ptmon:            out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_temperature(amdsmi_processor_handle processor_handle,
                uint32_t *ptmon)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || ptmon == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t tmon = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_socket_temperature_get(sock_ind, &tmon));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *ptmon = tmon;
    return AMDSMI_STATUS_SUCCESS;
}
// Copy the range / ref_rate fields returned by
// esmi_dimm_temp_range_and_refresh_rate_get() for one DIMM into the caller's
// structure.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// dimm_addr:        DIMM address forwarded to the ESMI call.
// rate:             out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_dimm_temp_range_and_refresh_rate(
                amdsmi_processor_handle processor_handle,
                uint8_t dimm_addr, amdsmi_temp_range_refresh_rate_t *rate)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || rate == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    // Parsed once; a second identical assignment here was redundant.
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct temp_range_refresh_rate dimm_rate;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_dimm_temp_range_and_refresh_rate_get(sock_ind, dimm_addr, &dimm_rate));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    rate->range = dimm_rate.range;
    rate->ref_rate = dimm_rate.ref_rate;
    return AMDSMI_STATUS_SUCCESS;
}
// Copy the power / update_rate / dimm_addr fields returned by
// esmi_dimm_power_consumption_get() for one DIMM into the caller's structure.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// dimm_addr:        DIMM address forwarded to the ESMI call.
// dimm_pow:         out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_dimm_power_consumption(amdsmi_processor_handle processor_handle,
                uint8_t dimm_addr, amdsmi_dimm_power_t *dimm_pow)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || dimm_pow == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct dimm_power d_power;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_dimm_power_consumption_get(sock_ind, dimm_addr, &d_power));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    dimm_pow->power = d_power.power;
    dimm_pow->update_rate = d_power.update_rate;
    dimm_pow->dimm_addr = d_power.dimm_addr;
    return AMDSMI_STATUS_SUCCESS;
}
// Copy the temp / update_rate / dimm_addr fields returned by
// esmi_dimm_thermal_sensor_get() for one DIMM into the caller's structure.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// dimm_addr:        DIMM address forwarded to the ESMI call.
// dimm_temp:        out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_dimm_thermal_sensor(amdsmi_processor_handle processor_handle,
                uint8_t dimm_addr, amdsmi_dimm_thermal_t *dimm_temp)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || dimm_temp == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct dimm_thermal d_sensor;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_dimm_thermal_sensor_get(sock_ind, dimm_addr, &d_sensor));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    dimm_temp->temp = d_sensor.temp;
    dimm_temp->update_rate = d_sensor.update_rate;
    dimm_temp->dimm_addr = d_sensor.dimm_addr;
    return AMDSMI_STATUS_SUCCESS;
}
// Set the xGMI link width bounds through esmi_xgmi_width_set().
//
// processor_handle: must not be null; it is validated but not otherwise used,
//                   because the ESMI call takes no socket index.
// min, max:         values forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the translated ESMI status
// if the write fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_xgmi_width(amdsmi_processor_handle processor_handle,
                uint8_t min, uint8_t max)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t esmi_rc =
        static_cast<amdsmi_status_t>(esmi_xgmi_width_set(min, max));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Set the GMI3 link width range of one socket through
// esmi_gmi3_link_width_range_set().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// min_link_width, max_link_width: values forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_gmi3_link_width_range(amdsmi_processor_handle processor_handle,
                uint8_t min_link_width, uint8_t max_link_width)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc = static_cast<amdsmi_status_t>(
        esmi_gmi3_link_width_range_set(socket, min_link_width, max_link_width));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Call esmi_apb_enable() for one socket.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the call
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_cpu_apb_enable(amdsmi_processor_handle processor_handle)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc = static_cast<amdsmi_status_t>(esmi_apb_enable(socket));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Call esmi_apb_disable() for one socket with the given P-state.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// pstate:           value forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the call
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_cpu_apb_disable(amdsmi_processor_handle processor_handle,
                uint8_t pstate)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc =
        static_cast<amdsmi_status_t>(esmi_apb_disable(socket, pstate));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Set the LCLK DPM level bounds of one NBIO on one socket through
// esmi_socket_lclk_dpm_level_set().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// nbio_id, min, max: values forwarded to the ESMI call.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_socket_lclk_dpm_level(amdsmi_processor_handle processor_handle,
                uint8_t nbio_id, uint8_t min, uint8_t max)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc = static_cast<amdsmi_status_t>(
        esmi_socket_lclk_dpm_level_set(socket, nbio_id, min, max));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Copy the min_dpm_level / max_dpm_level fields returned by
// esmi_socket_lclk_dpm_level_get() for one NBIO into the caller's structure.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// nbio_id:          value forwarded to the ESMI call.
// nbio:             out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_lclk_dpm_level(amdsmi_processor_handle processor_handle,
                uint8_t nbio_id, amdsmi_dpm_level_t *nbio)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || nbio == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct dpm_level nb;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_socket_lclk_dpm_level_get(sock_ind, nbio_id, &nb));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    nbio->min_dpm_level = nb.min_dpm_level;
    nbio->max_dpm_level = nb.max_dpm_level;
    return AMDSMI_STATUS_SUCCESS;
}
// Set the PCIe link rate control of one socket through
// esmi_pcie_link_rate_set() and report the previous mode it returns.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// rate_ctrl:        value forwarded to the ESMI call.
// prev_mode:        out parameter for the previous mode; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_pcie_link_rate(amdsmi_processor_handle processor_handle,
                uint8_t rate_ctrl, uint8_t *prev_mode)
{
    AMDSMI_CHECK_INIT();
    // Validate prev_mode before touching the hardware: checking it only after
    // the set call would change the link rate and then fault on the write-back.
    if (processor_handle == nullptr || prev_mode == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint8_t p_mode = 0;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_pcie_link_rate_set(sock_ind, rate_ctrl, &p_mode));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *prev_mode = p_mode;
    return AMDSMI_STATUS_SUCCESS;
}
// Set the data-fabric P-state range of one socket through
// esmi_df_pstate_range_set().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// max_pstate, min_pstate: values forwarded to the ESMI call, in that order.
// Returns AMDSMI_STATUS_INVAL for a null handle, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the write
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_set_cpu_df_pstate_range(amdsmi_processor_handle processor_handle,
                uint8_t max_pstate, uint8_t min_pstate)
{
    AMDSMI_CHECK_INIT();
    if (!processor_handle)
        return AMDSMI_STATUS_INVAL;

    // Resolve the handle to its id string, then to a numeric socket index.
    const amdsmi_status_t info_rc = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (info_rc != AMDSMI_STATUS_SUCCESS)
        return info_rc;
    const uint8_t socket = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    const amdsmi_status_t esmi_rc = static_cast<amdsmi_status_t>(
        esmi_df_pstate_range_set(socket, max_pstate, min_pstate));
    return (esmi_rc == AMDSMI_STATUS_SUCCESS) ? AMDSMI_STATUS_SUCCESS
                                              : amdsmi_errno_to_esmi_status(esmi_rc);
}
// Read the current IO bandwidth of one link through
// esmi_current_io_bandwidth_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// link:             link name and bandwidth type, converted to the ESMI
//                   link_id_bw_type structure before the call.
// io_bw:            out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_current_io_bandwidth(amdsmi_processor_handle processor_handle,
                amdsmi_link_id_bw_type_t link, uint32_t *io_bw)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || io_bw == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct link_id_bw_type io_link;
    io_link.link_name = link.link_name;
    io_link.bw_type = static_cast<io_bw_encoding>(link.bw_type);

    uint32_t bw = 0;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_current_io_bandwidth_get(sock_ind, io_link, &bw));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *io_bw = bw;
    return AMDSMI_STATUS_SUCCESS;
}
// Read the current xGMI bandwidth of one link through
// esmi_current_xgmi_bw_get().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// link:             link name and bandwidth type, converted to the ESMI
//                   link_id_bw_type structure before the call.
// xgmi_bw:          out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_current_xgmi_bw(amdsmi_processor_handle processor_handle,
                amdsmi_link_id_bw_type_t link, uint32_t *xgmi_bw)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || xgmi_bw == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct link_id_bw_type io_link;
    io_link.link_name = link.link_name;
    io_link.bw_type = static_cast<io_bw_encoding>(link.bw_type);

    uint32_t bw = 0;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_current_xgmi_bw_get(sock_ind, io_link, &bw));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *xgmi_bw = bw;
    return AMDSMI_STATUS_SUCCESS;
}
// Report the metrics table version returned by
// esmi_metrics_table_version_get().
//
// processor_handle: must not be null (only validated here).
// metrics_version:  out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the translated ESMI status
// on failure, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_hsmp_metrics_table_version(amdsmi_processor_handle processor_handle,
                uint32_t *metrics_version)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || metrics_version == nullptr)
        return AMDSMI_STATUS_INVAL;

    uint32_t metrics_tbl_ver = 0;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_metrics_table_version_get(&metrics_tbl_ver));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *metrics_version = metrics_tbl_ver;
    return AMDSMI_STATUS_SUCCESS;
}
// Fetch the HSMP metrics table of one socket through esmi_metrics_table_get()
// and copy it byte-for-byte into the caller's structure.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// metrics_table:    out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument,
// AMDSMI_STATUS_UNEXPECTED_SIZE if the AMDSMI and ESMI table structures differ
// in size, the lookup status if amdsmi_get_processor_info() fails, the
// translated ESMI status if the read fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_hsmp_metrics_table(amdsmi_processor_handle processor_handle,
                amdsmi_hsmp_metrics_table_t *metrics_table)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || metrics_table == nullptr)
        return AMDSMI_STATUS_INVAL;
    // The memcpy below is only valid when both layouts have the same size.
    if (sizeof(amdsmi_hsmp_metrics_table_t) != sizeof(struct hsmp_metric_table))
        return AMDSMI_STATUS_UNEXPECTED_SIZE;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    struct hsmp_metric_table metrics_tbl;
    const amdsmi_status_t status =
        static_cast<amdsmi_status_t>(esmi_metrics_table_get(sock_ind, &metrics_tbl));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    std::memcpy(metrics_table, &metrics_tbl, sizeof(amdsmi_hsmp_metrics_table_t));
    return AMDSMI_STATUS_SUCCESS;
}
// Report the first online core of one socket as returned by
// esmi_first_online_core_on_socket().
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// pcore_ind:        out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the translated ESMI status if the read
// fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_first_online_core_on_cpu_socket(amdsmi_processor_handle processor_handle,
                uint32_t *pcore_ind)
{
    AMDSMI_CHECK_INIT();
    if (processor_handle == nullptr || pcore_ind == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint8_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    uint32_t online_core = 0;
    const amdsmi_status_t status = static_cast<amdsmi_status_t>(
        esmi_first_online_core_on_socket(sock_ind, &online_core));
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *pcore_ind = online_core;
    return AMDSMI_STATUS_SUCCESS;
}
// Report the CPU family obtained from AMDSmiSystem::get_cpu_family().
//
// cpu_family: out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null out parameter, the status produced by
// passing a failure through amdsmi_errno_to_esmi_status(), and
// AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_family(uint32_t *cpu_family)
{
    AMDSMI_CHECK_INIT();
    // Reject a null destination up front instead of dereferencing it below.
    if (cpu_family == nullptr)
        return AMDSMI_STATUS_INVAL;

    uint32_t family = 0;
    const amdsmi_status_t status =
        amd::smi::AMDSmiSystem::getInstance().get_cpu_family(&family);
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *cpu_family = family;
    return AMDSMI_STATUS_SUCCESS;
}
// Report the CPU model obtained from AMDSmiSystem::get_cpu_model().
//
// cpu_model: out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null out parameter, the status produced by
// passing a failure through amdsmi_errno_to_esmi_status(), and
// AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_model(uint32_t *cpu_model)
{
    AMDSMI_CHECK_INIT();
    // Reject a null destination up front instead of dereferencing it below.
    if (cpu_model == nullptr)
        return AMDSMI_STATUS_INVAL;

    uint32_t model = 0;
    const amdsmi_status_t status =
        amd::smi::AMDSmiSystem::getInstance().get_cpu_model(&model);
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    *cpu_model = model;
    return AMDSMI_STATUS_SUCCESS;
}
// Copy the model name reported by AMDSmiSystem::get_cpu_model_name() for one
// socket into cpu_info->model_name.
//
// processor_handle: handle whose id is used as the socket index; must not be null.
// cpu_info:         out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null argument, the lookup status if
// amdsmi_get_processor_info() fails, the status produced by passing a failure
// through amdsmi_errno_to_esmi_status(), and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_model_name(amdsmi_processor_handle processor_handle, amdsmi_cpu_info_t *cpu_info)
{
    if (processor_handle == nullptr || cpu_info == nullptr)
        return AMDSMI_STATUS_INVAL;

    const amdsmi_status_t r = amdsmi_get_processor_info(processor_handle, SIZE, proc_id);
    if (r != AMDSMI_STATUS_SUCCESS)
        return r;
    const uint32_t sock_ind = static_cast<uint8_t>(std::stoi(proc_id, nullptr, 0));

    std::string model_name;
    const amdsmi_status_t status =
        amd::smi::AMDSmiSystem::getInstance().get_cpu_model_name(sock_ind, &model_name);
    if (status != AMDSMI_STATUS_SUCCESS)
        return amdsmi_errno_to_esmi_status(status);

    strncpy(cpu_info->model_name, model_name.c_str(), AMDSMI_MAX_STRING_LENGTH - 1);
    // strncpy does not terminate the destination when the source fills all
    // copied bytes, so terminate explicitly.
    // NOTE(review): assumes model_name holds AMDSMI_MAX_STRING_LENGTH chars,
    // as the copy length above implies - confirm against the struct definition.
    cpu_info->model_name[AMDSMI_MAX_STRING_LENGTH - 1] = '\0';
    return AMDSMI_STATUS_SUCCESS;
}
// Store the value reported by AMDSmiSystem::get_sys_cpu_cores_per_socket()
// in sock_info->cores_per_socket.
//
// sock_count: not read by this function.
// sock_info:  out parameter; must not be null. Only the structure it points
//             at directly is written.
// Returns AMDSMI_STATUS_INVAL for a null out parameter, the status of the
// query if it fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_cores_per_socket(uint32_t sock_count, amdsmi_sock_info_t *sock_info)
{
    // Reject a null destination up front instead of dereferencing it below.
    if (sock_info == nullptr)
        return AMDSMI_STATUS_INVAL;

    uint32_t core_num = 0;
    const amdsmi_status_t status =
        amd::smi::AMDSmiSystem::getInstance().get_sys_cpu_cores_per_socket(&core_num);
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    sock_info->cores_per_socket = core_num;
    return AMDSMI_STATUS_SUCCESS;
}
// Report the socket count obtained from
// AMDSmiSystem::get_sys_num_of_cpu_sockets().
//
// sock_count: out parameter; must not be null.
// Returns AMDSMI_STATUS_INVAL for a null out parameter, the status of the
// query if it fails, and AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t amdsmi_get_cpu_socket_count(uint32_t *sock_count)
{
    // Reject a null destination up front instead of dereferencing it below.
    if (sock_count == nullptr)
        return AMDSMI_STATUS_INVAL;

    uint32_t sock_num = 0;
    const amdsmi_status_t status =
        amd::smi::AMDSmiSystem::getInstance().get_sys_num_of_cpu_sockets(&sock_num);
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    *sock_count = sock_num;
    return AMDSMI_STATUS_SUCCESS;
}
// Collect the handles of all AMDSMI_PROCESSOR_TYPE_AMD_CPU processors across
// every socket.
//
// cpu_count:         out: number of CPU handles found; must not be null.
// processor_handles: optional destination array. When null only the count is
//                    reported; otherwise *cpu_count handles are written, so
//                    the caller must provide room for that many entries.
// Returns AMDSMI_STATUS_INVAL for a null cpu_count, the first failing status
// from the socket/processor enumeration calls, and AMDSMI_STATUS_SUCCESS
// otherwise.
amdsmi_status_t amdsmi_get_cpu_handles(uint32_t *cpu_count,
                amdsmi_processor_handle *processor_handles)
{
    AMDSMI_CHECK_INIT();
    if (cpu_count == nullptr)
        return AMDSMI_STATUS_INVAL;

    const processor_type_t processor_type = AMDSMI_PROCESSOR_TYPE_AMD_CPU;

    // First call with a null array only reports how many sockets exist.
    uint32_t soc_count = 0;
    amdsmi_status_t status = amdsmi_get_socket_handles(&soc_count, nullptr);
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    // Second call fills the array. data() is used instead of &sockets[0]
    // because indexing an empty vector is undefined behaviour.
    std::vector<amdsmi_socket_handle> sockets(soc_count);
    status = amdsmi_get_socket_handles(&soc_count, sockets.data());
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    std::vector<amdsmi_processor_handle> cpu_handles;
    // Never index past what was allocated, even if the count changed between
    // the two enumeration calls.
    for (uint32_t index = 0; index < soc_count && index < sockets.size(); ++index) {
        uint32_t cpu_per_soc = 0;
        status = amdsmi_get_processor_handles_by_type(sockets[index], processor_type,
                                                      nullptr, &cpu_per_soc);
        if (status != AMDSMI_STATUS_SUCCESS)
            return status;
        if (cpu_per_soc == 0)
            continue;

        // Fetch the CPU handles of this socket and append them.
        std::vector<amdsmi_processor_handle> plist(cpu_per_soc);
        status = amdsmi_get_processor_handles_by_type(sockets[index], processor_type,
                                                      plist.data(), &cpu_per_soc);
        if (status != AMDSMI_STATUS_SUCCESS)
            return status;
        cpu_handles.insert(cpu_handles.end(), plist.begin(), plist.end());
    }

    *cpu_count = static_cast<uint32_t>(cpu_handles.size());
    if (processor_handles == nullptr)
        return AMDSMI_STATUS_SUCCESS;

    // Copy the collected handles into the caller's array.
    for (uint32_t i = 0; i < *cpu_count; i++)
        processor_handles[i] = cpu_handles[i];
    return AMDSMI_STATUS_SUCCESS;
}
// Collect the handles of all AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE processors
// across every socket.
//
// cores_count:       out: number of core handles found; must not be null.
// processor_handles: optional destination array. When null only the count is
//                    reported; otherwise *cores_count handles are written, so
//                    the caller must provide room for that many entries.
// Returns AMDSMI_STATUS_INVAL for a null cores_count, the first failing status
// from the socket/processor enumeration calls, and AMDSMI_STATUS_SUCCESS
// otherwise.
amdsmi_status_t amdsmi_get_cpucore_handles(uint32_t *cores_count,
                amdsmi_processor_handle* processor_handles)
{
    AMDSMI_CHECK_INIT();
    if (cores_count == nullptr)
        return AMDSMI_STATUS_INVAL;

    const processor_type_t processor_type = AMDSMI_PROCESSOR_TYPE_AMD_CPU_CORE;

    // First call with a null array only reports how many sockets exist.
    uint32_t soc_count = 0;
    amdsmi_status_t status = amdsmi_get_socket_handles(&soc_count, nullptr);
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    // Second call fills the array. data() is used instead of &sockets[0]
    // because indexing an empty vector is undefined behaviour.
    std::vector<amdsmi_socket_handle> sockets(soc_count);
    status = amdsmi_get_socket_handles(&soc_count, sockets.data());
    if (status != AMDSMI_STATUS_SUCCESS)
        return status;

    std::vector<amdsmi_processor_handle> core_handles;
    // Never index past what was allocated, even if the count changed between
    // the two enumeration calls.
    for (uint32_t index = 0; index < soc_count && index < sockets.size(); ++index) {
        uint32_t cores_per_soc = 0;
        status = amdsmi_get_processor_handles_by_type(sockets[index], processor_type,
                                                      nullptr, &cores_per_soc);
        if (status != AMDSMI_STATUS_SUCCESS)
            return status;
        // A socket without cores contributes nothing; skipping it also avoids
        // taking the address of an element of an empty vector.
        if (cores_per_soc == 0)
            continue;

        // Fetch the core handles of this socket and append them.
        std::vector<amdsmi_processor_handle> plist(cores_per_soc);
        status = amdsmi_get_processor_handles_by_type(sockets[index], processor_type,
                                                      plist.data(), &cores_per_soc);
        if (status != AMDSMI_STATUS_SUCCESS)
            return status;
        core_handles.insert(core_handles.end(), plist.begin(), plist.end());
    }

    *cores_count = static_cast<uint32_t>(core_handles.size());
    if (processor_handles == nullptr)
        return AMDSMI_STATUS_SUCCESS;

    // Copy the collected handles into the caller's array.
    for (uint32_t i = 0; i < *cores_count; i++)
        processor_handles[i] = core_handles[i];
    return AMDSMI_STATUS_SUCCESS;
}
// Look up the ESMI error message for a status code.
//
// status:        code compared (as int) against the keys of
//                amd::smi::esmi_status_map.
// status_string: receives the text from esmi_get_err_msg() when a key matches;
//                it is left untouched when nothing matches.
// Returns the AMDSMI status mapped to the matching key, or
// AMDSMI_STATUS_SUCCESS when no key matches.
amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **status_string)
{
    const int wanted = static_cast<int>(status);
    for (const auto& entry : amd::smi::esmi_status_map) {
        if (static_cast<int>(entry.first) != wanted)
            continue;
        *status_string = esmi_get_err_msg(static_cast<esmi_status_t>(entry.first));
        return entry.second;
    }
    return AMDSMI_STATUS_SUCCESS;
}
#endif