SWDEV-439576 - rocmsmi -> amdsmi
- Migrate to amdsmi library
- NOTE: raslib still uses rocmsmi
- Remove unused rocmsmi service
- Remove unused RDC client code
- Remove RSMI calls from protos/rdc.proto
Change-Id: Ifc34a264c506b0ec5792307ee56b34526268762d
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: 9702d0f2d7]
This commit is contained in:
@@ -151,16 +151,16 @@ if(NOT EXISTS "${CMAKE_SOURCE_DIR}/raslib/.git" AND BUILD_RASLIB)
|
||||
If you do not want to build raslib, use cmake -DBUILD_RASLIB=off")
|
||||
endif()
|
||||
|
||||
find_package(RSMI
|
||||
NAMES rocm_smi
|
||||
find_package(SMI
|
||||
NAMES amd_smi
|
||||
HINTS ${ROCM_DIR}/lib/cmake
|
||||
CONFIGURE REQUIRED)
|
||||
set(RSMI_INC_DIR "${ROCM_SMI_INCLUDE_DIR}" CACHE INTERNAL "ROCm SMI include directory.")
|
||||
set(RSMI_LIB_DIR "${ROCM_SMI_LIB_DIR}" CACHE INTERNAL "ROCm SMI library directory.")
|
||||
set(SMI_INC_DIR "${AMD_SMI_INCLUDE_DIR}" CACHE INTERNAL "AMD SMI include directory.")
|
||||
set(SMI_LIB_DIR "${AMD_SMI_LIB_DIR}" CACHE INTERNAL "AMD SMI library directory.")
|
||||
|
||||
if(NOT EXISTS "${RSMI_INC_DIR}" OR NOT EXISTS "${RSMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "rocm_smi not found in ${RSMI_INC_DIR}. Please
|
||||
make sure rocm_smi is installed and present in ${RSMI_INC_DIR}.")
|
||||
if(NOT EXISTS "${SMI_INC_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "amd_smi not found in ${SMI_INC_DIR}. Please
|
||||
make sure amd_smi is installed and present in ${SMI_INC_DIR}.")
|
||||
endif()
|
||||
|
||||
if(BUILD_RASLIB AND NOT DEFINED HSA_DIR)
|
||||
@@ -301,7 +301,6 @@ if(BUILD_STANDALONE)
|
||||
unset(OLD_CMAKE_INSTALL_MESSAGE)
|
||||
|
||||
add_subdirectory("server")
|
||||
add_subdirectory("client")
|
||||
add_subdirectory("rdci")
|
||||
|
||||
if(BUILD_TESTS)
|
||||
@@ -458,7 +457,7 @@ set(CPACK_DEBIAN_RUNTIME_PACKAGE_CONTROL_EXTRA
|
||||
|
||||
option(ROCM_DEP_ROCMCORE "Add debian dependency on rocm-core" OFF)
|
||||
mark_as_advanced(ROCM_DEP_ROCMCORE)
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-smi-lib, libc6")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "amd-smi-lib, libc6")
|
||||
if(ROCM_DEP_ROCMCORE)
|
||||
string(APPEND CPACK_DEBIAN_PACKAGE_DEPENDS ", rocm-core")
|
||||
endif()
|
||||
@@ -485,7 +484,7 @@ endif()
|
||||
|
||||
set(CPACK_RPM_PACKAGE_AUTOREQ 0)
|
||||
set(CPACK_RPM_PACKAGE_AUTOPROV 0)
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-smi-lib")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "amd-smi-lib")
|
||||
# rdc-tests need rdc
|
||||
set(CPACK_RPM_TESTS_PACKAGE_REQUIRES "${CPACK_PACKAGE_NAME}")
|
||||
list(APPEND CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/lib"
|
||||
|
||||
@@ -30,7 +30,7 @@ RDC can run on AMD ROCm supported platforms, please refer to the [List of Suppor
|
||||
* It is recommended to install the complete AMD ROCm platform.
|
||||
For installation instructions, see https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html
|
||||
* At the minimum, these two components are required
|
||||
(i) AMD ROCm SMI Library (https://github.com/ROCm/rocm_smi_lib)
|
||||
(i) AMDSMI Library (https://github.com/ROCm/amdsmi)
|
||||
(ii) AMD ROCk Kernel driver (https://github.com/ROCm/ROCK-Kernel-Driver)
|
||||
|
||||
## Building gRPC and protoc
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
# Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
message(" Cmake Client Lib ")
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
|
||||
## Compiler flags
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -m64")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2")
|
||||
# Use this instead of above for 32 bit
|
||||
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
|
||||
|
||||
if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG")
|
||||
endif()
|
||||
|
||||
|
||||
# Required Defines first:
|
||||
|
||||
message("")
|
||||
message("Build Configuration:")
|
||||
message("-------------BuildType: " ${CMAKE_BUILD_TYPE})
|
||||
message("--------------Compiler: " ${CMAKE_CXX_COMPILER})
|
||||
message("---------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
|
||||
message("----------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
message("----------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
message("----------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("----------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("----------RSMI Lib Dir: " ${RSMI_LIB_DIR})
|
||||
message("----------RSMI Inc Dir: " ${RSMI_INC_DIR})
|
||||
message("---------GRPC Root Dir: " ${GRPC_ROOT})
|
||||
message("")
|
||||
|
||||
## Include common cmake modules
|
||||
include(utils)
|
||||
|
||||
set(CLIENT_LIB "rdc_client_smi")
|
||||
set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
|
||||
set(INC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include/rdc")
|
||||
|
||||
################# Determine the library version #########################
|
||||
## Setup the SO version based on git tags.
|
||||
set(SO_VERSION_GIT_TAG_PREFIX "rdc_so_ver")
|
||||
|
||||
# provide git to utilities
|
||||
find_program(GIT NAMES git)
|
||||
|
||||
# Debian package specific variables
|
||||
# Set a default value for the package version
|
||||
get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
|
||||
|
||||
# VERSION_* variables should be set by get_version_from_tag
|
||||
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
|
||||
message("SOVERSION: ${SO_VERSION_STRING}")
|
||||
|
||||
set(CPACK_PACKAGE_FILE_NAME "${RDC_PACKAGE}-${VERSION_STRING}")
|
||||
|
||||
# TODO delete these if not used
|
||||
file(GLOB PROTOBUF_GENERATED_INCLUDES "${PROTOB_OUT_DIR}/*.h")
|
||||
file(GLOB PROTOBUF_GENERATED_SRCS "${PROTOB_OUT_DIR}/*.cc")
|
||||
|
||||
set(CLIENT_LIB_SRC_LIST "${SRC_DIR}/rdc_client.cc"
|
||||
"${SRC_DIR}/rdc_client_main.cc"
|
||||
"${SRC_DIR}/rdc_client_utils.cc"
|
||||
"${PROTOBUF_GENERATED_SRCS}"
|
||||
"${COMMON_DIR}/rdc_utils.cc")
|
||||
message("CLIENT_LIB_SRC_LIST=${CLIENT_LIB_SRC_LIST}")
|
||||
|
||||
set(CLIENT_LIB_INC_LIST "${INC_DIR}/rdc_client.h"
|
||||
"${INC_DIR}/rdc_exception.h"
|
||||
"${INC_DIR}/rdc_client_main.h"
|
||||
"${COMMON_DIR}/rdc_utils.h")
|
||||
|
||||
add_library(${CLIENT_LIB} SHARED ${CLIENT_LIB_SRC_LIST} ${CLIENT_LIB_INC_LIST})
|
||||
target_link_libraries(${CLIENT_LIB} pthread rt gRPC::grpc++ dl)
|
||||
target_include_directories(${CLIENT_LIB} PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/include"
|
||||
"${PROTOB_OUT_DIR}"
|
||||
"${RSMI_INC_DIR}")
|
||||
# TODO: set the properties for the library once we have one
|
||||
## Set the VERSION and SOVERSION values
|
||||
set_property(TARGET ${CLIENT_LIB} PROPERTY
|
||||
SOVERSION "${VERSION_MAJOR}")
|
||||
set_property(TARGET ${CLIENT_LIB} PROPERTY
|
||||
VERSION "${SO_VERSION_STRING}")
|
||||
|
||||
## If the library is a release, strip the target library
|
||||
if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
|
||||
add_custom_command(
|
||||
TARGET ${CLIENT_LIB}
|
||||
POST_BUILD COMMAND ${CMAKE_STRIP} lib${CLIENT_LIB}.so)
|
||||
endif()
|
||||
|
||||
## Add the install directives for the runtime library.
|
||||
install(TARGETS ${CLIENT_LIB}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${RDC}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/authentication
|
||||
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${RDC}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
message(" Finished Cmake Client Lib ")
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
@@ -1,382 +0,0 @@
|
||||
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
|
||||
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
|
||||
|
||||
#include <grpcpp/grpcpp.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
/**
|
||||
* @brief Error codes returned by rdc functions
|
||||
*/
|
||||
typedef enum {
|
||||
RDC_STATUS_SUCCESS = 0x0, //!< Operation was successful
|
||||
RDC_RSMI_STATUS_INVALID_ARGS, //!< Passed in arguments are not valid
|
||||
RDC_RSMI_STATUS_NOT_SUPPORTED, //!< The requested information or
|
||||
//!< action is not available for the
|
||||
//!< given input, on the given system
|
||||
RDC_RSMI_STATUS_FILE_ERROR, //!< Problem accessing a file. This
|
||||
//!< may be because the operation is not
|
||||
//!< supported by the Linux kernel
|
||||
//!< version running on the executing
|
||||
//!< machine
|
||||
RDC_RSMI_STATUS_PERMISSION, //!< Permission denied/EACCES file
|
||||
//!< error. Many functions require
|
||||
//!< root access to run.
|
||||
RDC_RSMI_STATUS_OUT_OF_RESOURCES, //!< Unable to acquire memory or other
|
||||
//!< resource
|
||||
RDC_RSMI_STATUS_INTERNAL_EXCEPTION, //!< An internal exception was caught
|
||||
RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS, //!< The provided input is out of
|
||||
//!< allowable or safe range
|
||||
RDC_RSMI_STATUS_INIT_ERROR, //!< An error occurred when creating
|
||||
//!< a communications channel
|
||||
RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED, //!< The requested function has not
|
||||
//!< yet been implemented in the
|
||||
//!< current system for the current
|
||||
//!< devices
|
||||
RDC_RSMI_STATUS_NOT_FOUND, //!< An item was searched for but not
|
||||
//!< found
|
||||
RDC_RSMI_STATUS_INSUFFICIENT_SIZE, //!< Not enough resources were
|
||||
//!< available for the operation
|
||||
RDC_RSMI_STATUS_INTERRUPT, //!< An interrupt occurred during
|
||||
//!< execution of function
|
||||
RDC_RSMI_STATUS_UNEXPECTED_SIZE, //!< An unexpected amount of data
|
||||
//!< was read
|
||||
RDC_RSMI_STATUS_NO_DATA, //!< No data was found for a given
|
||||
//!< input
|
||||
RDC_RSMI_STATUS_UNKNOWN_ERROR, //!< An unknown error occurred
|
||||
RDC_STATUS_GRPC_ERR_FIRST = 1000,
|
||||
|
||||
/// Not an error; returned on success.
|
||||
RDC_STATUS_GRPC_OK = RDC_STATUS_GRPC_ERR_FIRST,
|
||||
|
||||
/// The operation was cancelled (typically by the caller).
|
||||
RDC_STATUS_GRPC_CANCELLED,
|
||||
|
||||
/// Unknown error. An example of where this error may be returned is if a
|
||||
/// Status value received from another address space belongs to an error-space
|
||||
/// that is not known in this address space. Also errors raised by APIs that
|
||||
/// do not return enough error information may be converted to this error.
|
||||
RDC_STATUS_GRPC_UNKNOWN,
|
||||
|
||||
/// Client specified an invalid argument. Note that this differs from
|
||||
/// FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are
|
||||
/// problematic regardless of the state of the system (e.g., a malformed file
|
||||
/// name).
|
||||
RDC_STATUS_GRPC_INVALID_ARG,
|
||||
|
||||
/// Deadline expired before operation could complete. For operations that
|
||||
/// change the state of the system, this error may be returned even if the
|
||||
/// operation has completed successfully. For example, a successful response
|
||||
/// from a server could have been delayed long enough for the deadline to
|
||||
/// expire.
|
||||
RDC_STATUS_GRPC_DEADLINE_EXCEEDED,
|
||||
|
||||
/// Some requested entity (e.g., file or directory) was not found.
|
||||
RDC_STATUS_GRPC_NOT_FOUND,
|
||||
|
||||
/// Some entity that we attempted to create (e.g., file or directory) already
|
||||
/// exists.
|
||||
RDC_STATUS_GRPC_ALREADY_EXISTS,
|
||||
|
||||
/// The caller does not have permission to execute the specified operation.
|
||||
/// PERMISSION_DENIED must not be used for rejections caused by exhausting
|
||||
/// some resource (use RESOURCE_EXHAUSTED instead for those errors).
|
||||
/// PERMISSION_DENIED must not be used if the caller can not be identified
|
||||
/// (use UNAUTHENTICATED instead for those errors).
|
||||
RDC_STATUS_GRPC_PERM_DENIED,
|
||||
|
||||
/// The request does not have valid authentication credentials for the
|
||||
/// operation.
|
||||
RDC_STATUS_GRPC_UNAUTHENTICATED,
|
||||
|
||||
/// Some resource has been exhausted, perhaps a per-user quota, or perhaps the
|
||||
/// entire file system is out of space.
|
||||
RDC_STATUS_GRPC_RESOURCE_EXHAUSTED,
|
||||
|
||||
/// Operation was rejected because the system is not in a state required for
|
||||
/// the operation's execution. For example, directory to be deleted may be
|
||||
/// non-empty, an rmdir operation is applied to a non-directory, etc.
|
||||
///
|
||||
/// A litmus test that may help a service implementor in deciding
|
||||
/// between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:
|
||||
/// (a) Use UNAVAILABLE if the client can retry just the failing call.
|
||||
/// (b) Use ABORTED if the client should retry at a higher-level
|
||||
/// (e.g., restarting a read-modify-write sequence).
|
||||
/// (c) Use FAILED_PRECONDITION if the client should not retry until
|
||||
/// the system state has been explicitly fixed. E.g., if an "rmdir"
|
||||
/// fails because the directory is non-empty, FAILED_PRECONDITION
|
||||
/// should be returned since the client should not retry unless
|
||||
/// they have first fixed up the directory by deleting files from it.
|
||||
/// (d) Use FAILED_PRECONDITION if the client performs conditional
|
||||
/// REST Get/Update/Delete on a resource and the resource on the
|
||||
/// server does not match the condition. E.g., conflicting
|
||||
/// read-modify-write on the same resource.
|
||||
RDC_STATUS_GRPC_FAILED_PRECOND,
|
||||
|
||||
/// The operation was aborted, typically due to a concurrency issue like
|
||||
/// sequencer check failures, transaction aborts, etc.
|
||||
///
|
||||
/// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED,
|
||||
/// and UNAVAILABLE.
|
||||
RDC_STATUS_GRPC_ABORTED,
|
||||
|
||||
/// Operation was attempted past the valid range. E.g., seeking or reading
|
||||
/// past end of file.
|
||||
///
|
||||
/// Unlike INVALID_ARGUMENT, this error indicates a problem that may be fixed
|
||||
/// if the system state changes. For example, a 32-bit file system will
|
||||
/// generate INVALID_ARGUMENT if asked to read at an offset that is not in the
|
||||
/// range [0,2^32-1], but it will generate OUT_OF_RANGE if asked to read from
|
||||
/// an offset past the current file size.
|
||||
///
|
||||
/// There is a fair bit of overlap between FAILED_PRECONDITION and
|
||||
/// OUT_OF_RANGE. We recommend using OUT_OF_RANGE (the more specific error)
|
||||
/// when it applies so that callers who are iterating through a space can
|
||||
/// easily look for an OUT_OF_RANGE error to detect when they are done.
|
||||
RDC_STATUS_GRPC_OUT_OF_RANGE,
|
||||
|
||||
/// Operation is not implemented or not supported/enabled in this service.
|
||||
RDC_STATUS_GRPC_UNIMPLEMENTED,
|
||||
|
||||
/// Internal errors. Means some invariants expected by the underlying system have
|
||||
/// been broken. If you see one of these errors, something is very broken.
|
||||
RDC_STATUS_GRPC_INTERNAL,
|
||||
|
||||
/// The service is currently unavailable. This is most likely a transient
|
||||
/// condition and may be corrected by retrying with a backoff.
|
||||
///
|
||||
/// \warning Although data MIGHT not have been transmitted when this
|
||||
/// status occurs, there is NOT A GUARANTEE that the server has not seen
|
||||
/// anything. So in general it is unsafe to retry on this status code
|
||||
/// if the call is non-idempotent.
|
||||
///
|
||||
/// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED,
|
||||
/// and UNAVAILABLE.
|
||||
RDC_STATUS_GRPC_UNAVAILABLE,
|
||||
|
||||
/// Unrecoverable data loss or corruption.
|
||||
RDC_STATUS_GRPC_DATA_LOSS,
|
||||
|
||||
RDC_STATUS_CLIENT_ERR_FIRST = 2000,
|
||||
|
||||
/// SSL authentication error occurred.
|
||||
RDC_STATUS_CLIENT_ERR_SSL = RDC_STATUS_CLIENT_ERR_FIRST,
|
||||
|
||||
RDC_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
|
||||
} rdc_status_t;
|
||||
|
||||
/**
|
||||
* @brief Handle to RDC server channel
|
||||
*/
|
||||
typedef uintptr_t rdc_channel_t;
|
||||
|
||||
#define RDC_DEFAULT_SERVER_PORT 50051
|
||||
#define RDC_DEFAULT_SERVER_IP "localhost"
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup RDCAdmin RDC Administration Functions
|
||||
* These administrative functions are used to monitor and control, for
|
||||
* example RDC connectivity.
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Check the connection status of a channel
|
||||
*
|
||||
* @details Given an ::rdc_channel_t @p channel and a boolean @p
|
||||
* try_to_connect, this function will return the grpc_connectivity_state for
|
||||
* that channel
|
||||
*
|
||||
* @param[in] channel The channel for which the status will be given
|
||||
*
|
||||
* @param[in] try_to_connect If the channel is currently IDLE, if the argument
|
||||
* is true, transition to CONNECTING.
|
||||
*
|
||||
* @param[inout] state A pointer to caller provided memory to which
|
||||
* the grpc_connectivity_state will be written. grpc_connectivity_state has
|
||||
* the following possible values:
|
||||
* GRPC_CHANNEL_IDLE channel is idle
|
||||
* GRPC_CHANNEL_CONNECTING channel is connecting
|
||||
* GRPC_CHANNEL_READY channel is ready for work
|
||||
* GRPC_CHANNEL_TRANSIENT_FAILURE channel has seen a failure but expects to
|
||||
* recover
|
||||
* GRPC_CHANNEL_SHUTDOWN channel has seen a failure that it cannot
|
||||
* recover from
|
||||
*
|
||||
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
|
||||
grpc_connectivity_state* state);
|
||||
|
||||
/**
|
||||
* @brief Verify a channel's connection to the server
|
||||
*
|
||||
* @details Given an ::rdc_channel_t @p channel, this function will send a
|
||||
* random number to the server associated with @p channel. The server will send
|
||||
* the number back. Upon receiving the returned message from the server, the
|
||||
* number sent to the server is compared to the number received from the
|
||||
* server. If the 2 numbers are the same, the connection is verified.
|
||||
* Otherwise, an appropriate error code is returned.
|
||||
*
|
||||
* @param[in] channel The channel for which the connection will be verified
|
||||
*
|
||||
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel);
|
||||
|
||||
/** @} */ // end of RDCAdmin
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup InitShutAdmin Initialization and Shutdown
|
||||
* These functions are used for initialization of RDC and clean up when
|
||||
* done.
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Create a communications channel to an RDC server
|
||||
*
|
||||
* @details Given a pointer to an ::rdc_channel_t @p channel, a string
|
||||
* containing the ip address of the server @p ip, a string containing
|
||||
* the port number on which the server is listening @p port and a bool
|
||||
* indicating whether the channel should use a secure link @p secure,
|
||||
* this function will attempt to create a new channel and write its
|
||||
* location to address pointed to by @p channel.
|
||||
*
|
||||
* @param[inout] channel A pointer to caller provided memory to which an
|
||||
* ::rdc_channel_t will be written
|
||||
*
|
||||
* @param[in] ip A pointer to a string containing the address of the server.
|
||||
* If nullptr is passed for this parameter, RDC_DEFAULT_SERVER_IP will be used.
|
||||
*
|
||||
* @param[in] port A pointer to string containing the port on which the
|
||||
* RDC server is listening. If nullptr is passed for this parameter,
|
||||
* RDC_DEFAULT_SERVER_PORT will be used.
|
||||
*
|
||||
* @param[in] secure A bool indicating whether SSL should be used for
|
||||
* communications (not currently supported)
|
||||
*
|
||||
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port,
|
||||
bool secure);
|
||||
|
||||
/**
|
||||
* @brief Destroy a communications channel to an RDC server
|
||||
*
|
||||
* @details Given an ::rdc_channel_t @p channel, this function will free any
|
||||
* resources used by @p channel
|
||||
*
|
||||
* @param[inout] channel The ::rdc_channel_t that will be freed
|
||||
*
|
||||
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_channel_destroy(rdc_channel_t channel);
|
||||
|
||||
/** @} */ // end of InitShutAdmin
|
||||
|
||||
/*****************************************************************************/
|
||||
/** @defgroup RSMIAccess Remote ROCm SMI Calls
|
||||
* These functions make ROCm SMI function calls on the remote server.
|
||||
* Please refer to the
|
||||
* [ROCm SMI documentation]
|
||||
* (https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/docs) for
|
||||
* information about the calls. Here, we will document any additional aspects
|
||||
* of the calls introduced by RDC that are not covered in the ROCm SMI
|
||||
* documentation.
|
||||
*
|
||||
* All of the functions in this section attempt to make an RSMI call on the
|
||||
* server machine, given an ::rdc_channel_t associated with the server, and
|
||||
* all the arguments that are required to make the RSMI call.
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Remote call to rsmi_num_monitor_devices()
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu);
|
||||
|
||||
/** @} */ // end of RSMIAccess
|
||||
|
||||
/** @defgroup PhysQuer Physical State Queries
|
||||
* These functions provide information about the physical characteristics of
|
||||
* the device.
|
||||
* @{
|
||||
*/
|
||||
/**
|
||||
* @brief Remote call to rsmi_dev_temp_metric_get()
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type,
|
||||
rsmi_temperature_metric_t metric, int64_t* temperature);
|
||||
|
||||
/**
|
||||
* @brief Remote call to rsmi_dev_fan_rpms_get()
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
|
||||
int64_t* rpms);
|
||||
|
||||
/**
|
||||
* @brief Remote call to rsmi_dev_fan_speed_get()
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
|
||||
int64_t* speed);
|
||||
|
||||
/**
|
||||
* @brief Remote call to rsmi_dev_fan_speed_max_get()
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
|
||||
uint64_t* max_speed);
|
||||
/** @} */ // end of PhysQuer
|
||||
|
||||
/**
|
||||
* @brief Get a description of a provided RDC error status
|
||||
*
|
||||
* @details Set the provided pointer to a const char *, @p status_string, to
|
||||
* a string containing a description of the provided error code @p status.
|
||||
*
|
||||
* @param[in] status The error status for which a description is desired
|
||||
*
|
||||
* @param[inout] status_string A pointer to a const char * which will be made
|
||||
* to point to a description of the provided error code
|
||||
*
|
||||
* @retval ::RDC_STATUS_SUCCESS is returned upon successful call
|
||||
*
|
||||
*/
|
||||
rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string);
|
||||
|
||||
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
|
||||
@@ -1,69 +0,0 @@
|
||||
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
|
||||
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
|
||||
|
||||
#include <grpcpp/grpcpp.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_client.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RDCChannel {
|
||||
public:
|
||||
explicit RDCChannel(std::string server_ip, std::string server_port, bool secure_channel);
|
||||
~RDCChannel();
|
||||
|
||||
rdc_status_t Initialize(void);
|
||||
|
||||
// Getters and Setters
|
||||
|
||||
// Don't have setter for server ip and ports; we don't want to change those
|
||||
// after construction
|
||||
std::string server_ip(void) const { return server_ip_; }
|
||||
std::string server_port(void) const { return server_port_; }
|
||||
bool secure_channel(void) const { return secure_channel_; }
|
||||
std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const { return rsmi_stub_; }
|
||||
std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const { return rdc_admin_stub_; }
|
||||
std::shared_ptr<grpc::Channel> const channel(void) { return channel_; }
|
||||
|
||||
private:
|
||||
std::string server_ip_;
|
||||
std::string server_port_;
|
||||
bool secure_channel_;
|
||||
std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub_;
|
||||
std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub_;
|
||||
std::shared_ptr<grpc::Channel> channel_;
|
||||
std::shared_ptr<grpc::ChannelCredentials> channel_creds_;
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
|
||||
@@ -1,34 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
|
||||
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
|
||||
|
||||
#include "rdc/rdc_client.h"
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
rdc_status_t GrpcErrorToRdcError(::grpc::StatusCode grpc_err);
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
|
||||
@@ -1,50 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_
|
||||
#define CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_
|
||||
|
||||
#include <exception>
|
||||
#include <string>
|
||||
|
||||
#include "rdc/rdc_client.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
/// @brief Exception type which carries an error code to return to the user.
|
||||
class rdc_exception : public std::exception {
|
||||
public:
|
||||
rdc_exception(rdc_status_t error, const std::string description)
|
||||
: err_(error), desc_(description) {}
|
||||
rdc_status_t error_code() const noexcept { return err_; }
|
||||
const char* what() const noexcept override { return desc_.c_str(); }
|
||||
|
||||
private:
|
||||
rdc_status_t err_;
|
||||
std::string desc_;
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_
|
||||
@@ -1,547 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2019 - Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rdc/rdc_client.h"
|
||||
|
||||
#include <grpcpp/grpcpp.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "common/rdc_utils.h"
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_client_main.h"
|
||||
#include "rdc/rdc_client_utils.h"
|
||||
#include "rdc/rdc_exception.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
// Makes the enclosing function return RDC_RSMI_STATUS_INVALID_ARGS when PTR is
// null. Kept as a bare `if` statement (no do/while wrapper) because call sites
// in this file chain it with other macros without a trailing semicolon.
#define CHK_PTR_ARG(PTR)                 \
  if (nullptr == (PTR)) {                \
    return RDC_RSMI_STATUS_INVALID_ARGS; \
  }
|
||||
|
||||
// Reinterprets the opaque handle UPTR as an amd::rdc::RDCChannel* and
// introduces it into the enclosing scope as `ch`. The enclosing function
// returns RDC_STATUS_GRPC_INVALID_ARG when the resulting pointer is null.
#define UINTPTR_TO_RDC_CHAN(UPTR)                                           \
  amd::rdc::RDCChannel* ch = reinterpret_cast<amd::rdc::RDCChannel*>(UPTR); \
  if (nullptr == ch) {                                                      \
    return RDC_STATUS_GRPC_INVALID_ARG;                                     \
  }
|
||||
|
||||
// Translates the exception currently being handled into an rdc_status_t.
//
// Must only be called from inside a catch block: it rethrows the in-flight
// exception with a bare `throw;` and dispatches on its dynamic type.
//
// Returns:
//   RDC_RSMI_STATUS_OUT_OF_RESOURCES    for std::bad_alloc,
//   the carried error_code()            for amd::rdc::rdc_exception,
//   RDC_RSMI_STATUS_INTERNAL_EXCEPTION  for any other std::exception
//                                       (after asserting in debug builds).
// A std::nested_exception has its nested exception rethrown to the caller;
// any other exception type asserts and then aborts the process.
static rdc_status_t handleException() {
  try {
    throw;
  } catch (const std::bad_alloc&) {
    debug_print("RDC exception: BadAlloc\n");
    return RDC_RSMI_STATUS_OUT_OF_RESOURCES;
  } catch (const amd::rdc::rdc_exception& e) {
    // rdc_exception derives from std::exception, so this handler has to come
    // before the generic std::exception handler below.
    debug_print("Exception caught: %s.\n", e.what());
    return e.error_code();
  } catch (const std::exception& e) {
    debug_print("Unhandled exception: %s\n", e.what());
    assert(false && "Unhandled exception.");
    return RDC_RSMI_STATUS_INTERNAL_EXCEPTION;
  } catch (const std::nested_exception& e) {
    debug_print("Callback threw, forwarding.\n");
    e.rethrow_nested();
    // Not reached; keeps every path returning a value for older compilers.
    return RDC_RSMI_STATUS_INTERNAL_EXCEPTION;
  } catch (...) {
    assert(false && "Unhandled exception.");
    abort();
    // Not reached; keeps every path returning a value for older compilers.
    return RDC_RSMI_STATUS_INTERNAL_EXCEPTION;
  }
}
|
||||
|
||||
// TRY opens a try block and CATCH closes it, routing every exception through
// handleException() so that it is returned as a status code instead of
// propagating. Each API function body in this file is bracketed by this pair.
#define TRY try {
#define CATCH                 \
  }                           \
  catch (...) {               \
    return handleException(); \
  }
|
||||
|
||||
// Creates and initializes an RDC channel and stores its opaque handle in
// *channel.
//
// channel: out-parameter receiving the handle; must not be null.
// ip:      server address, or null to use RDC_DEFAULT_SERVER_IP.
// port:    server port string, or null to use RDC_DEFAULT_SERVER_PORT.
// secure:  forwarded to RDCChannel to select a secure or insecure channel.
//
// Returns RDC_STATUS_SUCCESS on success, RDC_STATUS_GRPC_INVALID_ARG when
// channel is null, or the status reported by RDCChannel::Initialize().
// Exceptions are converted to a status code by the TRY/CATCH wrapper.
rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port,
                                bool secure) {
  TRY
  if (channel == nullptr) {
    return RDC_STATUS_GRPC_INVALID_ARG;
  }

  std::string server_str;
  if (ip != nullptr) {
    server_str = ip;
  } else {
    server_str = RDC_DEFAULT_SERVER_IP;
  }

  std::string port_str;
  if (port != nullptr) {
    port_str = port;
  } else {
    port_str = std::to_string(RDC_DEFAULT_SERVER_PORT);
  }

  // Plain `new` reports allocation failure by throwing std::bad_alloc, which
  // CATCH converts to a status code, so no null check is needed here.
  amd::rdc::RDCChannel* ch = new amd::rdc::RDCChannel(server_str, port_str, secure);

  rdc_status_t ret;
  try {
    ret = ch->Initialize();
  } catch (...) {
    // Do not leak the channel if initialization throws; the rethrown
    // exception is still translated by the enclosing CATCH.
    delete ch;
    throw;
  }

  if (ret != RDC_STATUS_SUCCESS) {
    delete ch;
    return ret;
  }

  *channel = reinterpret_cast<rdc_channel_t>(ch);
  return RDC_STATUS_SUCCESS;
  CATCH
}
|
||||
// Reports the connectivity state of the gRPC channel behind `channel`.
//
// try_to_connect is forwarded unchanged to the channel's GetState().
// Returns RDC_RSMI_STATUS_INVALID_ARGS when state is null,
// RDC_STATUS_GRPC_INVALID_ARG when channel is null, else RDC_STATUS_SUCCESS.
rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
                                   grpc_connectivity_state* state) {
  TRY
  CHK_PTR_ARG(state)
  UINTPTR_TO_RDC_CHAN(channel)

  const grpc_connectivity_state current = ch->channel()->GetState(try_to_connect);
  *state = current;

  return RDC_STATUS_SUCCESS;
  CATCH
}
|
||||
|
||||
// Verifies the connection by sending a pseudo-random magic number through the
// RdcAdmin VerifyConnection RPC and checking that the reply echoes it back.
//
// Returns RDC_STATUS_SUCCESS when the echo matches, RDC_STATUS_GRPC_DATA_LOSS
// when it does not, the mapped gRPC error when the RPC itself fails, or
// RDC_STATUS_GRPC_INVALID_ARG when channel is null.
rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel) {
  TRY
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::VerifyConnectionResponse reply;
  ::rdc::VerifyConnectionRequest request;
  ::grpc::ClientContext ctx;

  // Seed from the wall clock so the magic number differs between calls.
  unsigned int seed = time(NULL);
  const uint64_t magic = static_cast<uint64_t>(rand_r(&seed));
  request.set_magic_num(magic);

  const ::grpc::Status rpc_status = ch->rdc_admin_stub()->VerifyConnection(&ctx, request, &reply);
  if (!rpc_status.ok()) {
    return amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  const bool echoed = (reply.echo_magic_num() == request.magic_num());
  return echoed ? RDC_STATUS_SUCCESS : RDC_STATUS_GRPC_DATA_LOSS;
  CATCH
}
|
||||
|
||||
// Destroys the RDCChannel object behind the handle `channel`.
//
// Returns RDC_STATUS_GRPC_INVALID_ARG when channel is null, otherwise
// RDC_STATUS_SUCCESS after the channel object has been deleted.
rdc_status_t rdc_channel_destroy(rdc_channel_t channel) {
  TRY
  UINTPTR_TO_RDC_CHAN(channel)

  delete ch;
  return RDC_STATUS_SUCCESS;
  CATCH
}
|
||||
|
||||
// Queries the server for its device count through the Rsmi GetNumDevices RPC.
//
// On a successful RPC, *num_gpu receives the reported value and the server's
// ret_val is returned as the status. When the RPC fails, *num_gpu is left
// untouched and the mapped gRPC error is returned.
rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu) {
  TRY
  CHK_PTR_ARG(num_gpu)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetNumDevicesResponse reply;
  ::rdc::GetNumDevicesRequest request;  // carries no arguments
  ::grpc::ClientContext ctx;

  const ::grpc::Status rpc_status = ch->rsmi_stub()->GetNumDevices(&ctx, request, &reply);
  if (!rpc_status.ok()) {
    return amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  *num_gpu = reply.val();
  return static_cast<rdc_status_t>(reply.ret_val());
  CATCH
}
|
||||
|
||||
// rsmi and rdc currently happen to have a 1-to-1 mapping, but
|
||||
// have this function in case that changes
|
||||
static ::rdc::GetTemperatureRequest_TemperatureMetric rsmi_temp2rdc_temp(
|
||||
rsmi_temperature_metric_t rsmi_temp) {
|
||||
return static_cast<::rdc::GetTemperatureRequest_TemperatureMetric>(rsmi_temp);
|
||||
}
|
||||
|
||||
// Fetches one temperature metric for device dv_ind / sensor sensor_type via
// the Rsmi GetTemperature RPC.
//
// On a successful RPC, *temperature receives the reported value and the
// server's ret_val is returned as the status. When the RPC fails,
// *temperature is left untouched and the mapped gRPC error is returned.
rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type,
                                     rsmi_temperature_metric_t metric, int64_t* temperature) {
  TRY
  CHK_PTR_ARG(temperature)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetTemperatureResponse reply;
  ::rdc::GetTemperatureRequest request;
  ::grpc::ClientContext ctx;

  request.set_metric(rsmi_temp2rdc_temp(metric));
  request.set_dv_ind(dv_ind);
  request.set_sensor_type(sensor_type);

  const ::grpc::Status rpc_status = ch->rsmi_stub()->GetTemperature(&ctx, request, &reply);
  if (!rpc_status.ok()) {
    return ::amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  *temperature = reply.temperature();
  return static_cast<rdc_status_t>(reply.ret_val());
  CATCH
}
|
||||
|
||||
// Fetches the fan RPM reading for device dv_ind / sensor sensor_ind via the
// Rsmi GetFanRpms RPC.
//
// On a successful RPC, *rpms receives the reported value and the server's
// ret_val is returned as the status. When the RPC fails, *rpms is left
// untouched and the mapped gRPC error is returned.
rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
                                  int64_t* rpms) {
  TRY
  CHK_PTR_ARG(rpms)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetFanRpmsResponse reply;
  ::rdc::GetFanRpmsRequest request;
  ::grpc::ClientContext ctx;

  request.set_dv_ind(dv_ind);
  request.set_sensor_ind(sensor_ind);

  const ::grpc::Status rpc_status = ch->rsmi_stub()->GetFanRpms(&ctx, request, &reply);
  if (!rpc_status.ok()) {
    return ::amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  *rpms = reply.rpms();
  return static_cast<rdc_status_t>(reply.ret_val());
  CATCH
}
|
||||
|
||||
// Fetches the fan speed reading for device dv_ind / sensor sensor_ind via the
// Rsmi GetFanSpeed RPC.
//
// On a successful RPC, *speed receives the reported value and the server's
// ret_val is returned as the status. When the RPC fails, *speed is left
// untouched and the mapped gRPC error is returned.
rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
                                   int64_t* speed) {
  TRY
  CHK_PTR_ARG(speed)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetFanSpeedResponse reply;
  ::rdc::GetFanSpeedRequest request;
  ::grpc::ClientContext ctx;

  request.set_dv_ind(dv_ind);
  request.set_sensor_ind(sensor_ind);

  const ::grpc::Status rpc_status = ch->rsmi_stub()->GetFanSpeed(&ctx, request, &reply);
  if (!rpc_status.ok()) {
    return ::amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  *speed = reply.speed();
  return static_cast<rdc_status_t>(reply.ret_val());
  CATCH
}
|
||||
|
||||
// Fetches the maximum fan speed for device dv_ind / sensor sensor_ind via the
// Rsmi GetFanSpeedMax RPC.
//
// On a successful RPC, *max_speed receives the reported value and the
// server's ret_val is returned as the status. When the RPC fails, *max_speed
// is left untouched and the mapped gRPC error is returned.
rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
                                       uint64_t* max_speed) {
  TRY
  CHK_PTR_ARG(max_speed)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetFanSpeedMaxResponse reply;
  ::rdc::GetFanSpeedMaxRequest request;
  ::grpc::ClientContext ctx;

  request.set_dv_ind(dv_ind);
  request.set_sensor_ind(sensor_ind);

  const ::grpc::Status rpc_status = ch->rsmi_stub()->GetFanSpeedMax(&ctx, request, &reply);
  if (!rpc_status.ok()) {
    return ::amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  *max_speed = reply.max_speed();
  return static_cast<rdc_status_t>(reply.ret_val());
  CATCH
}
|
||||
|
||||
// Maps an rdc_status_t to a static, human-readable description.
//
// status:        the status code to describe.
// status_string: out-parameter receiving a pointer to a string literal; the
//                caller must not free or modify it.
//
// Returns RDC_RSMI_STATUS_INVALID_ARGS when status_string is null,
// RDC_RSMI_STATUS_UNKNOWN_ERROR (with a generic message stored in
// *status_string) when the code is not recognised, else RDC_STATUS_SUCCESS.
//
// The messages are assembled from adjacent string literals, so every fragment
// that continues a sentence has to carry its own separating space.
rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string) {
  TRY
  if (status_string == nullptr) {
    return RDC_RSMI_STATUS_INVALID_ARGS;
  }

  const size_t status_u = static_cast<size_t>(status);
  switch (status_u) {
    case RDC_STATUS_SUCCESS:
      *status_string =
          "RDC_STATUS_SUCCESS: The function has been executed"
          " successfully.";
      break;

    case RDC_RSMI_STATUS_INVALID_ARGS:
      *status_string =
          "RDC_RSMI_STATUS_INVALID_ARGS: The provided arguments do not"
          " meet the preconditions required for calling this function.";
      break;

    case RDC_RSMI_STATUS_NOT_SUPPORTED:
      *status_string =
          "RDC_RSMI_STATUS_NOT_SUPPORTED: This function is not"
          " supported in the current environment.";
      break;

    case RDC_RSMI_STATUS_FILE_ERROR:
      *status_string =
          "RDC_RSMI_STATUS_FILE_ERROR: There was an error in finding or"
          " opening a file or directory. The operation may not be supported by "
          "this Linux kernel version.";
      break;

    case RDC_RSMI_STATUS_PERMISSION:
      *status_string =
          "RDC_RSMI_STATUS_PERMISSION: The user ID of the calling"
          " process does not have sufficient permission to execute a command."
          " Often this is fixed by running as root (sudo).";
      break;

    case RDC_RSMI_STATUS_OUT_OF_RESOURCES:
      *status_string =
          "RDC_RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire "
          "memory or other resource";
      break;

    case RDC_RSMI_STATUS_INTERNAL_EXCEPTION:
      *status_string =
          "RDC_RSMI_STATUS_INTERNAL_EXCEPTION: An internal "
          "exception was caught";
      break;

    case RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS:
      *status_string =
          "RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided "
          "input is out of allowable or safe range";
      break;

    case RDC_RSMI_STATUS_INIT_ERROR:
      *status_string =
          "RDC_RSMI_STATUS_INIT_ERROR: An error occurred during "
          "initialization, during "
          "monitor discovery or when initializing internal data structures";
      break;

    case RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED:
      *status_string =
          "RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: The called "
          "function has not been implemented in this "
          "system for this device type";
      break;

    case RDC_RSMI_STATUS_NOT_FOUND:
      *status_string =
          "RDC_RSMI_STATUS_NOT_FOUND: An item required to "
          "complete the call was not found";
      break;

    case RDC_RSMI_STATUS_INSUFFICIENT_SIZE:
      *status_string =
          "RDC_RSMI_STATUS_INSUFFICIENT_SIZE: Not enough "
          "resources were available to fully execute"
          " the call";
      break;

    case RDC_RSMI_STATUS_UNKNOWN_ERROR:
      *status_string =
          "An unknown error prevented the call from completing"
          " successfully";
      break;

    case RDC_RSMI_STATUS_INTERRUPT:
      *status_string =
          "RDC_RSMI_STATUS_INTERRUPT An interrupt occurred while "
          "executing the function";
      break;

    case RDC_STATUS_GRPC_CANCELLED:
      *status_string =
          "RDC_STATUS_GRPC_CANCELLED The operation was cancelled (typically by "
          "the caller).";
      break;

    case RDC_STATUS_GRPC_UNKNOWN:
      *status_string =
          "RDC_STATUS_GRPC_UNKNOWN Unknown error. An example of where this error"
          " may be returned is if a "
          "Status value received from another address space belongs to an error-"
          "space that is not known in this address space. Also errors raised by "
          "APIs that do not return enough error information may be converted to "
          "this error.";
      break;

    case RDC_STATUS_GRPC_INVALID_ARG:
      *status_string =
          "RDC_STATUS_GRPC_INVALID_ARG Client specified an invalid argument. "
          "Note that this differs from "
          "FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are "
          "problematic regardless of the state of the system (e.g., a malformed "
          "file name).";
      break;

    case RDC_STATUS_GRPC_DEADLINE_EXCEEDED:
      *status_string =
          "RDC_STATUS_GRPC_DEADLINE_EXCEEDED Deadline expired before operation "
          "could complete. For operations that "
          "change the state of the system, this error may be returned even if "
          "the operation has completed successfully. For example, a successful "
          "response from a server could have been delayed long enough for the "
          "deadline to expire.";
      break;

    case RDC_STATUS_GRPC_NOT_FOUND:
      *status_string =
          "RDC_STATUS_GRPC_NOT_FOUND Some requested entity (e.g., file or "
          "directory) was not found.";
      break;

    case RDC_STATUS_GRPC_ALREADY_EXISTS:
      *status_string =
          "RDC_STATUS_GRPC_ALREADY_EXISTS Some entity that we "
          "attempted to create "
          "(e.g., file or directory) already exists.";
      break;

    case RDC_STATUS_GRPC_PERM_DENIED:
      *status_string =
          "RDC_STATUS_GRPC_PERM_DENIED The caller does not have permission to "
          "execute the specified operation. "
          "PERMISSION_DENIED must not be used for rejections caused by "
          "exhausting some resource (use RESOURCE_EXHAUSTED instead for those "
          "errors). PERMISSION_DENIED must not be used if the caller can not "
          "be identified (use UNAUTHENTICATED instead for those errors).";
      break;

    case RDC_STATUS_GRPC_UNAUTHENTICATED:
      *status_string =
          "RDC_STATUS_GRPC_UNAUTHENTICATED The request does not have valid "
          "authentication credentials for the operation.";
      break;

    case RDC_STATUS_GRPC_RESOURCE_EXHAUSTED:
      *status_string =
          "RDC_STATUS_GRPC_RESOURCE_EXHAUSTED Some resource has been exhausted, "
          "perhaps a per-user quota, or perhaps the "
          "entire file system is out of space.";
      break;

    case RDC_STATUS_GRPC_FAILED_PRECOND:
      *status_string =
          "RDC_STATUS_GRPC_FAILED_PRECOND Operation was rejected because the "
          "system is not in a state required for "
          "the operation's execution. For example, directory to be deleted may "
          "be non-empty, an rmdir operation is applied to a non-directory, etc.\n"
          "A litmus test that may help a service implementor in deciding "
          "between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:\n"
          " (a) Use UNAVAILABLE if the client can retry just the failing call.\n"
          " (b) Use ABORTED if the client should retry at a higher-level"
          " (e.g., restarting a read-modify-write sequence).\n"
          " (c) Use FAILED_PRECONDITION if the client should not retry until"
          " the system state has been explicitly fixed. E.g., if an \"rmdir\""
          " fails because the directory is non-empty, FAILED_PRECONDITION"
          " should be returned since the client should not retry unless"
          " they have first fixed up the directory by deleting files from it.\n"
          " (d) Use FAILED_PRECONDITION if the client performs conditional"
          " REST Get/Update/Delete on a resource and the resource on the"
          " server does not match the condition. E.g., conflicting"
          " read-modify-write on the same resource.";
      break;

    case RDC_STATUS_GRPC_ABORTED:
      *status_string =
          "RDC_STATUS_GRPC_ABORTED The operation was aborted, "
          "typically due to a concurrency issue like "
          "sequencer check failures, transaction aborts, etc.\n"
          "See litmus test above for deciding between "
          "FAILED_PRECONDITION, ABORTED, "
          "and UNAVAILABLE.";
      break;

    case RDC_STATUS_GRPC_OUT_OF_RANGE:
      *status_string =
          "RDC_STATUS_GRPC_OUT_OF_RANGE Operation was attempted "
          "past the valid range. E.g., seeking or reading "
          "past end of file.\n"
          "Unlike INVALID_ARGUMENT, this error indicates a "
          "problem that may be fixed "
          "if the system state changes. For example, a 32-bit file system will "
          "generate INVALID_ARGUMENT if asked to read "
          "at an offset that is not in the "
          "range [0,2^32-1], but it will generate "
          "OUT_OF_RANGE if asked to read from "
          "an offset past the current file size.\n"
          "There is a fair bit of overlap between FAILED_PRECONDITION and "
          "OUT_OF_RANGE. We recommend using OUT_OF_RANGE "
          "(the more specific error) "
          "when it applies so that callers who are "
          "iterating through a space can "
          "easily look for an OUT_OF_RANGE error to detect when they are done.";
      break;

    case RDC_STATUS_GRPC_UNIMPLEMENTED:
      *status_string =
          "RDC_STATUS_GRPC_UNIMPLEMENTED Operation is not "
          "implemented or not supported/enabled in this service.";
      break;

    case RDC_STATUS_GRPC_INTERNAL:
      *status_string =
          "RDC_STATUS_GRPC_INTERNAL Internal errors. This means "
          "some invariants expected by underlying System has "
          "been broken. If you see one of these errors, "
          "something is very broken.";
      break;

    case RDC_STATUS_GRPC_UNAVAILABLE:
      *status_string =
          "RDC_STATUS_GRPC_UNAVAILABLE The service is currently unavailable. "
          "This is most likely a transient "
          "condition and may be corrected by retrying with a backoff.\n"
          "Warning: Although data MIGHT not have been transmitted when this "
          "status occurs, there is NOT A GUARANTEE that the server has not seen "
          "anything. So in general it is unsafe to retry on this status code "
          "if the call is non-idempotent. "
          "See litmus test above for deciding between "
          "FAILED_PRECONDITION, ABORTED, "
          "and UNAVAILABLE.";
      break;

    case RDC_STATUS_GRPC_DATA_LOSS:
      *status_string = "RDC_STATUS_GRPC_DATA_LOSS Unrecoverable data loss or corruption.";
      break;

    case RDC_STATUS_UNKNOWN_ERROR:
      *status_string = "RDC_STATUS_UNKNOWN_ERROR An unknown RDC error occurred.";
      break;

    case RDC_STATUS_CLIENT_ERR_SSL:
      *status_string = "An error occurred when executing SSL authentication operations.";
      break;

    default:
      // Unrecognised code: still hand back a usable message, but report the
      // lookup failure through the return value.
      *status_string =
          "RDC_RSMI_STATUS_UNKNOWN_ERROR An "
          "unknown error occurred";
      return RDC_RSMI_STATUS_UNKNOWN_ERROR;
  }
  return RDC_STATUS_SUCCESS;
  CATCH
}
|
||||
@@ -1,177 +0,0 @@
|
||||
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rdc/rdc_client_main.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <grpcpp/grpcpp.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "common/rdc_utils.h"
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_client.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Default on-disk locations of the client's TLS material. Both the pointers
// and the pointed-to text are const so the defaults cannot be reassigned at
// run time.
#ifdef USE_PINNED_CERTS
// Pinned certificates
static const char* const kDefaultRDCServerCertPinPath = "/etc/rdc/server/rdc_server.crt";
static const char* const kDefaultRDCClientKeyPinPath = "/etc/rdc/client/private/rdc_client.key";
static const char* const kDefaultRDCClientCertPinPath = "/etc/rdc/client/rdc_client.crt";
#endif  // USE_PINNED_CERTS

// PKI certificates
static const char* const kDefaultRDCClientCertKeyPkiPath =
    "/etc/rdc/client/private/rdc_client_cert.key";
static const char* const kDefaultRDCClientCertPemPkiPath =
    "/etc/rdc/client/certs/rdc_client_cert.pem";
static const char* const kDefaultRDCClientCACertPemPkiPath =
    "/etc/rdc/client/certs/rdc_cacert.pem";
|
||||
|
||||
RDCChannel::RDCChannel(std::string server_ip, std::string server_port, bool secure)
|
||||
: server_ip_(server_ip), server_port_(server_port), secure_channel_(secure) {}
|
||||
|
||||
RDCChannel::~RDCChannel() {}
|
||||
|
||||
#ifdef USE_PINNED_CERTS
// Fills *ssl_opts from the pinned-certificate files at the default paths.
//
// Returns 0 on success, -EINVAL when ssl_opts is null, -ENOENT when any of the
// three files is missing, or the non-zero value returned by ReadFile().
// *ssl_opts is only written once all three files have been read.
static int ConstructSSLOptsPin(grpc::SslCredentialsOptions* ssl_opts) {
  assert(ssl_opts != nullptr);
  if (ssl_opts == nullptr) {
    return -EINVAL;
  }

  // Ensure the required paths exist before going forward.
  // TODO(cfreehil): override these defaults with values read from config
  // file
  const bool all_present = amd::rdc::FileExists(kDefaultRDCClientKeyPinPath) &&
                           amd::rdc::FileExists(kDefaultRDCServerCertPinPath) &&
                           amd::rdc::FileExists(kDefaultRDCClientCertPinPath);
  if (!all_present) {
    return -ENOENT;
  }

  std::string client_key;
  std::string server_cert;
  std::string client_cert;

  int rc = amd::rdc::ReadFile(kDefaultRDCClientKeyPinPath, &client_key);
  if (rc) {
    return rc;
  }

  rc = amd::rdc::ReadFile(kDefaultRDCServerCertPinPath, &server_cert);
  if (rc) {
    return rc;
  }

  rc = amd::rdc::ReadFile(kDefaultRDCClientCertPinPath, &client_cert);
  if (rc) {
    return rc;
  }

  // With pinning, the server's own certificate serves as the trust root.
  ssl_opts->pem_root_certs = server_cert;
  ssl_opts->pem_private_key = client_key;
  ssl_opts->pem_cert_chain = client_cert;

  return 0;
}
#endif  // USE_PINNED_CERTS
|
||||
|
||||
// Fills *ssl_opts from the PKI files (CA certificate, client key, client
// certificate) at the default paths.
//
// Returns 0 on success, -EINVAL when ssl_opts is null, -ENOENT when any of the
// three files is missing, or the non-zero value returned by ReadFile().
// *ssl_opts is only written once all three files have been read.
static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions* ssl_opts) {
  assert(ssl_opts != nullptr);
  if (ssl_opts == nullptr) {
    return -EINVAL;
  }

  // Ensure the required paths exist before going forward.
  // TODO(cfreehil): override these defaults with values read from config
  // file
  const bool all_present = amd::rdc::FileExists(kDefaultRDCClientCertKeyPkiPath) &&
                           amd::rdc::FileExists(kDefaultRDCClientCertPemPkiPath) &&
                           amd::rdc::FileExists(kDefaultRDCClientCACertPemPkiPath);
  if (!all_present) {
    return -ENOENT;
  }

  std::string ca_cert;
  std::string client_key;
  std::string client_chain;

  int rc = amd::rdc::ReadFile(kDefaultRDCClientCACertPemPkiPath, &ca_cert);
  if (rc) {
    return rc;
  }

  rc = amd::rdc::ReadFile(kDefaultRDCClientCertKeyPkiPath, &client_key);
  if (rc) {
    return rc;
  }

  rc = amd::rdc::ReadFile(kDefaultRDCClientCertPemPkiPath, &client_chain);
  if (rc) {
    return rc;
  }

  ssl_opts->pem_root_certs = ca_cert;
  ssl_opts->pem_private_key = client_key;
  ssl_opts->pem_cert_chain = client_chain;

  return 0;
}
|
||||
|
||||
// Creates the gRPC channel to "<server_ip>:<server_port>" and the Rsmi and
// RdcAdmin stubs on top of it.
//
// When the channel was constructed as secure, SSL credentials are loaded from
// disk first (pinned certificates when USE_PINNED_CERTS is defined, PKI
// certificates otherwise); otherwise insecure credentials are used.
//
// Returns RDC_STATUS_CLIENT_ERR_SSL when the credentials cannot be loaded,
// RDC_STATUS_GRPC_RESOURCE_EXHAUSTED when a stub could not be created, else
// RDC_STATUS_SUCCESS. No request is sent to the server here.
rdc_status_t RDCChannel::Initialize(void) {
  assert(!server_port_.empty());
  assert(!server_ip_.empty());

  std::string target = server_ip() + ":";
  target += server_port();

  if (!secure_channel_) {
    channel_ = ::grpc::CreateChannel(target, grpc::InsecureChannelCredentials());
  } else {
    grpc::SslCredentialsOptions ssl_opts;

#ifdef USE_PINNED_CERTS
    const int ssl_rc = ConstructSSLOptsPin(&ssl_opts);
#else
    const int ssl_rc = ConstructSSLOptsPKI(&ssl_opts);
#endif
    if (ssl_rc) {
      std::cerr << "Failed to process OpenSSL keys and certificates." << std::endl;
      return RDC_STATUS_CLIENT_ERR_SSL;
    }

    channel_creds_ = grpc::SslCredentials(ssl_opts);
    channel_ = grpc::CreateChannel(target, channel_creds_);
  }

  rsmi_stub_ = ::rdc::Rsmi::NewStub(channel_);
  if (rsmi_stub_ == nullptr) {
    return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED;
  }

  rdc_admin_stub_ = ::rdc::RdcAdmin::NewStub(channel_);
  if (rdc_admin_stub_ == nullptr) {
    return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED;
  }

  // Test to see if we can connect to server; if not, return err.
  // NOTE(review): no such connectivity test is performed at present.
  return RDC_STATUS_SUCCESS;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -1,40 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rdc/rdc_client_utils.h"
|
||||
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_client.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Maps a gRPC status code into the RDC status space by adding it, as an
// unsigned 32-bit offset, to RDC_STATUS_GRPC_ERR_FIRST.
rdc_status_t GrpcErrorToRdcError(grpc::StatusCode grpc_err) {
  const uint32_t offset = static_cast<uint32_t>(grpc_err);
  const uint32_t base = static_cast<uint32_t>(RDC_STATUS_GRPC_ERR_FIRST);
  const uint32_t combined = offset + base;

  return static_cast<rdc_status_t>(combined);
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -12,9 +12,9 @@ The RDC includes the following libraries:
|
||||
|
||||
• librdc_client.so: Exposes RDC functionality using gRPC client.
|
||||
|
||||
• librdc.so: RDC API. This depends on librocm_smi.so.
|
||||
• librdc.so: RDC API. This depends on libamd_smi.so.
|
||||
|
||||
• librocm_smi.so: Stateless low overhead access to GPU data.
|
||||
• libamd_smi.so: Stateless low overhead access to GPU data.
|
||||
|
||||

|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ NOTE: The RDC tool is tested on the following software versions. Earlier version
|
||||
|
||||
• g++ (5.4.0)
|
||||
|
||||
• AMD ROCm, which includes AMD ROCm SMI Library
|
||||
• AMD ROCm, which includes the AMD SMI Library
|
||||
|
||||
• gRPC and protoc
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@ RDC Command Line Tool (rdci)
|
||||
|
||||
A command-line tool to invoke all the features of the RDC tool. This CLI can be run locally or remotely.
|
||||
|
||||
ROCm-SMI Library
|
||||
AMDSMI Library
|
||||
|
||||
A stateless system management library that provides low-level interfaces to access GPU information
|
||||
|
||||
|
||||
@@ -257,7 +257,7 @@ typedef enum {
|
||||
//!< represents 32 bytes
|
||||
|
||||
// "Composite" events. These events have additional processing beyond
|
||||
// the value provided by the rocm_smi library.
|
||||
// the value provided by the amd_smi library.
|
||||
RDC_EVNT_XGMI_0_THRPUT = 1500, //!< Transmit throughput to XGMI
|
||||
//!< neighbor 0 in bytes/sec
|
||||
RDC_EVNT_XGMI_1_THRPUT, //!< Transmit throughput to XGMI
|
||||
|
||||
@@ -34,8 +34,8 @@ namespace rdc {
|
||||
|
||||
class RdcMetricFetcher {
|
||||
public:
|
||||
virtual rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) = 0;
|
||||
virtual rdc_status_t delete_rsmi_handle(RdcFieldKey fk) = 0;
|
||||
virtual rdc_status_t acquire_smi_handle(RdcFieldKey fk) = 0;
|
||||
virtual rdc_status_t delete_smi_handle(RdcFieldKey fk) = 0;
|
||||
|
||||
virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
|
||||
rdc_field_value* value) = 0;
|
||||
|
||||
@@ -29,9 +29,9 @@ THE SOFTWARE.
|
||||
#include <mutex> // NOLINT(build/c++11)
|
||||
#include <queue>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -44,20 +44,20 @@ struct MetricValue {
|
||||
rdc_field_value value;
|
||||
};
|
||||
|
||||
// This union represents any RSMI handles require initialization and/or
|
||||
// This union represents any SMI handles that require initialization and/or
|
||||
// shut down. There should only be one instance of this for each raw event
|
||||
// used. For example, if a field group includes a pseudo-event and the
|
||||
// underlying raw event, then only one FieldRSMIData should be created,
|
||||
// underlying raw event, then only one FieldSMIData should be created,
|
||||
// and it should be used by both events.
|
||||
struct FieldRSMIData {
|
||||
struct FieldSMIData {
|
||||
union {
|
||||
rsmi_event_handle_t evt_handle;
|
||||
amdsmi_event_handle_t evt_handle;
|
||||
};
|
||||
union {
|
||||
rsmi_counter_value_t counter_val;
|
||||
amdsmi_counter_value_t counter_val;
|
||||
};
|
||||
~FieldRSMIData() {}
|
||||
FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0} {}
|
||||
~FieldSMIData() {}
|
||||
FieldSMIData() : evt_handle(0), counter_val{0, 0, 0} {}
|
||||
};
|
||||
|
||||
//!< The data structure to store the async fetch task
|
||||
@@ -77,11 +77,11 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher {
|
||||
RdcMetricFetcherImpl();
|
||||
~RdcMetricFetcherImpl();
|
||||
|
||||
rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) override;
|
||||
rdc_status_t delete_rsmi_handle(RdcFieldKey fk) override;
|
||||
rdc_status_t acquire_smi_handle(RdcFieldKey fk) override;
|
||||
rdc_status_t delete_smi_handle(RdcFieldKey fk) override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<FieldRSMIData> get_rsmi_data(RdcFieldKey key);
|
||||
std::shared_ptr<FieldSMIData> get_smi_data(RdcFieldKey key);
|
||||
|
||||
uint64_t now();
|
||||
void get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
|
||||
@@ -92,7 +92,7 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher {
|
||||
|
||||
//!< Async metric retrieval
|
||||
std::map<RdcFieldKey, MetricValue> async_metrics_;
|
||||
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>> rsmi_data_;
|
||||
std::map<RdcFieldKey, std::shared_ptr<FieldSMIData>> smi_data_;
|
||||
std::queue<MetricTask> updated_tasks_;
|
||||
std::mutex task_mutex_;
|
||||
std::future<void> updater_; // keep the future of updater
|
||||
@@ -100,8 +100,6 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher {
|
||||
std::atomic<bool> task_started_;
|
||||
};
|
||||
|
||||
rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi);
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -24,9 +24,8 @@ THE SOFTWARE.
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
@@ -35,23 +34,23 @@ class RdcSmiDiagnosticImpl {
|
||||
public:
|
||||
RdcSmiDiagnosticImpl();
|
||||
|
||||
rdc_status_t check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result);
|
||||
rdc_status_t check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
rdc_status_t check_smi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result);
|
||||
rdc_status_t check_smi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result);
|
||||
rdc_status_t check_smi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result);
|
||||
rdc_status_t check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result);
|
||||
|
||||
private:
|
||||
rdc_diag_result_t check_temperature_level(uint32_t gpu_index, rsmi_temperature_type_t type,
|
||||
rdc_diag_result_t check_temperature_level(uint32_t gpu_index, amdsmi_temperature_type_t type,
|
||||
char msg[MAX_DIAG_MSG_LENGTH],
|
||||
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
|
||||
std::string get_temperature_string(rsmi_temperature_type_t type) const;
|
||||
std::string get_temperature_string(amdsmi_temperature_type_t type) const;
|
||||
|
||||
rdc_diag_result_t check_voltage_level(uint32_t gpu_index, rsmi_voltage_type_t type,
|
||||
rdc_diag_result_t check_voltage_level(uint32_t gpu_index, amdsmi_voltage_type_t type,
|
||||
char msg[MAX_DIAG_MSG_LENGTH],
|
||||
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
|
||||
std::string get_voltage_string(rsmi_voltage_type_t type) const;
|
||||
std::string get_voltage_string(amdsmi_voltage_type_t type) const;
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<RdcSmiDiagnosticImpl> RdcSmiDiagnosticPtr;
|
||||
|
||||
@@ -30,13 +30,13 @@ THE SOFTWARE.
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc_lib/RdcCacheManager.h"
|
||||
#include "rdc_lib/RdcGroupSettings.h"
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/RdcModuleMgr.h"
|
||||
#include "rdc_lib/RdcNotification.h"
|
||||
#include "rdc_lib/RdcWatchTable.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
+6
-2
@@ -23,12 +23,16 @@ THE SOFTWARE.
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi);
|
||||
rdc_status_t Smi2RdcError(amdsmi_status_t rsmi);
|
||||
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
|
||||
amdsmi_processor_handle* processor_handle);
|
||||
amdsmi_status_t get_processor_count(uint32_t& all_processor_count);
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -63,7 +63,7 @@ class TestBase : public RdcRocrBase {
|
||||
const std::string& get_per_gpu_info() const { return per_gpu_info_; }
|
||||
|
||||
hsa_status_t FindGPUIndex(hsa_agent_t agent, void* data);
|
||||
// Return the agent by GPU index in rocm_smi
|
||||
// Return the agent by GPU index in amd_smi
|
||||
hsa_status_t get_agent_by_gpu_index(uint32_t gpu_index, hsa_agent_t* agent);
|
||||
|
||||
protected:
|
||||
|
||||
@@ -28,88 +28,6 @@ syntax = "proto3";
|
||||
|
||||
package rdc;
|
||||
|
||||
/****************************************************************************/
|
||||
/********************************** Rsmi Service ****************************/
|
||||
/****************************************************************************/
|
||||
service Rsmi {
|
||||
// RSMI ID services
|
||||
rpc GetNumDevices (GetNumDevicesRequest) returns(GetNumDevicesResponse) {}
|
||||
|
||||
// RSMI Physical Queries
|
||||
rpc GetTemperature(GetTemperatureRequest) returns(GetTemperatureResponse){}
|
||||
rpc GetFanRpms(GetFanRpmsRequest) returns(GetFanRpmsResponse){}
|
||||
rpc GetFanSpeed(GetFanSpeedRequest) returns(GetFanSpeedResponse){}
|
||||
rpc GetFanSpeedMax(GetFanSpeedMaxRequest) returns(GetFanSpeedMaxResponse){}
|
||||
}
|
||||
|
||||
/* rsmi_num_monitor_devices() */
|
||||
message GetNumDevicesRequest {
|
||||
}
|
||||
message GetNumDevicesResponse {
|
||||
uint64 val = 1;
|
||||
uint64 ret_val = 2;
|
||||
}
|
||||
|
||||
/* GetTemperature */
|
||||
/* rsmi_dev_temp_metric_get() */
|
||||
message GetTemperatureRequest {
|
||||
uint32 dv_ind = 1;
|
||||
uint32 sensor_type = 2;
|
||||
enum TemperatureMetric {
|
||||
RSMI_TEMP_CURRENT = 0;
|
||||
RSMI_TEMP_MAX = 1;
|
||||
RSMI_TEMP_MIN = 2;
|
||||
RSMI_TEMP_MAX_HYST = 3;
|
||||
RSMI_TEMP_MIN_HYST = 4;
|
||||
RSMI_TEMP_CRITICAL = 5;
|
||||
RSMI_TEMP_CRITICAL_HYST = 6;
|
||||
RSMI_TEMP_EMERGENCY = 7;
|
||||
RSMI_TEMP_EMERGENCY_HYST = 8;
|
||||
RSMI_TEMP_CRIT_MIN = 9;
|
||||
RSMI_TEMP_CRIT_MIN_HYST = 10;
|
||||
RSMI_TEMP_OFFSET = 11;
|
||||
RSMI_TEMP_LOWEST = 12;
|
||||
RSMI_TEMP_HIGHEST = 13;
|
||||
}
|
||||
TemperatureMetric metric = 3;
|
||||
}
|
||||
message GetTemperatureResponse {
|
||||
int64 temperature = 1;
|
||||
uint64 ret_val = 2;
|
||||
}
|
||||
|
||||
/* GetFanRpms */
|
||||
/* rsmi_dev_fan_rpms_get() */
|
||||
message GetFanRpmsRequest {
|
||||
uint32 dv_ind = 1;
|
||||
uint32 sensor_ind = 2;
|
||||
}
|
||||
message GetFanRpmsResponse {
|
||||
int64 rpms = 1;
|
||||
uint64 ret_val = 2;
|
||||
}
|
||||
/* GetFanSpeed */
|
||||
/* rsmi_dev_fan_speed_get() */
|
||||
message GetFanSpeedRequest {
|
||||
uint32 dv_ind = 1;
|
||||
uint32 sensor_ind = 2;
|
||||
}
|
||||
message GetFanSpeedResponse {
|
||||
int64 speed = 1;
|
||||
uint64 ret_val = 2;
|
||||
}
|
||||
|
||||
/* GetFanSpeedMax */
|
||||
/* rsmi_dev_fan_speed_max_get() */
|
||||
message GetFanSpeedMaxRequest {
|
||||
uint32 dv_ind = 1;
|
||||
uint32 sensor_ind = 2;
|
||||
}
|
||||
message GetFanSpeedMaxResponse {
|
||||
uint64 max_speed = 1;
|
||||
uint64 ret_val = 2;
|
||||
}
|
||||
|
||||
/****************************************************************************/
|
||||
/********************************** RdcAdmin Service ************************/
|
||||
/****************************************************************************/
|
||||
|
||||
@@ -37,13 +37,13 @@ class PrometheusReader(RdcReader):
|
||||
if enable_pci_id == True:
|
||||
try:
|
||||
import sys, os
|
||||
# Relative path of rocm_smi to map gpu index to PCI id
|
||||
# change smi_lib_path if rocm_smi is installed in a different folder
|
||||
# Relative path of amd_smi to map gpu index to PCI id
|
||||
# change smi_lib_path if amd_smi is installed in a different folder
|
||||
smi_lib_relative_path = "../../bin"
|
||||
smi_lib_path = os.path.join(sys.path[0], smi_lib_relative_path)
|
||||
if os.path.exists(smi_lib_path+"/rocm_smi.py"):
|
||||
if os.path.exists(smi_lib_path+"/amd_smi.py"):
|
||||
sys.path.append(smi_lib_path)
|
||||
from rocm_smi import getBus, initializeRsmi
|
||||
from amd_smi import getBus, initializeRsmi
|
||||
initializeRsmi()
|
||||
# Map between gpu indexes and PCIe bus addresses
|
||||
self.index_to_bus_addr = {}
|
||||
|
||||
@@ -46,8 +46,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
|
||||
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
|
||||
message("---------SMI Lib Dir: " ${SMI_LIB_DIR})
|
||||
message("---------SMI Inc Dir: " ${SMI_INC_DIR})
|
||||
message("")
|
||||
|
||||
|
||||
@@ -82,8 +82,8 @@ set(CPACK_PACKAGE_FILE_NAME "${RDC_PACKAGE}-${VERSION_STRING}")
|
||||
set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core")
|
||||
|
||||
# link RSMI
|
||||
link_directories(${RSMI_LIB_DIR})
|
||||
# link SMI
|
||||
link_directories(${SMI_LIB_DIR})
|
||||
|
||||
# add librdc_bootstrap.so
|
||||
add_subdirectory(bootstrap)
|
||||
|
||||
@@ -28,7 +28,7 @@ target_include_directories(${BOOTSTRAP_LIB} PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${COMMON_DIR}"
|
||||
"${RSMI_INC_DIR}"
|
||||
"${SMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include")
|
||||
|
||||
target_include_directories(${BOOTSTRAP_LIB}
|
||||
|
||||
@@ -26,7 +26,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
|
||||
"${SRC_DIR}/RdcSmiLib.cc"
|
||||
"${SRC_DIR}/RdcTelemetryModule.cc"
|
||||
"${SRC_DIR}/RdcWatchTableImpl.cc"
|
||||
"${SRC_DIR}/RsmiUtils.cc")
|
||||
"${SRC_DIR}/SmiUtils.cc")
|
||||
|
||||
# TODO: remove all headers? Will just dir be ok after install?
|
||||
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
|
||||
@@ -59,16 +59,16 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
|
||||
"${INC_DIR}/impl/RdcSmiLib.h"
|
||||
"${INC_DIR}/impl/RdcTelemetryModule.h"
|
||||
"${INC_DIR}/impl/RdcWatchTableImpl.h"
|
||||
"${INC_DIR}/impl/RsmiUtils.h")
|
||||
"${INC_DIR}/impl/SmiUtils.h")
|
||||
|
||||
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
|
||||
|
||||
add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST})
|
||||
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap)
|
||||
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread amd_smi cap)
|
||||
target_include_directories(${RDC_LIB} PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${RSMI_INC_DIR}")
|
||||
"${SMI_INC_DIR}")
|
||||
|
||||
# Set the VERSION and SOVERSION values
|
||||
set_property(TARGET ${RDC_LIB} PROPERTY
|
||||
|
||||
@@ -23,6 +23,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "common/rdc_fields_supported.h"
|
||||
#include "rdc_lib/RdcException.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
@@ -35,30 +36,29 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/impl/RdcNotificationImpl.h"
|
||||
#include "rdc_lib/impl/RdcWatchTableImpl.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace {
|
||||
// call rsmi_init when the library is loaded
|
||||
// and rsmi_shut_down when the library is unloaded.
|
||||
class rsmi_initializer {
|
||||
rsmi_initializer() {
|
||||
// Make sure rsmi will not be initialized multiple times
|
||||
rsmi_shut_down();
|
||||
rsmi_status_t rsmi_ret = rsmi_init(0);
|
||||
if (rsmi_ret != RSMI_STATUS_SUCCESS) {
|
||||
throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "RSMI initialize fail");
|
||||
// call amdsmi_init when the library is loaded
|
||||
// and amdsmi_shut_down when the library is unloaded.
|
||||
class smi_initializer {
|
||||
smi_initializer() {
|
||||
// Make sure smi will not be initialized multiple times
|
||||
amdsmi_shut_down();
|
||||
amdsmi_status_t ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail");
|
||||
}
|
||||
}
|
||||
~rsmi_initializer() { rsmi_shut_down(); }
|
||||
~smi_initializer() { amdsmi_shut_down(); }
|
||||
|
||||
public:
|
||||
static rsmi_initializer& getInstance() {
|
||||
static rsmi_initializer instance;
|
||||
static smi_initializer& getInstance() {
|
||||
static smi_initializer instance;
|
||||
return instance;
|
||||
}
|
||||
};
|
||||
|
||||
static rsmi_initializer& in = rsmi_initializer::getInstance();
|
||||
static smi_initializer& in = smi_initializer::getInstance();
|
||||
} // namespace
|
||||
|
||||
amd::rdc::RdcHandler* make_handler(rdc_operation_mode_t op_mode) {
|
||||
|
||||
@@ -25,42 +25,39 @@ THE SOFTWARE.
|
||||
#include <string.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono> //NOLINT
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "common/rdc_capabilities.h"
|
||||
#include "common/rdc_fields_supported.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RsmiUtils.h"
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
static const std::unordered_map<rdc_field_t, rsmi_event_type_t> rdc_evnt_2_rsmi_field = {
|
||||
{RDC_EVNT_XGMI_0_NOP_TX, RSMI_EVNT_XGMI_0_NOP_TX},
|
||||
{RDC_EVNT_XGMI_0_REQ_TX, RSMI_EVNT_XGMI_0_REQUEST_TX},
|
||||
{RDC_EVNT_XGMI_0_RESP_TX, RSMI_EVNT_XGMI_0_RESPONSE_TX},
|
||||
{RDC_EVNT_XGMI_0_BEATS_TX, RSMI_EVNT_XGMI_0_BEATS_TX},
|
||||
{RDC_EVNT_XGMI_1_NOP_TX, RSMI_EVNT_XGMI_1_NOP_TX},
|
||||
{RDC_EVNT_XGMI_1_REQ_TX, RSMI_EVNT_XGMI_1_REQUEST_TX},
|
||||
{RDC_EVNT_XGMI_1_RESP_TX, RSMI_EVNT_XGMI_1_RESPONSE_TX},
|
||||
{RDC_EVNT_XGMI_1_BEATS_TX, RSMI_EVNT_XGMI_1_BEATS_TX},
|
||||
static const std::unordered_map<rdc_field_t, amdsmi_event_type_t> rdc_evnt_2_smi_field = {
|
||||
{RDC_EVNT_XGMI_0_NOP_TX, AMDSMI_EVNT_XGMI_0_NOP_TX},
|
||||
{RDC_EVNT_XGMI_0_REQ_TX, AMDSMI_EVNT_XGMI_0_REQUEST_TX},
|
||||
{RDC_EVNT_XGMI_0_RESP_TX, AMDSMI_EVNT_XGMI_0_RESPONSE_TX},
|
||||
{RDC_EVNT_XGMI_0_BEATS_TX, AMDSMI_EVNT_XGMI_0_BEATS_TX},
|
||||
{RDC_EVNT_XGMI_1_NOP_TX, AMDSMI_EVNT_XGMI_1_NOP_TX},
|
||||
{RDC_EVNT_XGMI_1_REQ_TX, AMDSMI_EVNT_XGMI_1_REQUEST_TX},
|
||||
{RDC_EVNT_XGMI_1_RESP_TX, AMDSMI_EVNT_XGMI_1_RESPONSE_TX},
|
||||
{RDC_EVNT_XGMI_1_BEATS_TX, AMDSMI_EVNT_XGMI_1_BEATS_TX},
|
||||
|
||||
{RDC_EVNT_XGMI_0_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_0},
|
||||
{RDC_EVNT_XGMI_1_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_1},
|
||||
{RDC_EVNT_XGMI_2_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_2},
|
||||
{RDC_EVNT_XGMI_3_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_3},
|
||||
{RDC_EVNT_XGMI_4_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_4},
|
||||
{RDC_EVNT_XGMI_5_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_5},
|
||||
{RDC_EVNT_XGMI_0_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_0},
|
||||
{RDC_EVNT_XGMI_1_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_1},
|
||||
{RDC_EVNT_XGMI_2_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_2},
|
||||
{RDC_EVNT_XGMI_3_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_3},
|
||||
{RDC_EVNT_XGMI_4_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_4},
|
||||
{RDC_EVNT_XGMI_5_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_5},
|
||||
};
|
||||
|
||||
RdcMetricFetcherImpl::RdcMetricFetcherImpl() {
|
||||
task_started_ = true;
|
||||
|
||||
RdcMetricFetcherImpl::RdcMetricFetcherImpl() : task_started_(true) {
|
||||
// kick off another thread for async fetch
|
||||
updater_ = std::async(std::launch::async, [this]() {
|
||||
while (task_started_) {
|
||||
@@ -95,37 +92,41 @@ uint64_t RdcMetricFetcherImpl::now() {
|
||||
|
||||
void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, rdc_field_t field_id,
|
||||
rdc_field_value* value) {
|
||||
rsmi_status_t err = RSMI_STATUS_SUCCESS;
|
||||
uint64_t correctable_err = 0;
|
||||
uint64_t uncorrectable_err = 0;
|
||||
rsmi_ras_err_state_t err_state;
|
||||
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
|
||||
uint64_t correctable_count = 0;
|
||||
uint64_t uncorrectable_count = 0;
|
||||
amdsmi_ras_err_state_t err_state;
|
||||
|
||||
amdsmi_processor_handle processor_handle;
|
||||
err = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
|
||||
if (!value) {
|
||||
return;
|
||||
}
|
||||
for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; b = b * 2) {
|
||||
err = rsmi_dev_ecc_status_get(gpu_index, static_cast<rsmi_gpu_block_t>(b), &err_state);
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
for (uint32_t b = AMDSMI_GPU_BLOCK_FIRST; b <= AMDSMI_GPU_BLOCK_LAST; b = b * 2) {
|
||||
err =
|
||||
amdsmi_get_gpu_ecc_status(processor_handle, static_cast<amdsmi_gpu_block_t>(b), &err_state);
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_INFO, "Get the ecc Status error " << b << ":" << err);
|
||||
continue;
|
||||
}
|
||||
|
||||
rsmi_error_count_t ec;
|
||||
err = rsmi_dev_ecc_count_get(gpu_index, static_cast<rsmi_gpu_block_t>(b), &ec);
|
||||
amdsmi_error_count_t ec;
|
||||
err = amdsmi_get_gpu_ecc_count(processor_handle, static_cast<amdsmi_gpu_block_t>(b), &ec);
|
||||
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
correctable_err += ec.correctable_err;
|
||||
uncorrectable_err += ec.uncorrectable_err;
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
correctable_count += ec.correctable_count;
|
||||
uncorrectable_count += ec.uncorrectable_count;
|
||||
}
|
||||
}
|
||||
|
||||
value->status = RSMI_STATUS_SUCCESS;
|
||||
value->status = AMDSMI_STATUS_SUCCESS;
|
||||
value->type = INTEGER;
|
||||
if (field_id == RDC_FI_ECC_CORRECT_TOTAL) {
|
||||
value->value.l_int = correctable_err;
|
||||
value->value.l_int = correctable_count;
|
||||
}
|
||||
if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) {
|
||||
value->value.l_int = uncorrectable_err;
|
||||
value->value.l_int = uncorrectable_count;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,7 +167,10 @@ bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, rdc_fie
|
||||
void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
|
||||
uint32_t gpu_index = key.first;
|
||||
uint64_t sent, received, max_pkt_sz;
|
||||
rsmi_status_t ret;
|
||||
amdsmi_status_t ret;
|
||||
|
||||
amdsmi_processor_handle processor_handle;
|
||||
ret = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
|
||||
// Return if the cache does not expire yet
|
||||
do {
|
||||
@@ -178,7 +182,7 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
|
||||
}
|
||||
} while (0);
|
||||
|
||||
ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz);
|
||||
ret = amdsmi_get_gpu_pci_throughput(processor_handle, &sent, &received, &max_pkt_sz);
|
||||
|
||||
uint64_t curTime = now();
|
||||
MetricValue value;
|
||||
@@ -207,12 +211,12 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
|
||||
rx_metric->second.value.status = ret;
|
||||
rx_metric->second.value.ts = curTime;
|
||||
|
||||
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
RDC_LOG(RDC_ERROR, "PCIe throughput not supported on GPU " << gpu_index);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
if (ret == AMDSMI_STATUS_SUCCESS) {
|
||||
rx_metric->second.value.value.l_int = received;
|
||||
tx_metric->second.value.value.l_int = sent;
|
||||
RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":"
|
||||
@@ -226,16 +230,16 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
|
||||
std::vector<rdc_gpu_field_value_t>& results) { // NOLINT
|
||||
const std::set<rdc_field_t> rdc_bulk_fields = {
|
||||
RDC_FI_GPU_CLOCK, // current_gfxclk * 1000000
|
||||
RDC_FI_MEMORY_TEMP, // temperature_mem * 1000
|
||||
RDC_FI_GPU_TEMP, // temperature_edge * 1000
|
||||
RDC_FI_POWER_USAGE, // average_socket_power * 1000000
|
||||
RDC_FI_MEMORY_TEMP, // temperature_mem
|
||||
RDC_FI_GPU_TEMP, // temperature_edge
|
||||
RDC_FI_POWER_USAGE, // average_socket_power
|
||||
RDC_FI_GPU_UTIL // average_gfx_activity
|
||||
};
|
||||
|
||||
// To avoid calling the bulk API every time even when it is not supported,
|
||||
// a static is used to cache the status of the last attempt.
|
||||
static rsmi_status_t rs = RSMI_STATUS_SUCCESS;
|
||||
if (rs != RSMI_STATUS_SUCCESS) {
|
||||
static amdsmi_status_t rs = AMDSMI_STATUS_SUCCESS;
|
||||
if (rs != AMDSMI_STATUS_SUCCESS) {
|
||||
results.clear();
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
@@ -248,13 +252,16 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
|
||||
}
|
||||
}
|
||||
|
||||
// Call the rocm_smi_lib API to bulk fetch the data
|
||||
// Call the amd_smi_lib API to bulk fetch the data
|
||||
auto cur_time = now();
|
||||
auto ite = bulk_fields.begin();
|
||||
for (; ite != bulk_fields.end(); ite++) {
|
||||
rsmi_gpu_metrics_t gpu_metrics;
|
||||
rs = rsmi_dev_gpu_metrics_info_get(ite->first, &gpu_metrics);
|
||||
if (rs != RSMI_STATUS_SUCCESS) {
|
||||
amdsmi_gpu_metrics_t gpu_metrics;
|
||||
amdsmi_processor_handle processor_handle;
|
||||
rs = get_processor_handle_from_id(ite->first, &processor_handle);
|
||||
|
||||
rs = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics);
|
||||
if (rs != AMDSMI_STATUS_SUCCESS) {
|
||||
results.clear();
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
@@ -264,38 +271,46 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
|
||||
value.gpu_index = ite->first;
|
||||
value.field_value.field_id = field_id;
|
||||
value.field_value.type = INTEGER;
|
||||
value.field_value.status = RSMI_STATUS_SUCCESS;
|
||||
value.field_value.status = AMDSMI_STATUS_SUCCESS;
|
||||
value.field_value.ts = cur_time;
|
||||
|
||||
switch (field_id) {
|
||||
case RDC_FI_GPU_CLOCK: // current_gfxclk * 1000000
|
||||
value.field_value.value.l_int =
|
||||
static_cast<int64_t>(gpu_metrics.current_gfxclk * 1000000);
|
||||
static_cast<int64_t>(gpu_metrics.current_gfxclk) * 1000000;
|
||||
break;
|
||||
case RDC_FI_MEMORY_TEMP: // temperature_mem * 1000
|
||||
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_mem * 1000);
|
||||
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_mem) * 1000;
|
||||
break;
|
||||
case RDC_FI_GPU_TEMP: // temperature_edge * 1000
|
||||
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_edge * 1000);
|
||||
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_edge) * 1000;
|
||||
break;
|
||||
case RDC_FI_POWER_USAGE: // average_socket_power * 1000000
|
||||
value.field_value.value.l_int =
|
||||
static_cast<int64_t>(gpu_metrics.average_socket_power * 1000000);
|
||||
case RDC_FI_POWER_USAGE: // average_socket_power
|
||||
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.average_socket_power);
|
||||
// Use current_socket_power if average_socket_power is not available
|
||||
if (value.field_value.value.l_int == 65535) {
|
||||
RDC_LOG(RDC_DEBUG, "Bulk fetch "
|
||||
<< value.gpu_index << ":"
|
||||
<< "RDC_FI_POWER_USAGE fallback to current_socket_power.");
|
||||
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.current_socket_power);
|
||||
}
|
||||
|
||||
// Ignore if the power is 0, which will fall back to non-bulk fetch.
|
||||
if (value.field_value.value.l_int == 0) {
|
||||
RDC_LOG(RDC_DEBUG, "Bulk fetch " << value.gpu_index << ":"
|
||||
<< "RDC_FI_POWER_USAGE fallback to regular way.");
|
||||
continue;
|
||||
}
|
||||
value.field_value.value.l_int *= 1000000;
|
||||
break;
|
||||
case RDC_FI_GPU_UTIL: // average_gfx_activity
|
||||
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.average_gfx_activity);
|
||||
break;
|
||||
default:
|
||||
value.field_value.status = RSMI_STATUS_NOT_SUPPORTED;
|
||||
value.field_value.status = AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
break;
|
||||
}
|
||||
if (value.field_value.status == RSMI_STATUS_SUCCESS) {
|
||||
if (value.field_value.status == AMDSMI_STATUS_SUCCESS) {
|
||||
results.push_back(value);
|
||||
}
|
||||
}
|
||||
@@ -304,20 +319,23 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
static const uint64_t kGig = 1000000000;
|
||||
constexpr double kGig = 1000000000.0;
|
||||
|
||||
rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
|
||||
rdc_field_value* value) {
|
||||
if (!value) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
uint64_t i64 = 0;
|
||||
rsmi_temperature_type_t sensor_type;
|
||||
rsmi_clk_type_t clk_type;
|
||||
bool async_fetching = false;
|
||||
RdcFieldKey f_key(gpu_index, field_id);
|
||||
std::shared_ptr<FieldRSMIData> rsmi_data;
|
||||
double coll_time_sec;
|
||||
std::shared_ptr<FieldSMIData> smi_data;
|
||||
|
||||
amdsmi_processor_handle processor_handle = {};
|
||||
|
||||
amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_index << " error: " << ret);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
if (!is_field_valid(field_id)) {
|
||||
RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " which is not supported");
|
||||
@@ -326,101 +344,121 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
|
||||
value->ts = now();
|
||||
value->field_id = field_id;
|
||||
value->status = RSMI_STATUS_NOT_SUPPORTED;
|
||||
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
|
||||
auto read_rsmi_counter = [&](void) {
|
||||
rsmi_data = get_rsmi_data(f_key);
|
||||
if (rsmi_data == nullptr) {
|
||||
value->status = RSMI_STATUS_NOT_SUPPORTED;
|
||||
auto read_smi_counter = [&](void) {
|
||||
RdcFieldKey f_key(gpu_index, field_id);
|
||||
smi_data = get_smi_data(f_key);
|
||||
if (smi_data == nullptr) {
|
||||
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
return;
|
||||
}
|
||||
|
||||
value->status = rsmi_counter_read(rsmi_data->evt_handle, &rsmi_data->counter_val);
|
||||
value->value.l_int = rsmi_data->counter_val.value;
|
||||
value->status = amdsmi_gpu_read_counter(smi_data->evt_handle, &smi_data->counter_val);
|
||||
value->value.l_int = smi_data->counter_val.value;
|
||||
value->type = INTEGER;
|
||||
};
|
||||
|
||||
switch (field_id) {
|
||||
case RDC_FI_GPU_MEMORY_USAGE:
|
||||
value->status = rsmi_dev_memory_usage_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64);
|
||||
case RDC_FI_GPU_MEMORY_USAGE: {
|
||||
uint64_t u64 = 0;
|
||||
value->status = amdsmi_get_gpu_memory_usage(processor_handle, AMDSMI_MEM_TYPE_VRAM, &u64);
|
||||
value->type = INTEGER;
|
||||
if (value->status == RSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(i64);
|
||||
}
|
||||
break;
|
||||
case RDC_FI_GPU_MEMORY_TOTAL:
|
||||
value->status = rsmi_dev_memory_total_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64);
|
||||
value->type = INTEGER;
|
||||
if (value->status == RSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(i64);
|
||||
}
|
||||
break;
|
||||
case RDC_FI_GPU_COUNT:
|
||||
uint32_t num_gpu;
|
||||
value->status = rsmi_num_monitor_devices(&num_gpu);
|
||||
value->type = INTEGER;
|
||||
if (value->status == RSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(num_gpu);
|
||||
}
|
||||
break;
|
||||
case RDC_FI_POWER_USAGE:
|
||||
{
|
||||
RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER;
|
||||
// below call should handle both socket power and regular power
|
||||
value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type);
|
||||
value->type = INTEGER;
|
||||
if (value->status == RSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(i64);
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(u64);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RDC_FI_GPU_CLOCK:
|
||||
case RDC_FI_MEM_CLOCK:
|
||||
rsmi_frequencies_t f;
|
||||
clk_type = RSMI_CLK_TYPE_SYS;
|
||||
if (field_id == RDC_FI_MEM_CLOCK) {
|
||||
clk_type = RSMI_CLK_TYPE_MEM;
|
||||
}
|
||||
value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, clk_type, &f);
|
||||
case RDC_FI_GPU_MEMORY_TOTAL: {
|
||||
uint64_t u64 = 0;
|
||||
value->status = amdsmi_get_gpu_memory_total(processor_handle, AMDSMI_MEM_TYPE_VRAM, &u64);
|
||||
value->type = INTEGER;
|
||||
if (value->status == RSMI_STATUS_SUCCESS) {
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(u64);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RDC_FI_GPU_COUNT: {
|
||||
uint32_t processor_count = 0;
|
||||
// amdsmi is initialized in AMDSMI_INIT_AMD_GPUS mode -> returned sockets are GPUs
|
||||
value->status = get_processor_count(processor_count);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(processor_count);
|
||||
}
|
||||
} break;
|
||||
case RDC_FI_POWER_USAGE: {
|
||||
amdsmi_power_info_t power_info = {};
|
||||
value->status = amdsmi_get_power_info(processor_handle, &power_info);
|
||||
value->type = INTEGER;
|
||||
if (value->status != AMDSMI_STATUS_SUCCESS) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Use current_socket_power if average_socket_power is not available
|
||||
if (power_info.average_socket_power != 65535) {
|
||||
value->value.l_int = static_cast<int64_t>(power_info.average_socket_power) * 1000 * 1000;
|
||||
} else {
|
||||
value->value.l_int = static_cast<int64_t>(power_info.current_socket_power) * 1000 * 1000;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RDC_FI_GPU_CLOCK:
|
||||
case RDC_FI_MEM_CLOCK: {
|
||||
amdsmi_clk_type_t clk_type = CLK_TYPE_SYS;
|
||||
if (field_id == RDC_FI_MEM_CLOCK) {
|
||||
clk_type = CLK_TYPE_MEM;
|
||||
}
|
||||
amdsmi_frequencies_t f = {};
|
||||
value->status = amdsmi_get_clk_freq(processor_handle, clk_type, &f);
|
||||
value->type = INTEGER;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = f.frequency[f.current];
|
||||
}
|
||||
break;
|
||||
case RDC_FI_GPU_UTIL:
|
||||
uint32_t busy_percent;
|
||||
value->status = rsmi_dev_busy_percent_get(gpu_index, &busy_percent);
|
||||
}
|
||||
case RDC_FI_GPU_UTIL: {
|
||||
amdsmi_engine_usage_t engine_usage;
|
||||
value->status = amdsmi_get_gpu_activity(processor_handle, &engine_usage);
|
||||
value->type = INTEGER;
|
||||
if (value->status == RSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(busy_percent);
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = static_cast<int64_t>(engine_usage.gfx_activity);
|
||||
}
|
||||
break;
|
||||
case RDC_FI_DEV_NAME:
|
||||
value->status = rsmi_dev_name_get(gpu_index, value->value.str, RDC_MAX_STR_LENGTH);
|
||||
}
|
||||
case RDC_FI_DEV_NAME: {
|
||||
amdsmi_asic_info_t asic_info;
|
||||
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
|
||||
value->type = STRING;
|
||||
break;
|
||||
case RDC_FI_GPU_TEMP:
|
||||
case RDC_FI_MEMORY_TEMP:
|
||||
int64_t val_i64;
|
||||
sensor_type = RSMI_TEMP_TYPE_EDGE;
|
||||
if (field_id == RDC_FI_MEMORY_TEMP) {
|
||||
sensor_type = RSMI_TEMP_TYPE_MEMORY;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
memcpy(value->value.str, asic_info.market_name, sizeof(asic_info.market_name));
|
||||
}
|
||||
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64);
|
||||
break;
|
||||
}
|
||||
case RDC_FI_GPU_TEMP:
|
||||
case RDC_FI_MEMORY_TEMP: {
|
||||
int64_t i64 = 0;
|
||||
amdsmi_temperature_type_t sensor_type = TEMPERATURE_TYPE_EDGE;
|
||||
if (field_id == RDC_FI_MEMORY_TEMP) {
|
||||
sensor_type = TEMPERATURE_TYPE_VRAM;
|
||||
}
|
||||
value->status =
|
||||
amdsmi_get_temp_metric(processor_handle, sensor_type, AMDSMI_TEMP_CURRENT, &i64);
|
||||
|
||||
// fallback to hotspot temperature as some card may not have edge temperature.
|
||||
if (sensor_type == RSMI_TEMP_TYPE_EDGE
|
||||
&& value->status == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
sensor_type = RSMI_TEMP_TYPE_JUNCTION;
|
||||
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type,
|
||||
RSMI_TEMP_CURRENT, &val_i64);
|
||||
if (sensor_type == TEMPERATURE_TYPE_EDGE && value->status == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
sensor_type = TEMPERATURE_TYPE_JUNCTION;
|
||||
value->status =
|
||||
amdsmi_get_temp_metric(processor_handle, sensor_type, AMDSMI_TEMP_CURRENT, &i64);
|
||||
}
|
||||
|
||||
value->type = INTEGER;
|
||||
if (value->status == RSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = val_i64;
|
||||
if (value->status == AMDSMI_STATUS_SUCCESS) {
|
||||
value->value.l_int = i64 * 1000;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RDC_FI_ECC_CORRECT_TOTAL:
|
||||
case RDC_FI_ECC_UNCORRECT_TOTAL:
|
||||
get_ecc_error(gpu_index, field_id, value);
|
||||
@@ -437,31 +475,33 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
case RDC_EVNT_XGMI_1_REQ_TX:
|
||||
case RDC_EVNT_XGMI_1_RESP_TX:
|
||||
case RDC_EVNT_XGMI_1_BEATS_TX:
|
||||
read_rsmi_counter();
|
||||
read_smi_counter();
|
||||
break;
|
||||
case RDC_EVNT_XGMI_0_THRPUT:
|
||||
case RDC_EVNT_XGMI_1_THRPUT:
|
||||
case RDC_EVNT_XGMI_2_THRPUT:
|
||||
case RDC_EVNT_XGMI_3_THRPUT:
|
||||
case RDC_EVNT_XGMI_4_THRPUT:
|
||||
case RDC_EVNT_XGMI_5_THRPUT:
|
||||
read_rsmi_counter();
|
||||
case RDC_EVNT_XGMI_5_THRPUT: {
|
||||
double coll_time_sec = 0;
|
||||
read_smi_counter();
|
||||
if (value->status == RDC_ST_OK) {
|
||||
if (rsmi_data->counter_val.time_running > 0) {
|
||||
coll_time_sec = static_cast<float>(rsmi_data->counter_val.time_running) / kGig;
|
||||
if (smi_data->counter_val.time_running > 0) {
|
||||
coll_time_sec = static_cast<double>(smi_data->counter_val.time_running) / kGig;
|
||||
value->value.l_int = (value->value.l_int * 32) / coll_time_sec;
|
||||
} else {
|
||||
value->value.l_int = 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
int64_t latency = now() - value->ts;
|
||||
if (value->status != RSMI_STATUS_SUCCESS) {
|
||||
if (value->status != AMDSMI_STATUS_SUCCESS) {
|
||||
if (async_fetching) { //!< Async fetching is not an error
|
||||
RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id));
|
||||
} else {
|
||||
@@ -480,42 +520,45 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
|
||||
<< value->value.str << ", latency " << latency);
|
||||
}
|
||||
|
||||
return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
|
||||
return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
|
||||
}
|
||||
|
||||
std::shared_ptr<FieldRSMIData> RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) {
|
||||
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>>::iterator r_info = rsmi_data_.find(key);
|
||||
std::shared_ptr<FieldSMIData> RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) {
|
||||
std::map<RdcFieldKey, std::shared_ptr<FieldSMIData>>::iterator r_info = smi_data_.find(key);
|
||||
|
||||
if (r_info != rsmi_data_.end()) {
|
||||
if (r_info != smi_data_.end()) {
|
||||
return r_info->second;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp,
|
||||
rsmi_event_handle_t* handle) {
|
||||
rsmi_status_t ret;
|
||||
static rdc_status_t init_smi_counter(RdcFieldKey fk, amdsmi_event_group_t grp,
|
||||
amdsmi_event_handle_t* handle) {
|
||||
amdsmi_status_t ret;
|
||||
uint32_t counters_available;
|
||||
uint32_t dv_ind = fk.first;
|
||||
rdc_field_t f = fk.second;
|
||||
|
||||
assert(handle != nullptr);
|
||||
|
||||
ret = rsmi_dev_counter_group_supported(dv_ind, grp);
|
||||
amdsmi_processor_handle processor_handle;
|
||||
ret = get_processor_handle_from_id(dv_ind, &processor_handle);
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return Rsmi2RdcError(ret);
|
||||
ret = amdsmi_gpu_counter_group_supported(processor_handle, grp);
|
||||
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
ret = rsmi_counter_available_counters_get(dv_ind, grp, &counters_available);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return Rsmi2RdcError(ret);
|
||||
ret = amdsmi_get_gpu_available_counters(processor_handle, grp, &counters_available);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
if (counters_available == 0) {
|
||||
return RDC_ST_INSUFF_RESOURCES;
|
||||
}
|
||||
|
||||
rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f);
|
||||
amdsmi_event_type_t evt = rdc_evnt_2_smi_field.at(f);
|
||||
|
||||
// Temporarily get DAC capability
|
||||
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
|
||||
@@ -525,12 +568,12 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp,
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
ret = rsmi_dev_counter_create(dv_ind, evt, handle);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return Rsmi2RdcError(ret);
|
||||
ret = amdsmi_gpu_create_counter(processor_handle, evt, handle);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr);
|
||||
ret = amdsmi_gpu_control_counter(*handle, AMDSMI_CNTR_CMD_START, nullptr);
|
||||
|
||||
// Release DAC capability
|
||||
sc.Relinquish();
|
||||
@@ -540,11 +583,11 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp,
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
return Rsmi2RdcError(ret);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
|
||||
rsmi_status_t ret;
|
||||
rdc_status_t RdcMetricFetcherImpl::delete_smi_handle(RdcFieldKey fk) {
|
||||
amdsmi_status_t ret;
|
||||
|
||||
switch (fk.second) {
|
||||
case RDC_EVNT_XGMI_0_NOP_TX:
|
||||
@@ -561,52 +604,53 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
|
||||
case RDC_EVNT_XGMI_3_THRPUT:
|
||||
case RDC_EVNT_XGMI_4_THRPUT:
|
||||
case RDC_EVNT_XGMI_5_THRPUT: {
|
||||
rsmi_event_handle_t h;
|
||||
if (rsmi_data_.find(fk) == rsmi_data_.end()) {
|
||||
amdsmi_event_handle_t h;
|
||||
if (smi_data_.find(fk) == smi_data_.end()) {
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
h = rsmi_data_[fk]->evt_handle;
|
||||
h = smi_data_[fk]->evt_handle;
|
||||
|
||||
// Stop counting.
|
||||
ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
rsmi_data_.erase(fk);
|
||||
ret = amdsmi_gpu_control_counter(h, AMDSMI_CNTR_CMD_STOP, nullptr);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
smi_data_.erase(fk);
|
||||
|
||||
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Rsmi2RdcError(ret));
|
||||
return Rsmi2RdcError(ret);
|
||||
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Smi2RdcError(ret));
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
// Release all resources (e.g., counter and memory resources) associated
|
||||
// with evnt_handle.
|
||||
ret = rsmi_dev_counter_destroy(h);
|
||||
ret = amdsmi_gpu_destroy_counter(h);
|
||||
|
||||
rsmi_data_.erase(fk);
|
||||
return Rsmi2RdcError(ret);
|
||||
smi_data_.erase(fk);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
default:
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
|
||||
rdc_status_t RdcMetricFetcherImpl::acquire_smi_handle(RdcFieldKey fk) {
|
||||
rdc_status_t ret = RDC_ST_OK;
|
||||
|
||||
auto get_evnt_handle = [&](rsmi_event_group_t grp) {
|
||||
rsmi_event_handle_t handle;
|
||||
auto get_evnt_handle = [&](amdsmi_event_group_t grp) {
|
||||
amdsmi_event_handle_t handle;
|
||||
rdc_status_t result;
|
||||
|
||||
if (get_rsmi_data(fk) != nullptr) {
|
||||
if (get_smi_data(fk) != nullptr) {
|
||||
// This event has already been initialized.
|
||||
return RDC_ST_ALREADY_EXIST;
|
||||
}
|
||||
|
||||
result = init_rsmi_counter(fk, grp, &handle);
|
||||
result = init_smi_counter(fk, grp, &handle);
|
||||
if (result != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result);
|
||||
RDC_LOG(RDC_ERROR, "Failed to init SMI counter. Return:" << result);
|
||||
return result;
|
||||
}
|
||||
auto fsh = std::shared_ptr<FieldRSMIData>(new FieldRSMIData);
|
||||
auto fsh = std::shared_ptr<FieldSMIData>(new FieldSMIData);
|
||||
|
||||
if (fsh == nullptr) {
|
||||
return RDC_ST_INSUFF_RESOURCES;
|
||||
@@ -614,7 +658,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
|
||||
|
||||
fsh->evt_handle = handle;
|
||||
|
||||
rsmi_data_[fk] = fsh;
|
||||
smi_data_[fk] = fsh;
|
||||
|
||||
return RDC_ST_OK;
|
||||
};
|
||||
@@ -628,7 +672,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
|
||||
case RDC_EVNT_XGMI_1_REQ_TX:
|
||||
case RDC_EVNT_XGMI_1_RESP_TX:
|
||||
case RDC_EVNT_XGMI_1_BEATS_TX:
|
||||
ret = get_evnt_handle(RSMI_EVNT_GRP_XGMI);
|
||||
ret = get_evnt_handle(AMDSMI_EVNT_GRP_XGMI);
|
||||
break;
|
||||
|
||||
case RDC_EVNT_XGMI_0_THRPUT:
|
||||
@@ -637,7 +681,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
|
||||
case RDC_EVNT_XGMI_3_THRPUT:
|
||||
case RDC_EVNT_XGMI_4_THRPUT:
|
||||
case RDC_EVNT_XGMI_5_THRPUT:
|
||||
ret = get_evnt_handle(RSMI_EVNT_GRP_XGMI_DATA_OUT);
|
||||
ret = get_evnt_handle(AMDSMI_EVNT_GRP_XGMI_DATA_OUT);
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
@@ -21,6 +21,7 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
|
||||
@@ -24,35 +24,34 @@ THE SOFTWARE.
|
||||
#include <assert.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <ctime>
|
||||
#include <mutex> // NOLINT
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "common/rdc_capabilities.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
#include "rdc_lib/impl/RdcTelemetryModule.h"
|
||||
#include "rdc_lib/impl/RsmiUtils.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
static std::unordered_map<rdc_field_t, rsmi_evt_notification_type_t> rdc_2_rsmi_event_notif_map = {
|
||||
{RDC_EVNT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_VMFAULT},
|
||||
{RDC_EVNT_NOTIF_FIRST, RSMI_EVT_NOTIF_FIRST},
|
||||
{RDC_EVNT_NOTIF_THERMAL_THROTTLE, RSMI_EVT_NOTIF_THERMAL_THROTTLE},
|
||||
{RDC_EVNT_NOTIF_PRE_RESET, RSMI_EVT_NOTIF_GPU_PRE_RESET},
|
||||
{RDC_EVNT_NOTIF_POST_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET},
|
||||
static std::unordered_map<rdc_field_t, amdsmi_evt_notification_type_t> rdc_2_smi_event_notif_map = {
|
||||
{RDC_EVNT_NOTIF_VMFAULT, AMDSMI_EVT_NOTIF_VMFAULT},
|
||||
{RDC_EVNT_NOTIF_FIRST, AMDSMI_EVT_NOTIF_FIRST},
|
||||
{RDC_EVNT_NOTIF_THERMAL_THROTTLE, AMDSMI_EVT_NOTIF_THERMAL_THROTTLE},
|
||||
{RDC_EVNT_NOTIF_PRE_RESET, AMDSMI_EVT_NOTIF_GPU_PRE_RESET},
|
||||
{RDC_EVNT_NOTIF_POST_RESET, AMDSMI_EVT_NOTIF_GPU_POST_RESET},
|
||||
};
|
||||
static std::unordered_map<rsmi_evt_notification_type_t, rdc_field_t> rsmi_event_notif_2_rdc_map = {
|
||||
{RSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT},
|
||||
{RSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST},
|
||||
{RSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE},
|
||||
{RSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET},
|
||||
{RSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET},
|
||||
static std::unordered_map<amdsmi_evt_notification_type_t, rdc_field_t> smi_event_notif_2_rdc_map = {
|
||||
{AMDSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT},
|
||||
{AMDSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST},
|
||||
{AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE},
|
||||
{AMDSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET},
|
||||
{AMDSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET},
|
||||
};
|
||||
|
||||
// This const determines space allocated on stack for notification events.
|
||||
@@ -63,22 +62,22 @@ RdcNotificationImpl::RdcNotificationImpl() {}
|
||||
RdcNotificationImpl::~RdcNotificationImpl() {}
|
||||
|
||||
bool RdcNotificationImpl::is_notification_event(rdc_field_t field) const {
|
||||
if (rdc_2_rsmi_event_notif_map.find(field) == rdc_2_rsmi_event_notif_map.end()) {
|
||||
if (rdc_2_smi_event_notif_map.find(field) == rdc_2_smi_event_notif_map.end()) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
|
||||
rsmi_status_t ret;
|
||||
amdsmi_status_t ret;
|
||||
std::map<uint32_t, uint64_t> new_masks;
|
||||
|
||||
for (uint32_t i = 0; i < fk_arr.size(); ++i) {
|
||||
if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) == rdc_2_rsmi_event_notif_map.end()) {
|
||||
if (rdc_2_smi_event_notif_map.find(fk_arr[i].second) == rdc_2_smi_event_notif_map.end()) {
|
||||
continue;
|
||||
}
|
||||
new_masks[fk_arr[i].first] |=
|
||||
RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]);
|
||||
AMDSMI_EVENT_MASK_FROM_INDEX(rdc_2_smi_event_notif_map[fk_arr[i].second]);
|
||||
}
|
||||
|
||||
std::map<uint32_t, uint64_t>::iterator it = new_masks.begin();
|
||||
@@ -90,6 +89,15 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKe
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get processor handle from GPU id
|
||||
amdsmi_processor_handle processor_handle;
|
||||
ret = get_processor_handle_from_id(it->first, &processor_handle);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"Failed to get processor handle for GPU " << it->first << " error: " << ret);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
// Temporarily get DAC capability
|
||||
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
|
||||
|
||||
@@ -98,15 +106,15 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKe
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
ret = rsmi_event_notification_init(it->first);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "rsmi_event_notification_init() returned "
|
||||
ret = amdsmi_init_gpu_event_notification(processor_handle);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "amdsmi_init_gpu_event_notification() returned "
|
||||
<< ret << " for device " << it->first << ". " << std::endl
|
||||
<< " Will not listen for events on this device");
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = rsmi_event_notification_mask_set(it->first, it->second);
|
||||
ret = amdsmi_set_gpu_event_notification_mask(processor_handle, it->second);
|
||||
// Release DAC capability
|
||||
sc.Relinquish();
|
||||
|
||||
@@ -115,14 +123,14 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKe
|
||||
return RDC_ST_PERM_ERROR;
|
||||
}
|
||||
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
if (ret == AMDSMI_STATUS_SUCCESS) {
|
||||
gpu_evnt_notif_masks_[it->first] = it->second;
|
||||
RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first << "is set to 0x"
|
||||
<< std::hex << it->second);
|
||||
} else {
|
||||
RDC_LOG(RDC_INFO,
|
||||
"rsmi_event_notification_mask_set() returned " << ret << " for device " << it->first);
|
||||
return Rsmi2RdcError(ret);
|
||||
RDC_LOG(RDC_INFO, "amdsmi_set_gpu_event_notification_mask() returned "
|
||||
<< ret << " for device " << it->first);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
@@ -136,12 +144,12 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32
|
||||
}
|
||||
|
||||
uint32_t f_cnt = std::min(*num_events, kMaxRSMIEvents);
|
||||
rsmi_evt_notification_data_t rsmi_events[kMaxRSMIEvents];
|
||||
amdsmi_evt_notification_data_t smi_events[kMaxRSMIEvents];
|
||||
|
||||
rsmi_status_t ret = rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events);
|
||||
amdsmi_status_t ret = amdsmi_get_gpu_event_notification(timeout_ms, &f_cnt, smi_events);
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return Rsmi2RdcError(ret);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
@@ -149,35 +157,44 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32
|
||||
*num_events = f_cnt;
|
||||
|
||||
for (uint32_t i = 0; i < f_cnt; ++i) {
|
||||
assert(rsmi_event_notif_2_rdc_map.find(rsmi_events[i].event) !=
|
||||
rsmi_event_notif_2_rdc_map.end());
|
||||
events[i].gpu_id = rsmi_events[i].dv_ind;
|
||||
events[i].field.field_id = rsmi_event_notif_2_rdc_map[rsmi_events[i].event];
|
||||
assert(smi_event_notif_2_rdc_map.find(smi_events[i].event) != smi_event_notif_2_rdc_map.end());
|
||||
uint64_t bdfid;
|
||||
amdsmi_get_gpu_bdf_id(smi_events[i].processor_handle, &bdfid);
|
||||
events[i].gpu_id = bdfid;
|
||||
events[i].field.field_id = smi_event_notif_2_rdc_map[smi_events[i].event];
|
||||
events[i].field.status = RDC_ST_OK;
|
||||
events[i].field.ts = now;
|
||||
events[i].field.type = STRING;
|
||||
strncpy_with_null(events[i].field.value.str, rsmi_events[i].message, RDC_MAX_STR_LENGTH);
|
||||
strncpy_with_null(events[i].field.value.str, smi_events[i].message, RDC_MAX_STR_LENGTH);
|
||||
}
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
|
||||
rsmi_status_t ret;
|
||||
amdsmi_status_t ret;
|
||||
|
||||
ret = rsmi_event_notification_mask_set(gpu_id, 0);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"rsmi_event_notification_mask_set() returned " << ret << " for device " << gpu_id);
|
||||
// Get processor handle from GPU id
|
||||
amdsmi_processor_handle processor_handle;
|
||||
ret = get_processor_handle_from_id(gpu_id, &processor_handle);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_id << " error: " << ret);
|
||||
return Smi2RdcError(ret);
|
||||
}
|
||||
|
||||
ret = rsmi_event_notification_stop(gpu_id);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
ret = amdsmi_set_gpu_event_notification_mask(processor_handle, 0);
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "amdsmi_set_gpu_event_notification_mask() returned " << ret << " for device "
|
||||
<< gpu_id);
|
||||
}
|
||||
|
||||
ret = amdsmi_stop_gpu_event_notification(processor_handle);
|
||||
if (ret == AMDSMI_STATUS_SUCCESS) {
|
||||
std::lock_guard<std::mutex> guard(notif_mutex_);
|
||||
gpu_evnt_notif_masks_[gpu_id] = 0;
|
||||
} else {
|
||||
RDC_LOG(RDC_ERROR,
|
||||
"rsmi_event_notification_stop() returned " << ret << " for device " << gpu_id);
|
||||
"amdsmi_stop_gpu_event_notification() returned " << ret << " for device " << gpu_id);
|
||||
}
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
@@ -21,21 +21,24 @@ THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcSmiDiagnosticImpl.h"
|
||||
|
||||
#include <amd_smi/amdsmi.h>
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/impl/RsmiUtils.h"
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
RdcSmiDiagnosticImpl::RdcSmiDiagnosticImpl() {}
|
||||
|
||||
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
rdc_status_t RdcSmiDiagnosticImpl::check_smi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -43,14 +46,14 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
|
||||
result->test_case = RDC_DIAG_COMPUTE_PROCESS;
|
||||
result->status = RDC_DIAG_RESULT_SKIP;
|
||||
result->per_gpu_result_count = 0;
|
||||
rsmi_status_t err = RSMI_STATUS_SUCCESS;
|
||||
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
|
||||
uint32_t num_items = 0;
|
||||
err = rsmi_compute_process_info_get(nullptr, &num_items);
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_gpu_compute_process_info(nullptr, &num_items);
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_ERROR, "Fail to get process information: " << err);
|
||||
strncpy_with_null(result->info, "Fail to retreive process information from rocm_smi_lib",
|
||||
strncpy_with_null(result->info, "Fail to retreive process information from amd_smi_lib",
|
||||
MAX_DIAG_MSG_LENGTH);
|
||||
return Rsmi2RdcError(err);
|
||||
return Smi2RdcError(err);
|
||||
}
|
||||
|
||||
// No process found
|
||||
@@ -63,13 +66,13 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
|
||||
|
||||
std::string info;
|
||||
// Find details of the process running on each GPU
|
||||
std::vector<rsmi_process_info_t> procs(num_items);
|
||||
err =
|
||||
rsmi_compute_process_info_get(reinterpret_cast<rsmi_process_info_t*>(&procs[0]), &num_items);
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
std::vector<amdsmi_process_info_t> procs(num_items);
|
||||
err = amdsmi_get_gpu_compute_process_info(reinterpret_cast<amdsmi_process_info_t*>(&procs[0]),
|
||||
&num_items);
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_INFO, "Fail to get process detail information: " << err);
|
||||
strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH);
|
||||
return Rsmi2RdcError(err);
|
||||
return Smi2RdcError(err);
|
||||
}
|
||||
|
||||
std::map<uint32_t, std::vector<uint32_t>> pids_per_gpu;
|
||||
@@ -85,17 +88,18 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
|
||||
|
||||
// Get the num_devices the process is running
|
||||
uint32_t num_devices = 0;
|
||||
err = rsmi_compute_process_gpus_get(procs[i].process_id, nullptr, &num_devices);
|
||||
if (err != RSMI_STATUS_SUCCESS || num_devices == 0) {
|
||||
amdsmi_status_t err;
|
||||
err = amdsmi_get_gpu_compute_process_gpus(procs[i].process_id, nullptr, &num_devices);
|
||||
if (err != AMDSMI_STATUS_SUCCESS || num_devices == 0) {
|
||||
RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get the details of devices
|
||||
std::vector<uint32_t> device_details(num_devices);
|
||||
err = rsmi_compute_process_gpus_get(
|
||||
err = amdsmi_get_gpu_compute_process_gpus(
|
||||
procs[i].process_id, reinterpret_cast<uint32_t*>(&device_details[0]), &num_devices);
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err);
|
||||
continue;
|
||||
}
|
||||
@@ -147,22 +151,22 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
std::string RdcSmiDiagnosticImpl::get_temperature_string(rsmi_temperature_type_t type) const {
|
||||
std::string RdcSmiDiagnosticImpl::get_temperature_string(amdsmi_temperature_type_t type) const {
|
||||
switch (type) {
|
||||
case RSMI_TEMP_TYPE_EDGE:
|
||||
case TEMPERATURE_TYPE_EDGE:
|
||||
return "Edge";
|
||||
case RSMI_TEMP_TYPE_JUNCTION:
|
||||
case TEMPERATURE_TYPE_JUNCTION:
|
||||
return "Junction";
|
||||
case RSMI_TEMP_TYPE_MEMORY:
|
||||
case TEMPERATURE_TYPE_VRAM:
|
||||
return "Memory";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
std::string RdcSmiDiagnosticImpl::get_voltage_string(rsmi_voltage_type_t type) const {
|
||||
std::string RdcSmiDiagnosticImpl::get_voltage_string(amdsmi_voltage_type_t type) const {
|
||||
switch (type) {
|
||||
case RSMI_VOLT_TYPE_VDDGFX:
|
||||
case AMDSMI_VOLT_TYPE_VDDGFX:
|
||||
return "Vddgfx voltage";
|
||||
default:
|
||||
return "Unknown";
|
||||
@@ -170,46 +174,49 @@ std::string RdcSmiDiagnosticImpl::get_voltage_string(rsmi_voltage_type_t type) c
|
||||
}
|
||||
|
||||
// Show topology type
|
||||
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
rdc_status_t RdcSmiDiagnosticImpl::check_smi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
*result = {};
|
||||
result->test_case = RDC_DIAG_NODE_TOPOLOGY;
|
||||
|
||||
const std::map<RSMI_IO_LINK_TYPE, std::string> link_to_string = {
|
||||
{RSMI_IOLINK_TYPE_UNDEFINED, "Undefined"},
|
||||
{RSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"},
|
||||
{RSMI_IOLINK_TYPE_XGMI, "XGMI"},
|
||||
{RSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"}};
|
||||
const std::map<amdsmi_io_link_type_t, std::string> link_to_string = {
|
||||
{AMDSMI_IOLINK_TYPE_UNDEFINED, "Undefined"},
|
||||
{AMDSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"},
|
||||
{AMDSMI_IOLINK_TYPE_XGMI, "XGMI"},
|
||||
{AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"}};
|
||||
|
||||
result->status = RDC_DIAG_RESULT_SKIP;
|
||||
result->per_gpu_result_count = 0;
|
||||
rsmi_status_t err = RSMI_STATUS_SUCCESS;
|
||||
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
|
||||
std::string info = "";
|
||||
|
||||
for (uint32_t i = 0; i < gpu_count; i++) {
|
||||
for (uint32_t j = 0; j < gpu_count; j++) {
|
||||
if (gpu_index[i] == gpu_index[j]) continue;
|
||||
std::pair<amdsmi_processor_handle, amdsmi_processor_handle> ph;
|
||||
err = get_processor_handle_from_id(gpu_index[i], &ph.first);
|
||||
err = get_processor_handle_from_id(gpu_index[i], &ph.second);
|
||||
|
||||
uint64_t weight;
|
||||
err = rsmi_topo_get_link_weight(gpu_index[i], gpu_index[j], &weight);
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_topo_get_link_weight(ph.first, ph.second, &weight);
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
result->status = RDC_DIAG_RESULT_FAIL;
|
||||
result->details.code = err;
|
||||
std::string err_info = "rsmi_topo_get_link_weight(";
|
||||
err_info += std::to_string(gpu_index[i]) + ",";
|
||||
err_info += std::to_string(gpu_index[j]) + ", &weight)";
|
||||
err_info += std::to_string(i) + ",";
|
||||
err_info += std::to_string(j) + ", &weight)";
|
||||
err_info += " fail";
|
||||
strncpy_with_null(result->details.msg, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
|
||||
strncpy_with_null(result->info, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
|
||||
return RDC_ST_MSI_ERROR;
|
||||
}
|
||||
|
||||
info += std::to_string(gpu_index[i]) + "=>";
|
||||
info += std::to_string(gpu_index[j]) + " weight:";
|
||||
info += std::to_string(i) + "=>";
|
||||
info += std::to_string(j) + " weight:";
|
||||
info += std::to_string(weight) + " ";
|
||||
}
|
||||
}
|
||||
@@ -223,9 +230,9 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(uint32_t gpu_index[RDC_M
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
rdc_status_t RdcSmiDiagnosticImpl::check_smi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
|
||||
uint32_t gpu_count,
|
||||
rdc_diag_test_result_t* result) {
|
||||
if (result == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
@@ -237,27 +244,27 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_
|
||||
|
||||
for (uint32_t i = 0; i < gpu_count; i++) {
|
||||
// temperature
|
||||
for (rsmi_temperature_type_t sensor_type = RSMI_TEMP_TYPE_FIRST;
|
||||
sensor_type != RSMI_TEMP_TYPE_LAST;) {
|
||||
for (amdsmi_temperature_type_t sensor_type = TEMPERATURE_TYPE_FIRST;
|
||||
sensor_type < TEMPERATURE_TYPE__MAX;) {
|
||||
auto status = check_temperature_level(gpu_index[i], sensor_type, result->info,
|
||||
result->gpu_results[i].gpu_result.msg);
|
||||
// Set to higher error level
|
||||
if (status > result->status) {
|
||||
result->status = status;
|
||||
}
|
||||
sensor_type = static_cast<rsmi_temperature_type_t>(sensor_type + 1);
|
||||
sensor_type = static_cast<amdsmi_temperature_type_t>(sensor_type + 1);
|
||||
}
|
||||
|
||||
// Voltage
|
||||
for (rsmi_voltage_type_t sensor_type = RSMI_VOLT_TYPE_FIRST;
|
||||
sensor_type != RSMI_VOLT_TYPE_LAST;) {
|
||||
for (amdsmi_voltage_type_t sensor_type = AMDSMI_VOLT_TYPE_FIRST;
|
||||
sensor_type < AMDSMI_VOLT_TYPE_LAST;) {
|
||||
auto status = check_voltage_level(gpu_index[i], sensor_type, result->info,
|
||||
result->gpu_results[i].gpu_result.msg);
|
||||
// Set to higher error level
|
||||
if (status > result->status) {
|
||||
result->status = status;
|
||||
}
|
||||
sensor_type = static_cast<rsmi_voltage_type_t>(sensor_type + 1);
|
||||
sensor_type = static_cast<amdsmi_voltage_type_t>(sensor_type + 1);
|
||||
}
|
||||
result->gpu_results->gpu_index = gpu_index[i];
|
||||
result->per_gpu_result_count++;
|
||||
@@ -266,24 +273,25 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_
|
||||
}
|
||||
|
||||
rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
|
||||
uint32_t gpu_index, rsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH],
|
||||
uint32_t gpu_index, amdsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH],
|
||||
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
|
||||
rdc_diag_result_t result = RDC_DIAG_RESULT_PASS;
|
||||
rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT;
|
||||
rsmi_status_t err = RSMI_STATUS_SUCCESS;
|
||||
amdsmi_temperature_metric_t met = AMDSMI_TEMP_CURRENT;
|
||||
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
|
||||
int64_t current_temp = 0;
|
||||
std::string info = msg;
|
||||
std::string per_gpu_info = per_gpu_msg;
|
||||
amdsmi_processor_handle processor_handle;
|
||||
get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
|
||||
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &current_temp);
|
||||
|
||||
if (err != RSMI_STATUS_SUCCESS) return result;
|
||||
err = amdsmi_get_temp_metric(processor_handle, type, met, &current_temp);
|
||||
if (err != AMDSMI_STATUS_SUCCESS) return result;
|
||||
|
||||
// Max temperature
|
||||
met = RSMI_TEMP_MAX;
|
||||
met = AMDSMI_TEMP_MAX;
|
||||
int64_t max_temp = 0;
|
||||
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &max_temp);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_temp_metric(processor_handle, type, met, &max_temp);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_temp >= max_temp) {
|
||||
result = RDC_DIAG_RESULT_WARN;
|
||||
per_gpu_info += "Max ";
|
||||
@@ -305,10 +313,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
|
||||
}
|
||||
}
|
||||
|
||||
met = RSMI_TEMP_MIN;
|
||||
met = AMDSMI_TEMP_MIN;
|
||||
int64_t min_temp = 0;
|
||||
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &min_temp);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_temp_metric(processor_handle, type, met, &min_temp);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_temp <= min_temp) {
|
||||
result = RDC_DIAG_RESULT_WARN;
|
||||
per_gpu_info += "Min ";
|
||||
@@ -329,10 +337,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
|
||||
}
|
||||
}
|
||||
|
||||
met = RSMI_TEMP_CRITICAL;
|
||||
met = AMDSMI_TEMP_CRITICAL;
|
||||
int64_t critical_temp = 0;
|
||||
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_temp);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_temp_metric(processor_handle, type, met, &critical_temp);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_temp >= critical_temp) {
|
||||
result = RDC_DIAG_RESULT_FAIL;
|
||||
per_gpu_info += "Critical ";
|
||||
@@ -353,10 +361,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
|
||||
}
|
||||
}
|
||||
|
||||
met = RSMI_TEMP_EMERGENCY;
|
||||
met = AMDSMI_TEMP_EMERGENCY;
|
||||
int64_t emergency_temp = 0;
|
||||
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &emergency_temp);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_temp_metric(processor_handle, type, met, &emergency_temp);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_temp >= critical_temp) {
|
||||
result = RDC_DIAG_RESULT_FAIL;
|
||||
per_gpu_info += "Emergency ";
|
||||
@@ -377,10 +385,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
|
||||
}
|
||||
}
|
||||
|
||||
met = RSMI_TEMP_CRIT_MIN;
|
||||
met = AMDSMI_TEMP_CRIT_MIN;
|
||||
int64_t critical_min_temp = 0;
|
||||
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_min_temp);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_temp_metric(processor_handle, type, met, &critical_min_temp);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_temp <= critical_min_temp) {
|
||||
result = RDC_DIAG_RESULT_FAIL;
|
||||
per_gpu_info += "Critical Min ";
|
||||
@@ -408,24 +416,26 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
|
||||
}
|
||||
|
||||
rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
|
||||
rsmi_voltage_type_t type,
|
||||
amdsmi_voltage_type_t type,
|
||||
char msg[MAX_DIAG_MSG_LENGTH],
|
||||
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
|
||||
rdc_diag_result_t result = RDC_DIAG_RESULT_PASS;
|
||||
rsmi_voltage_metric_t met = RSMI_VOLT_CURRENT;
|
||||
rsmi_status_t err = RSMI_STATUS_SUCCESS;
|
||||
amdsmi_voltage_metric_t met = AMDSMI_VOLT_CURRENT;
|
||||
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
|
||||
int64_t current_voltage = 0;
|
||||
std::string info = msg;
|
||||
std::string per_gpu_info = per_gpu_msg;
|
||||
amdsmi_processor_handle processor_handle;
|
||||
get_processor_handle_from_id(gpu_index, &processor_handle);
|
||||
|
||||
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &current_voltage);
|
||||
if (err != RSMI_STATUS_SUCCESS) return result;
|
||||
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &current_voltage);
|
||||
if (err != AMDSMI_STATUS_SUCCESS) return result;
|
||||
|
||||
// Max voltage
|
||||
met = RSMI_VOLT_MAX;
|
||||
met = AMDSMI_VOLT_MAX;
|
||||
int64_t max_volt = 0;
|
||||
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &max_volt);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &max_volt);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_voltage >= max_volt) {
|
||||
result = RDC_DIAG_RESULT_WARN;
|
||||
per_gpu_info += "Max ";
|
||||
@@ -448,10 +458,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
|
||||
}
|
||||
|
||||
// Min voltage
|
||||
met = RSMI_VOLT_MIN;
|
||||
met = AMDSMI_VOLT_MIN;
|
||||
int64_t min_volt = 0;
|
||||
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &min_volt);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &min_volt);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_voltage <= min_volt) {
|
||||
result = RDC_DIAG_RESULT_WARN;
|
||||
per_gpu_info += "Min ";
|
||||
@@ -474,10 +484,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
|
||||
}
|
||||
|
||||
// Max Critical voltage
|
||||
met = RSMI_VOLT_MAX_CRIT;
|
||||
met = AMDSMI_VOLT_MAX_CRIT;
|
||||
int64_t critical_max_volt = 0;
|
||||
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_max_volt);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_gpu_volt_metric(nullptr, type, met, &critical_max_volt);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_voltage >= critical_max_volt) {
|
||||
result = RDC_DIAG_RESULT_FAIL;
|
||||
per_gpu_info += "Critical Max ";
|
||||
@@ -500,10 +510,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
|
||||
}
|
||||
|
||||
// Min Critical voltage
|
||||
met = RSMI_VOLT_MIN_CRIT;
|
||||
met = AMDSMI_VOLT_MIN_CRIT;
|
||||
int64_t critical_min_volt = 0;
|
||||
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_min_volt);
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
err = amdsmi_get_gpu_volt_metric(nullptr, type, met, &critical_min_volt);
|
||||
if (err == AMDSMI_STATUS_SUCCESS) {
|
||||
if (current_voltage <= critical_min_volt) {
|
||||
result = RDC_DIAG_RESULT_FAIL;
|
||||
per_gpu_info += "Critical Min ";
|
||||
|
||||
@@ -42,8 +42,8 @@ RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf)
|
||||
}
|
||||
}
|
||||
|
||||
// Bulk fetch wrapper for the rocm_smi_lib. This will be replaced after
|
||||
// rocm_smi_lib can support bulk fetch.
|
||||
// Bulk fetch wrapper for the amd_smi_lib. This will be replaced after
|
||||
// amd_smi_lib can support bulk fetch.
|
||||
rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
rdc_field_value_f callback,
|
||||
@@ -52,7 +52,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocm_smi_lib.");
|
||||
RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from amd_smi_lib.");
|
||||
|
||||
// Bulk fetch fields
|
||||
std::vector<rdc_gpu_field_value_t> bulk_results;
|
||||
@@ -60,7 +60,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
|
||||
rdc_status_t status =
|
||||
metric_fetcher_->bulk_fetch_smi_fields(fields, fields_count, bulk_results);
|
||||
RDC_LOG(RDC_DEBUG, "Bulk fetched " << bulk_results.size()
|
||||
<< " fields from rocm_smi_lib which return " << status);
|
||||
<< " fields from amd_smi_lib which return " << status);
|
||||
if (bulk_results.size() > 0) {
|
||||
rdc_status_t status = callback(&bulk_results[0], bulk_results.size(), user_data);
|
||||
if (status != RDC_ST_OK) {
|
||||
@@ -116,12 +116,12 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < fields_count; i++) {
|
||||
ret = metric_fetcher_->acquire_rsmi_handle({fields[i].gpu_index, fields[i].field_id});
|
||||
ret = metric_fetcher_->acquire_smi_handle({fields[i].gpu_index, fields[i].field_id});
|
||||
if (ret != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Failed to acquire rocm_smi handle for field.");
|
||||
RDC_LOG(RDC_ERROR, "Failed to acquire amd_smi handle for field.");
|
||||
}
|
||||
}
|
||||
RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from rocm_smi_lib");
|
||||
RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from amd_smi_lib");
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
@@ -133,9 +133,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < fields_count; i++) {
|
||||
metric_fetcher_->delete_rsmi_handle({fields[i].gpu_index, fields[i].field_id});
|
||||
metric_fetcher_->delete_smi_handle({fields[i].gpu_index, fields[i].field_id});
|
||||
}
|
||||
RDC_LOG(RDC_DEBUG, "delete " << fields_count << " field handles from rocm_smi_lib");
|
||||
RDC_LOG(RDC_DEBUG, "delete " << fields_count << " field handles from amd_smi_lib");
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
@@ -146,7 +146,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
// List of fields supported by rocm_smi_lib
|
||||
// List of fields supported by amd_smi_lib
|
||||
const std::vector<uint32_t> fields{
|
||||
RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME,
|
||||
RDC_FI_GPU_CLOCK, RDC_FI_MEM_CLOCK,
|
||||
@@ -192,11 +192,11 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
|
||||
}
|
||||
switch (test_case) {
|
||||
case RDC_DIAG_COMPUTE_PROCESS:
|
||||
return smi_diag_->check_rsmi_process_info(gpu_index, gpu_count, result);
|
||||
return smi_diag_->check_smi_process_info(gpu_index, gpu_count, result);
|
||||
case RDC_DIAG_NODE_TOPOLOGY:
|
||||
return smi_diag_->check_rsmi_topo_info(gpu_index, gpu_count, result);
|
||||
return smi_diag_->check_smi_topo_info(gpu_index, gpu_count, result);
|
||||
case RDC_DIAG_GPU_PARAMETERS:
|
||||
return smi_diag_->check_rsmi_param_info(gpu_index, gpu_count, result);
|
||||
return smi_diag_->check_smi_param_info(gpu_index, gpu_count, result);
|
||||
default:
|
||||
return RDC_ST_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Maps a rocm_smi status code onto the RDC status code handed back to RDC
// callers. Any code without a dedicated RDC counterpart becomes
// RDC_ST_UNKNOWN_ERROR.
rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi) {
  rdc_status_t mapped = RDC_ST_UNKNOWN_ERROR;

  switch (rsmi) {
    case RSMI_STATUS_SUCCESS:
      mapped = RDC_ST_OK;
      break;
    case RSMI_STATUS_INVALID_ARGS:
      mapped = RDC_ST_BAD_PARAMETER;
      break;
    case RSMI_STATUS_NOT_SUPPORTED:
      mapped = RDC_ST_NOT_SUPPORTED;
      break;
    case RSMI_STATUS_NOT_FOUND:
      mapped = RDC_ST_NOT_FOUND;
      break;
    case RSMI_STATUS_OUT_OF_RESOURCES:
      mapped = RDC_ST_INSUFF_RESOURCES;
      break;
    case RSMI_STATUS_FILE_ERROR:
      mapped = RDC_ST_FILE_ERROR;
      break;
    case RSMI_STATUS_NO_DATA:
      mapped = RDC_ST_NO_DATA;
      break;
    case RSMI_STATUS_PERMISSION:
      mapped = RDC_ST_PERM_ERROR;
      break;

    // Listed explicitly so the mapping stays visible; all of these share the
    // generic "unknown error" result.
    case RSMI_STATUS_BUSY:
    case RSMI_STATUS_UNKNOWN_ERROR:
    case RSMI_STATUS_INTERNAL_EXCEPTION:
    case RSMI_STATUS_INPUT_OUT_OF_BOUNDS:
    case RSMI_STATUS_INIT_ERROR:
    case RSMI_STATUS_NOT_YET_IMPLEMENTED:
    case RSMI_STATUS_INSUFFICIENT_SIZE:
    case RSMI_STATUS_INTERRUPT:
    case RSMI_STATUS_UNEXPECTED_SIZE:
    case RSMI_STATUS_UNEXPECTED_DATA:
    case RSMI_STATUS_REFCOUNT_OVERFLOW:
    default:
      break;
  }

  return mapped;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "rdc_lib/impl/SmiUtils.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Maps an amd_smi status code onto the RDC status code handed back to RDC
// callers. Any code without a dedicated RDC counterpart becomes
// RDC_ST_UNKNOWN_ERROR.
rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) {
  rdc_status_t mapped = RDC_ST_UNKNOWN_ERROR;

  switch (rsmi) {
    case AMDSMI_STATUS_SUCCESS:
      mapped = RDC_ST_OK;
      break;
    case AMDSMI_STATUS_INVAL:
      mapped = RDC_ST_BAD_PARAMETER;
      break;
    case AMDSMI_STATUS_NOT_SUPPORTED:
      mapped = RDC_ST_NOT_SUPPORTED;
      break;
    case AMDSMI_STATUS_NOT_FOUND:
      mapped = RDC_ST_NOT_FOUND;
      break;
    case AMDSMI_STATUS_OUT_OF_RESOURCES:
      mapped = RDC_ST_INSUFF_RESOURCES;
      break;
    case AMDSMI_STATUS_FILE_ERROR:
      mapped = RDC_ST_FILE_ERROR;
      break;
    case AMDSMI_STATUS_NO_DATA:
      mapped = RDC_ST_NO_DATA;
      break;
    case AMDSMI_STATUS_NO_PERM:
      mapped = RDC_ST_PERM_ERROR;
      break;

    // Listed explicitly so the mapping stays visible; all of these share the
    // generic "unknown error" result.
    case AMDSMI_STATUS_BUSY:
    case AMDSMI_STATUS_UNKNOWN_ERROR:
    case AMDSMI_STATUS_INTERNAL_EXCEPTION:
    case AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS:
    case AMDSMI_STATUS_INIT_ERROR:
    case AMDSMI_STATUS_NOT_YET_IMPLEMENTED:
    case AMDSMI_STATUS_INSUFFICIENT_SIZE:
    case AMDSMI_STATUS_INTERRUPT:
    case AMDSMI_STATUS_UNEXPECTED_SIZE:
    case AMDSMI_STATUS_UNEXPECTED_DATA:
    case AMDSMI_STATUS_REFCOUNT_OVERFLOW:
    default:
      break;
  }

  return mapped;
}
|
||||
|
||||
// Resolves a flat, zero-based GPU index into an amd_smi processor handle.
//
// Every socket reported by amd_smi is queried for its processors; the handles
// are concatenated in enumeration order and the gpu_id-th one is returned.
//
// Parameters:
//   gpu_id            index into the flattened list of processors
//   processor_handle  [out] receives the handle; left untouched on failure
//
// Returns:
//   AMDSMI_STATUS_SUCCESS              handle written to *processor_handle
//   AMDSMI_STATUS_INVAL                processor_handle is null
//   AMDSMI_STATUS_NOT_SUPPORTED        a processor that is not AMD_GPU was found
//   AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS  gpu_id is past the last processor
//   any other value                    first error reported by amd_smi
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
                                             amdsmi_processor_handle* processor_handle) {
  if (processor_handle == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // First call only asks how many sockets exist.
  uint32_t socket_count = 0;
  auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  std::vector<amdsmi_socket_handle> sockets(socket_count);
  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Without this check the loop below would walk unfilled handles.
    return ret;
  }
  // Only iterate over handles that were actually written.
  if (socket_count < sockets.size()) {
    sockets.resize(socket_count);
  }

  std::vector<amdsmi_processor_handle> all_processors{};
  for (auto& socket : sockets) {
    uint32_t processor_count = 0;
    ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }

    std::vector<amdsmi_processor_handle> processors(processor_count);
    ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data());
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }
    if (processor_count < processors.size()) {
      processors.resize(processor_count);
    }

    for (auto& processor : processors) {
      processor_type_t processor_type = {};
      ret = amdsmi_get_processor_type(processor, &processor_type);
      if (ret != AMDSMI_STATUS_SUCCESS) {
        // Report the real failure instead of misreporting it as a
        // "not a GPU" condition based on the zero-initialized type.
        return ret;
      }
      if (processor_type != AMD_GPU) {
        RDC_LOG(RDC_ERROR, "Expect AMD_GPU device type!");
        return AMDSMI_STATUS_NOT_SUPPORTED;
      }
      all_processors.push_back(processor);
    }
  }

  if (gpu_id >= all_processors.size()) {
    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
  }

  // Get processor handle from GPU id
  *processor_handle = all_processors[gpu_id];

  return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
// Counts the processors reported by amd_smi across all sockets.
//
// Parameters:
//   all_processor_count  [out] receives the summed per-socket processor count;
//                        left untouched on failure
//
// Returns AMDSMI_STATUS_SUCCESS, or the first error reported by amd_smi.
amdsmi_status_t get_processor_count(uint32_t& all_processor_count) {
  // First call only asks how many sockets exist.
  uint32_t socket_count = 0;
  auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  std::vector<amdsmi_socket_handle> sockets(socket_count);
  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Without this check the loop below would walk unfilled handles.
    return ret;
  }
  // Only iterate over handles that were actually written.
  if (socket_count < sockets.size()) {
    sockets.resize(socket_count);
  }

  uint32_t total_processor_count = 0;
  for (auto& socket : sockets) {
    uint32_t processor_count = 0;
    ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }
    total_processor_count += processor_count;
  }

  all_processor_count = total_processor_count;
  return AMDSMI_STATUS_SUCCESS;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -34,7 +34,7 @@ if(BUILD_ROCPTEST)
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${COMMON_DIR}"
|
||||
"${RSMI_INC_DIR}"
|
||||
"${SMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include"
|
||||
"${ROCM_DIR}/include/hsa")
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ if(BUILD_ROCRTEST)
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${COMMON_DIR}"
|
||||
"${RSMI_INC_DIR}"
|
||||
"${SMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include")
|
||||
|
||||
# Set the VERSION and SOVERSION values
|
||||
|
||||
@@ -32,20 +32,18 @@ if(BUILD_RVS)
|
||||
find_package(hsa-runtime64 REQUIRED)
|
||||
find_package(rvs REQUIRED
|
||||
HINTS ${ROCM_DIR}/lib/cmake)
|
||||
find_library(rvslib REQUIRED
|
||||
NAMES rvslib)
|
||||
|
||||
## additional libraries
|
||||
set(COMBINED_LIBS rocblas hsakmt hsa-runtime64 hip::amdhip64 yaml-cpp)
|
||||
|
||||
set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_RVS_LIB} PARENT_SCOPE)
|
||||
add_library(${RDC_RVS_LIB} SHARED ${RDC_RVS_LIB_SRC_LIST} ${RDC_RVS_LIB_INC_LIST})
|
||||
target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvslib} pthread dl ${COMBINED_LIBS})
|
||||
target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvs} pthread dl ${COMBINED_LIBS})
|
||||
target_include_directories(${RDC_RVS_LIB} PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${COMMON_DIR}"
|
||||
"${RSMI_INC_DIR}"
|
||||
"${SMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include"
|
||||
"${ROCM_DIR}/include/hsa"
|
||||
"${ROCM_VALIDATION_SUITE_INCLUDE_DIR}")
|
||||
|
||||
@@ -34,8 +34,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
|
||||
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
|
||||
message("--------SMI Lib Dir: " ${SMI_LIB_DIR})
|
||||
message("--------SMI Inc Dir: " ${SMI_INC_DIR})
|
||||
message("-------GRPC ROOT Dir: " ${GRPC_ROOT})
|
||||
message("")
|
||||
|
||||
|
||||
@@ -31,8 +31,8 @@ message("----------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
message("----------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
message("----------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("----------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("----------RSMI Lib Dir: " ${RSMI_LIB_DIR})
|
||||
message("----------RSMI Inc Dir: " ${RSMI_INC_DIR})
|
||||
message("----------SMI Lib Dir: " ${SMI_LIB_DIR})
|
||||
message("----------SMI Inc Dir: " ${SMI_INC_DIR})
|
||||
message("---------GRPC Root Dir: " ${GRPC_ROOT})
|
||||
message("")
|
||||
|
||||
@@ -59,7 +59,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${GRPC_ROOT}/include"
|
||||
"${PROTOB_OUT_DIR}"
|
||||
"${RSMI_INC_DIR}"
|
||||
"${SMI_INC_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}")
|
||||
|
||||
set(SERVER_SRC_LIST
|
||||
@@ -68,7 +68,6 @@ set(SERVER_SRC_LIST
|
||||
"${PROTOBUF_GENERATED_SRCS}"
|
||||
"${SRC_DIR}/rdc_admin_service.cc"
|
||||
"${SRC_DIR}/rdc_api_service.cc"
|
||||
"${SRC_DIR}/rdc_rsmi_service.cc"
|
||||
"${SRC_DIR}/rdc_server_main.cc")
|
||||
message("SERVER_SRC_LIST=${SERVER_SRC_LIST}")
|
||||
|
||||
@@ -76,7 +75,7 @@ set(SERVER_DAEMON_EXE "rdcd")
|
||||
configure_file("rdc.service.in" "${PROJECT_BINARY_DIR}/rdc.service" @ONLY)
|
||||
set(SERVICE_FILE_NAME "rdc.service")
|
||||
|
||||
link_directories(${RSMI_LIB_DIR})
|
||||
link_directories(${SMI_LIB_DIR})
|
||||
|
||||
add_executable(${SERVER_DAEMON_EXE} "${SERVER_SRC_LIST}")
|
||||
|
||||
@@ -85,7 +84,7 @@ set_target_properties(${SERVER_DAEMON_EXE}
|
||||
PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib")
|
||||
|
||||
target_link_libraries(${SERVER_DAEMON_EXE} pthread rt gRPC::grpc++
|
||||
cap dl rocm_smi64 rdc_bootstrap)
|
||||
cap dl amd_smi rdc_bootstrap)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${SERVER_DAEMON_EXE}
|
||||
PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE GROUP_READ
|
||||
|
||||
@@ -22,9 +22,9 @@ THE SOFTWARE.
|
||||
#ifndef SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_
|
||||
#define SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_admin_service.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_
|
||||
#define SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_
|
||||
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_rsmi_service.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RsmiServiceImpl final : public ::rdc::Rsmi::Service {
|
||||
public:
|
||||
RsmiServiceImpl();
|
||||
~RsmiServiceImpl();
|
||||
|
||||
rsmi_status_t Initialize(uint64_t rsmi_init_flags = 0);
|
||||
|
||||
::grpc::Status GetNumDevices(::grpc::ServerContext* context,
|
||||
const ::rdc::GetNumDevicesRequest* request,
|
||||
::rdc::GetNumDevicesResponse* reply) override;
|
||||
|
||||
::grpc::Status GetTemperature(::grpc::ServerContext* context,
|
||||
const ::rdc::GetTemperatureRequest* request,
|
||||
::rdc::GetTemperatureResponse* response) override;
|
||||
|
||||
::grpc::Status GetFanRpms(::grpc::ServerContext* context, const ::rdc::GetFanRpmsRequest* request,
|
||||
::rdc::GetFanRpmsResponse* response) override;
|
||||
|
||||
::grpc::Status GetFanSpeed(::grpc::ServerContext* context,
|
||||
const ::rdc::GetFanSpeedRequest* request,
|
||||
::rdc::GetFanSpeedResponse* response) override;
|
||||
|
||||
::grpc::Status GetFanSpeedMax(::grpc::ServerContext* context,
|
||||
const ::rdc::GetFanSpeedMaxRequest* request,
|
||||
::rdc::GetFanSpeedMaxResponse* response) override;
|
||||
|
||||
private:
|
||||
bool rsmi_initialized_;
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_
|
||||
@@ -29,7 +29,6 @@ THE SOFTWARE.
|
||||
|
||||
#include "rdc/rdc_admin_service.h"
|
||||
#include "rdc/rdc_api_service.h"
|
||||
#include "rdc/rdc_rsmi_service.h"
|
||||
|
||||
typedef struct {
|
||||
std::string listen_address;
|
||||
@@ -49,9 +48,6 @@ class RDCServer {
|
||||
void Run(void);
|
||||
void ShutDown(void);
|
||||
|
||||
bool start_rsmi_service(void) const { return start_rsmi_service_; }
|
||||
void set_start_rsmi_service(bool s) { start_rsmi_service_ = s; }
|
||||
|
||||
bool start_rdc_admin_service(void) const { return start_rdc_admin_service_; }
|
||||
void set_start_rdc_admin_service(bool s) { start_rdc_admin_service_ = s; }
|
||||
|
||||
@@ -68,8 +64,6 @@ class RDCServer {
|
||||
bool secure_creds_;
|
||||
bool use_pinned_certs_;
|
||||
bool log_debug_;
|
||||
bool start_rsmi_service_;
|
||||
amd::rdc::RsmiServiceImpl* rsmi_service_;
|
||||
RdcdCmdLineOpts* cmd_line_;
|
||||
|
||||
bool start_rdc_admin_service_;
|
||||
|
||||
@@ -1,175 +0,0 @@
|
||||
|
||||
/*
|
||||
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc/rdc_rsmi_service.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <grpcpp/grpcpp.h>
|
||||
|
||||
#include <csignal>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// A freshly constructed service has not initialized rocm_smi yet.
RsmiServiceImpl::RsmiServiceImpl() : rsmi_initialized_(false) {}

// Shuts rocm_smi down, but only when this instance is marked as initialized.
RsmiServiceImpl::~RsmiServiceImpl() {
  if (!rsmi_initialized_) {
    return;
  }

  const rsmi_status_t shutdown_status = rsmi_shut_down();
  rsmi_initialized_ = false;
  assert(shutdown_status == RSMI_STATUS_SUCCESS);
}
|
||||
|
||||
// rsmi and rdc currently happen to have a 1-to-1 mapping, but
|
||||
// have this function in case that changes
|
||||
static rsmi_temperature_metric_t rdc_temp2rsmi_temp(
|
||||
::rdc::GetTemperatureRequest_TemperatureMetric rdc_temp) {
|
||||
return static_cast<rsmi_temperature_metric_t>(rdc_temp);
|
||||
}
|
||||
|
||||
// Calls rsmi_init() with the given flags. On success the instance is marked
// initialized; on failure a message is printed to stdout. The rsmi status is
// returned to the caller either way.
rsmi_status_t RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) {
  const rsmi_status_t init_status = rsmi_init(rsmi_init_flags);

  if (init_status == RSMI_STATUS_SUCCESS) {
    rsmi_initialized_ = true;
    return init_status;
  }

  std::cout << "rsmi_init() returned error" << std::endl;
  return init_status;
}
|
||||
|
||||
// RPC handler: reports the number of devices monitored by rocm_smi.
//
// The rsmi status is forwarded in the reply's ret_val field; the gRPC status
// itself is always OK. On an rsmi failure the reported count is 0 rather than
// an indeterminate value.
::grpc::Status RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context,
                                              const ::rdc::GetNumDevicesRequest* request,
                                              ::rdc::GetNumDevicesResponse* reply) {
  (void)context;  // Quiet warning for now;
  (void)request;
  assert(reply != nullptr);

  // Initialized so that a failed query never sends an indeterminate value.
  uint32_t num_devices = 0;
  rsmi_status_t ret = rsmi_num_monitor_devices(&num_devices);

  // TODO(cfreehil) replace below with macro
  if (ret != RSMI_STATUS_SUCCESS) {
    std::cout << "rsmi_num_monitor_devices() returned error" << std::endl;
  }
  reply->set_val(num_devices);
  reply->set_ret_val(ret);

  return ::grpc::Status::OK;
}
|
||||
|
||||
// RPC handler: reads one temperature metric for the requested device/sensor.
//
// The rsmi status is forwarded in the response's ret_val field; the gRPC
// status itself is always OK. On an rsmi failure the reported temperature is
// 0 rather than an indeterminate value.
::grpc::Status RsmiServiceImpl::GetTemperature(::grpc::ServerContext* context,
                                               const ::rdc::GetTemperatureRequest* request,
                                               ::rdc::GetTemperatureResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(response != nullptr);

  // Initialized so that a failed query never sends an indeterminate value.
  int64_t temperature = 0;
  rsmi_status_t ret = rsmi_dev_temp_metric_get(request->dv_ind(), request->sensor_type(),
                                               rdc_temp2rsmi_temp(request->metric()), &temperature);

  response->set_temperature(temperature);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
|
||||
|
||||
// RPC handler: reads the fan speed in RPMs for the requested device/sensor.
//
// The rsmi status is forwarded in the response's ret_val field; the gRPC
// status itself is always OK. On an rsmi failure the reported value is 0
// rather than an indeterminate value.
::grpc::Status RsmiServiceImpl::GetFanRpms(::grpc::ServerContext* context,
                                           const ::rdc::GetFanRpmsRequest* request,
                                           ::rdc::GetFanRpmsResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(response != nullptr);

  // Initialized so that a failed query never sends an indeterminate value.
  int64_t rpms = 0;
  rsmi_status_t ret = rsmi_dev_fan_rpms_get(request->dv_ind(), request->sensor_ind(), &rpms);

  response->set_rpms(rpms);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler that forwards a fan-speed query to rsmi_dev_fan_speed_get().
//
// The device index and sensor index are taken from the request. The RSMI
// status is stored in the response's ret_val field and the reading in its
// speed field. The gRPC status returned to the caller is always OK.
::grpc::Status RsmiServiceImpl::GetFanSpeed(::grpc::ServerContext* context,
                                            const ::rdc::GetFanSpeedRequest* request,
                                            ::rdc::GetFanSpeedResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(request != nullptr);
  assert(response != nullptr);

  // Zero-initialized so that the response never carries an indeterminate
  // value if the RSMI call fails without writing to the output parameter.
  int64_t speed = 0;
  rsmi_status_t ret = rsmi_dev_fan_speed_get(request->dv_ind(), request->sensor_ind(), &speed);

  response->set_speed(speed);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
|
||||
|
||||
// gRPC handler that forwards a maximum-fan-speed query to
// rsmi_dev_fan_speed_max_get().
//
// The device index and sensor index are taken from the request. The RSMI
// status is stored in the response's ret_val field and the reading in its
// max_speed field. The gRPC status returned to the caller is always OK.
::grpc::Status RsmiServiceImpl::GetFanSpeedMax(::grpc::ServerContext* context,
                                               const ::rdc::GetFanSpeedMaxRequest* request,
                                               ::rdc::GetFanSpeedMaxResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(request != nullptr);
  assert(response != nullptr);

  // Zero-initialized so that the response never carries an indeterminate
  // value if the RSMI call fails without writing to the output parameter.
  uint64_t max_speed = 0;
  rsmi_status_t ret =
      rsmi_dev_fan_speed_max_get(request->dv_ind(), request->sensor_ind(), &max_speed);

  response->set_max_speed(max_speed);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
|
||||
|
||||
// TODO(cfreehil): read server config from YAML file. Config can include things
|
||||
// like server address, Secure/Insecure creds, rsmi_init flags, etc.
|
||||
void RunServer() {
|
||||
std::string server_address("0.0.0.0:50051");
|
||||
RsmiServiceImpl service;
|
||||
|
||||
::grpc::ServerBuilder builder;
|
||||
// Listen on the given address without any authentication mechanism.
|
||||
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
|
||||
// Register "service" as the instance through which we'll communicate with
|
||||
// clients. In this case it corresponds to an *synchronous* service.
|
||||
builder.RegisterService(&service);
|
||||
// Finally assemble the server.
|
||||
std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
|
||||
std::cout << "Server listening on " << server_address << std::endl;
|
||||
|
||||
uint64_t flags = 0; // TODO(cfreehil) Read this from config file
|
||||
rsmi_status_t rsmi_ret = rsmi_init(flags);
|
||||
// TODO(cfreehil): check rsmi return code
|
||||
// Wait for the server to shutdown. Note that some other thread must be
|
||||
// responsible for shutting down the server for this call to ever return.
|
||||
if (rsmi_ret != RSMI_STATUS_SUCCESS) {
|
||||
std::cout << "rsmi_init() returned error. Exiting" << std::endl;
|
||||
return;
|
||||
}
|
||||
server->Wait();
|
||||
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -38,12 +38,11 @@ THE SOFTWARE.
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "common/rdc_capabilities.h"
|
||||
#include "common/rdc_utils.h"
|
||||
#include "rdc.grpc.pb.h" // NOLINT
|
||||
#include "rdc/rdc_api_service.h"
|
||||
#include "rdc/rdc_rsmi_service.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
// TODO(cfreehil):
|
||||
// The following need to be made configurable (e.g., from YAML):
|
||||
@@ -76,8 +75,7 @@ static const char* kDefaultListenAddress = "0.0.0.0";
|
||||
static const char* kDefaultListenPort = "50051";
|
||||
static const uint32_t kRSMIUMask = 027;
|
||||
|
||||
RDCServer::RDCServer()
|
||||
: secure_creds_(false), rsmi_service_(nullptr), rdc_admin_service_(nullptr) {}
|
||||
RDCServer::RDCServer() : secure_creds_(false), rdc_admin_service_(nullptr) {}
|
||||
|
||||
RDCServer::~RDCServer() {}
|
||||
|
||||
@@ -195,18 +193,6 @@ void RDCServer::Run() {
|
||||
builder.RegisterService(rdc_admin_service_);
|
||||
}
|
||||
|
||||
if (start_rsmi_service()) {
|
||||
rsmi_service_ = new amd::rdc::RsmiServiceImpl();
|
||||
builder.RegisterService(rsmi_service_);
|
||||
|
||||
rsmi_status_t ret = rsmi_service_->Initialize(0);
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
std::cerr << "Failed to start RSMI service. ret = " << ret << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (start_api_service()) {
|
||||
api_service_ = new amd::rdc::RdcAPIServiceImpl();
|
||||
builder.RegisterService(api_service_);
|
||||
@@ -287,11 +273,6 @@ static int FileOwner(const char* fn, std::string* owner) {
|
||||
void RDCServer::ShutDown(void) {
|
||||
server_->Shutdown();
|
||||
|
||||
if (rsmi_service_) {
|
||||
delete rsmi_service_;
|
||||
rsmi_service_ = nullptr;
|
||||
}
|
||||
|
||||
if (rdc_admin_service_) {
|
||||
delete rdc_admin_service_;
|
||||
rdc_admin_service_ = nullptr;
|
||||
@@ -673,7 +654,6 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
// TODO(cfreehil): Eventually, set these by reading a config file
|
||||
rdc_server.set_start_rsmi_service(true);
|
||||
rdc_server.set_start_rdc_admin_service(true);
|
||||
rdc_server.set_start_api_service(true);
|
||||
|
||||
|
||||
@@ -26,9 +26,9 @@ THE SOFTWARE.
|
||||
|
||||
// This file is generated on build.
|
||||
|
||||
#define rocm_smi_VERSION_MAJOR @rocm_smi_VERSION_MAJOR@
|
||||
#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@
|
||||
#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
|
||||
#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
|
||||
#define amd_smi_VERSION_MAJOR @amd_smi_VERSION_MAJOR@
|
||||
#define amd_smi_VERSION_MINOR @amd_smi_VERSION_MINOR@
|
||||
#define amd_smi_VERSION_PATCH @amd_smi_VERSION_PATCH@
|
||||
#define amd_smi_VERSION_BUILD "@amd_smi_VERSION_BUILD@"
|
||||
|
||||
#endif // INCLUDE_RDC_RDC64CONFIG_H_
|
||||
|
||||
@@ -45,7 +45,7 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
|
||||
message("--------SMI Inc Dir: " ${SMI_INC_DIR})
|
||||
message("")
|
||||
|
||||
set(SRC_DIR "${PROJECT_SOURCE_DIR}/tests/example")
|
||||
@@ -69,7 +69,7 @@ add_executable(${TEST_CLIENT_EXE} "${EXAMPLE_SRC_LIST}")
|
||||
|
||||
target_include_directories(${TEST_CLIENT_EXE} PRIVATE
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/../../client/include"
|
||||
"${RSMI_INC_DIR}")
|
||||
"${SMI_INC_DIR}")
|
||||
|
||||
target_link_libraries(${TEST_CLIENT_EXE} rdc_client)
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ THE SOFTWARE.
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "amd_smi/amdsmi.h"
|
||||
|
||||
#define CHK_RET_STATUS(RET) \
|
||||
if ((RET) != RDC_STATUS_SUCCESS) { \
|
||||
|
||||
@@ -41,8 +41,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
|
||||
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
|
||||
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
|
||||
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
|
||||
message("--------SMI Lib Dir: " ${SMI_LIB_DIR})
|
||||
message("--------SMI Inc Dir: " ${SMI_INC_DIR})
|
||||
message("")
|
||||
|
||||
set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
@@ -63,7 +63,7 @@ aux_source_directory(${SRC_DIR} rdctstSources)
|
||||
# Other source directories
|
||||
aux_source_directory(${SRC_DIR}/functional functionalSources)
|
||||
|
||||
link_directories(${ROCM_INSTALL_DIR} ${RSMI_LIB_DIR})
|
||||
link_directories(${ROCM_INSTALL_DIR} ${SMI_LIB_DIR})
|
||||
|
||||
# Build rules
|
||||
add_executable(${RDCTST} ${rdctstSources} ${functionalSources})
|
||||
@@ -72,7 +72,7 @@ add_executable(${RDCTST} ${rdctstSources} ${functionalSources})
|
||||
target_include_directories(
|
||||
${RDCTST}
|
||||
PUBLIC ${PROJECT_SOURCE_DIR}/include
|
||||
PUBLIC ${RSMI_INC_DIR}
|
||||
PUBLIC ${SMI_INC_DIR}
|
||||
PUBLIC ${SRC_DIR}/..)
|
||||
|
||||
target_link_libraries(${RDCTST}
|
||||
|
||||
@@ -29,6 +29,7 @@ THE SOFTWARE.
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "functional/rdci_discovery.h"
|
||||
#include "functional/rdci_dmon.h"
|
||||
#include "functional/rdci_fieldgroup.h"
|
||||
@@ -37,7 +38,6 @@ THE SOFTWARE.
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_tests/test_base.h"
|
||||
#include "rdc_tests/test_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
static RDCTstGlobals* sRDCGlvalues = nullptr;
|
||||
|
||||
|
||||
@@ -24,8 +24,8 @@ THE SOFTWARE.
|
||||
#include <assert.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc_tests/test_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
static const int kOutputLineLength = 80;
|
||||
static const char kLabelDelimiter[] = "####";
|
||||
|
||||
@@ -30,8 +30,8 @@ THE SOFTWARE.
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rdc_tests/test_base.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
/*static const std::map<grpc_connectivity_state, const char *> kGRPCChanState =
|
||||
{
|
||||
@@ -47,40 +47,40 @@ THE SOFTWARE.
|
||||
},
|
||||
};
|
||||
*/
|
||||
static const std::map<rsmi_gpu_block_t, const char*> kBlockNameMap = {
|
||||
{RSMI_GPU_BLOCK_UMC, "UMC"}, {RSMI_GPU_BLOCK_SDMA, "SDMA"},
|
||||
{RSMI_GPU_BLOCK_GFX, "GFX"}, {RSMI_GPU_BLOCK_MMHUB, "MMHUB"},
|
||||
{RSMI_GPU_BLOCK_ATHUB, "ATHUB"}, {RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"},
|
||||
{RSMI_GPU_BLOCK_HDP, "HDP"}, {RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"},
|
||||
{RSMI_GPU_BLOCK_DF, "DF"}, {RSMI_GPU_BLOCK_SMN, "SMN"},
|
||||
{RSMI_GPU_BLOCK_SEM, "SEM"}, {RSMI_GPU_BLOCK_MP0, "MP0"},
|
||||
{RSMI_GPU_BLOCK_MP1, "MP1"}, {RSMI_GPU_BLOCK_FUSE, "FUSE"},
|
||||
static const std::map<amdsmi_gpu_block_t, const char*> kBlockNameMap = {
|
||||
{AMDSMI_GPU_BLOCK_UMC, "UMC"}, {AMDSMI_GPU_BLOCK_SDMA, "SDMA"},
|
||||
{AMDSMI_GPU_BLOCK_GFX, "GFX"}, {AMDSMI_GPU_BLOCK_MMHUB, "MMHUB"},
|
||||
{AMDSMI_GPU_BLOCK_ATHUB, "ATHUB"}, {AMDSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"},
|
||||
{AMDSMI_GPU_BLOCK_HDP, "HDP"}, {AMDSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"},
|
||||
{AMDSMI_GPU_BLOCK_DF, "DF"}, {AMDSMI_GPU_BLOCK_SMN, "SMN"},
|
||||
{AMDSMI_GPU_BLOCK_SEM, "SEM"}, {AMDSMI_GPU_BLOCK_MP0, "MP0"},
|
||||
{AMDSMI_GPU_BLOCK_MP1, "MP1"}, {AMDSMI_GPU_BLOCK_FUSE, "FUSE"},
|
||||
};
|
||||
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated");
|
||||
static_assert(AMDSMI_GPU_BLOCK_LAST == AMDSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated");
|
||||
|
||||
static const char* kRasErrStateStrings[] = {
|
||||
"None", // RSMI_RAS_ERR_STATE_NONE
|
||||
"Disabled", // RSMI_RAS_ERR_STATE_DISABLED
|
||||
"Error Unknown", // RSMI_RAS_ERR_STATE_PARITY
|
||||
"Single, Correctable", // RSMI_RAS_ERR_STATE_SING_C
|
||||
"Multiple, Uncorrectable", // RSMI_RAS_ERR_STATE_MULT_UC
|
||||
"Poison" // RSMI_RAS_ERR_STATE_POISON
|
||||
"Off", // RSMI_RAS_ERR_STATE_DISABLED
|
||||
"On", // RSMI_RAS_ERR_STATE_ENABLED
|
||||
"None", // AMDSMI_RAS_ERR_STATE_NONE
|
||||
"Disabled", // AMDSMI_RAS_ERR_STATE_DISABLED
|
||||
"Error Unknown", // AMDSMI_RAS_ERR_STATE_PARITY
|
||||
"Single, Correctable", // AMDSMI_RAS_ERR_STATE_SING_C
|
||||
"Multiple, Uncorrectable", // AMDSMI_RAS_ERR_STATE_MULT_UC
|
||||
"Poison" // AMDSMI_RAS_ERR_STATE_POISON
|
||||
"Off", // AMDSMI_RAS_ERR_STATE_DISABLED
|
||||
"On", // AMDSMI_RAS_ERR_STATE_ENABLED
|
||||
};
|
||||
static_assert(sizeof(kRasErrStateStrings) / sizeof(char*) == (RSMI_RAS_ERR_STATE_LAST + 1),
|
||||
static_assert(sizeof(kRasErrStateStrings) / sizeof(char*) == (AMDSMI_RAS_ERR_STATE_LAST + 1),
|
||||
"kErrStateNameMap needs to be updated");
|
||||
|
||||
static const std::map<rsmi_ras_err_state_t, const char*> kErrStateNameMap = {
|
||||
{RSMI_RAS_ERR_STATE_NONE, kRasErrStateStrings[RSMI_RAS_ERR_STATE_NONE]},
|
||||
{RSMI_RAS_ERR_STATE_DISABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_DISABLED]},
|
||||
{RSMI_RAS_ERR_STATE_PARITY, kRasErrStateStrings[RSMI_RAS_ERR_STATE_PARITY]},
|
||||
{RSMI_RAS_ERR_STATE_SING_C, kRasErrStateStrings[RSMI_RAS_ERR_STATE_SING_C]},
|
||||
{RSMI_RAS_ERR_STATE_MULT_UC, kRasErrStateStrings[RSMI_RAS_ERR_STATE_MULT_UC]},
|
||||
{RSMI_RAS_ERR_STATE_POISON, kRasErrStateStrings[RSMI_RAS_ERR_STATE_POISON]},
|
||||
{RSMI_RAS_ERR_STATE_ENABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_ENABLED]},
|
||||
static const std::map<amdsmi_ras_err_state_t, const char*> kErrStateNameMap = {
|
||||
{AMDSMI_RAS_ERR_STATE_NONE, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_NONE]},
|
||||
{AMDSMI_RAS_ERR_STATE_DISABLED, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_DISABLED]},
|
||||
{AMDSMI_RAS_ERR_STATE_PARITY, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_PARITY]},
|
||||
{AMDSMI_RAS_ERR_STATE_SING_C, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_SING_C]},
|
||||
{AMDSMI_RAS_ERR_STATE_MULT_UC, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_MULT_UC]},
|
||||
{AMDSMI_RAS_ERR_STATE_POISON, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_POISON]},
|
||||
{AMDSMI_RAS_ERR_STATE_ENABLED, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_ENABLED]},
|
||||
};
|
||||
static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_ENABLED,
|
||||
static_assert(AMDSMI_RAS_ERR_STATE_LAST == AMDSMI_RAS_ERR_STATE_ENABLED,
|
||||
"kErrStateNameMap needs to be updated");
|
||||
|
||||
static const struct option long_options[] = {
|
||||
@@ -207,25 +207,35 @@ uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char* GetBlockNameStr(rsmi_gpu_block_t id) { return kBlockNameMap.at(id); }
|
||||
const char* GetErrStateNameStr(rsmi_ras_err_state_t st) { return kErrStateNameMap.at(st); }
|
||||
const char* GetBlockNameStr(amdsmi_gpu_block_t id) { return kBlockNameMap.at(id); }
|
||||
const char* GetErrStateNameStr(amdsmi_ras_err_state_t st) { return kErrStateNameMap.at(st); }
|
||||
/*const char *GetGRPCChanStateStr(grpc_connectivity_state st) {
|
||||
return kGRPCChanState.at(st);
|
||||
}*/
|
||||
|
||||
const char* FreqEnumToStr(rsmi_clk_type rsmi_clk) {
|
||||
static_assert(RSMI_CLK_TYPE_LAST == RSMI_CLK_TYPE_MEM, "FreqEnumToStr() needs to be updated");
|
||||
const char* FreqEnumToStr(amdsmi_clk_type_t rsmi_clk) {
|
||||
static_assert(CLK_TYPE__MAX == CLK_TYPE_DCLK1, "FreqEnumToStr() needs to be updated");
|
||||
switch (rsmi_clk) {
|
||||
case RSMI_CLK_TYPE_SYS:
|
||||
case CLK_TYPE_SYS:
|
||||
return "System clock";
|
||||
case RSMI_CLK_TYPE_DF:
|
||||
case CLK_TYPE_DF:
|
||||
return "Data Fabric clock";
|
||||
case RSMI_CLK_TYPE_DCEF:
|
||||
case CLK_TYPE_DCEF:
|
||||
return "Display Controller Engine clock";
|
||||
case RSMI_CLK_TYPE_SOC:
|
||||
case CLK_TYPE_SOC:
|
||||
return "SOC clock";
|
||||
case RSMI_CLK_TYPE_MEM:
|
||||
case CLK_TYPE_MEM:
|
||||
return "Memory clock";
|
||||
case CLK_TYPE_PCIE:
|
||||
return "PCIe clock";
|
||||
case CLK_TYPE_VCLK0:
|
||||
return "VCLK0 clock";
|
||||
case CLK_TYPE_VCLK1:
|
||||
return "VCLK1 clock";
|
||||
case CLK_TYPE_DCLK0:
|
||||
return "DCLK0 clock";
|
||||
case CLK_TYPE_DCLK1:
|
||||
return "DCLK1 clock";
|
||||
default:
|
||||
return "Invalid Clock ID";
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ THE SOFTWARE.
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "amd_smi/amdsmi.h"
|
||||
|
||||
struct RDCTstGlobals {
|
||||
uint32_t verbosity;
|
||||
@@ -45,10 +45,10 @@ struct RDCTstGlobals {
|
||||
uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list);
|
||||
|
||||
void PrintTestHeader(uint32_t dv_ind);
|
||||
const char* GetBlockNameStr(rsmi_gpu_block_t id);
|
||||
const char* GetErrStateNameStr(rsmi_ras_err_state_t st);
|
||||
const char* GetBlockNameStr(amdsmi_gpu_block_t id);
|
||||
const char* GetErrStateNameStr(amdsmi_ras_err_state_t st);
|
||||
// const char *GetGRPCChanStateStr(grpc_connectivity_state st);
|
||||
const char* FreqEnumToStr(rsmi_clk_type rsmi_clk);
|
||||
const char* FreqEnumToStr(amdsmi_clk_type_t rsmi_clk);
|
||||
|
||||
#if ENABLE_SMI
|
||||
void DumpMonitorInfo(const TestBase* test);
|
||||
|
||||
@@ -47,30 +47,88 @@
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "amd_smi/amdsmi.h"
|
||||
|
||||
static const std::map<rsmi_fw_block_t, const char*> kDevFWNameMap = {
|
||||
{RSMI_FW_BLOCK_ASD, "asd"},
|
||||
{RSMI_FW_BLOCK_CE, "ce"},
|
||||
{RSMI_FW_BLOCK_DMCU, "dmcu"},
|
||||
{RSMI_FW_BLOCK_MC, "mc"},
|
||||
{RSMI_FW_BLOCK_ME, "me"},
|
||||
{RSMI_FW_BLOCK_MEC, "mec"},
|
||||
{RSMI_FW_BLOCK_MEC2, "mec2"},
|
||||
{RSMI_FW_BLOCK_PFP, "pfp"},
|
||||
{RSMI_FW_BLOCK_RLC, "rlc"},
|
||||
{RSMI_FW_BLOCK_RLC_SRLC, "rlc_srlc"},
|
||||
{RSMI_FW_BLOCK_RLC_SRLG, "rlc_srlg"},
|
||||
{RSMI_FW_BLOCK_RLC_SRLS, "rlc_srls"},
|
||||
{RSMI_FW_BLOCK_SDMA, "sdma"},
|
||||
{RSMI_FW_BLOCK_SDMA2, "sdma2"},
|
||||
{RSMI_FW_BLOCK_SMC, "smc"},
|
||||
{RSMI_FW_BLOCK_SOS, "sos"},
|
||||
{RSMI_FW_BLOCK_TA_RAS, "ta_ras"},
|
||||
{RSMI_FW_BLOCK_TA_XGMI, "ta_xgmi"},
|
||||
{RSMI_FW_BLOCK_UVD, "uvd"},
|
||||
{RSMI_FW_BLOCK_VCE, "vce"},
|
||||
{RSMI_FW_BLOCK_VCN, "vcn"},
|
||||
static const std::map<amdsmi_fw_block_t, const char*> kDevFWNameMap = {
|
||||
{FW_ID_SMU, "SMU"},
|
||||
{FW_ID_FIRST, "FIRST"},
|
||||
{FW_ID_CP_CE, "CP_CE"},
|
||||
{FW_ID_CP_PFP, "CP_PFP"},
|
||||
{FW_ID_CP_ME, "CP_ME"},
|
||||
{FW_ID_CP_MEC_JT1, "CP_MEC_JT1"},
|
||||
{FW_ID_CP_MEC_JT2, "CP_MEC_JT2"},
|
||||
{FW_ID_CP_MEC1, "CP_MEC1"},
|
||||
{FW_ID_CP_MEC2, "CP_MEC2"},
|
||||
{FW_ID_RLC, "RLC"},
|
||||
{FW_ID_SDMA0, "SDMA0"},
|
||||
{FW_ID_SDMA1, "SDMA1"},
|
||||
{FW_ID_SDMA2, "SDMA2"},
|
||||
{FW_ID_SDMA3, "SDMA3"},
|
||||
{FW_ID_SDMA4, "SDMA4"},
|
||||
{FW_ID_SDMA5, "SDMA5"},
|
||||
{FW_ID_SDMA6, "SDMA6"},
|
||||
{FW_ID_SDMA7, "SDMA7"},
|
||||
{FW_ID_VCN, "VCN"},
|
||||
{FW_ID_UVD, "UVD"},
|
||||
{FW_ID_VCE, "VCE"},
|
||||
{FW_ID_ISP, "ISP"},
|
||||
{FW_ID_DMCU_ERAM, "DMCU_ERAM"},
|
||||
{FW_ID_DMCU_ISR, "DMCU_ISR"},
|
||||
{FW_ID_RLC_RESTORE_LIST_GPM_MEM, "RLC_RESTORE_LIST_GPM_MEM"},
|
||||
{FW_ID_RLC_RESTORE_LIST_SRM_MEM, "RLC_RESTORE_LIST_SRM_MEM"},
|
||||
{FW_ID_RLC_RESTORE_LIST_CNTL, "RLC_RESTORE_LIST_CNTL"},
|
||||
{FW_ID_RLC_V, "RLC_V"},
|
||||
{FW_ID_MMSCH, "MMSCH"},
|
||||
{FW_ID_PSP_SYSDRV, "PSP_SYSDRV"},
|
||||
{FW_ID_PSP_SOSDRV, "PSP_SOSDRV"},
|
||||
{FW_ID_PSP_TOC, "PSP_TOC"},
|
||||
{FW_ID_PSP_KEYDB, "PSP_KEYDB"},
|
||||
{FW_ID_DFC, "DFC"},
|
||||
{FW_ID_PSP_SPL, "PSP_SPL"},
|
||||
{FW_ID_DRV_CAP, "DRV_CAP"},
|
||||
{FW_ID_MC, "MC"},
|
||||
{FW_ID_PSP_BL, "PSP_BL"},
|
||||
{FW_ID_CP_PM4, "CP_PM4"},
|
||||
{FW_ID_RLC_P, "RLC_P"},
|
||||
{FW_ID_SEC_POLICY_STAGE2, "SEC_POLICY_STAGE2"},
|
||||
{FW_ID_REG_ACCESS_WHITELIST, "REG_ACCESS_WHITELIST"},
|
||||
{FW_ID_IMU_DRAM, "IMU_DRAM"},
|
||||
{FW_ID_IMU_IRAM, "IMU_IRAM"},
|
||||
{FW_ID_SDMA_TH0, "SDMA_TH0"},
|
||||
{FW_ID_SDMA_TH1, "SDMA_TH1"},
|
||||
{FW_ID_CP_MES, "CP_MES"},
|
||||
{FW_ID_MES_KIQ, "MES_KIQ"},
|
||||
{FW_ID_MES_STACK, "MES_STACK"},
|
||||
{FW_ID_MES_THREAD1, "MES_THREAD1"},
|
||||
{FW_ID_MES_THREAD1_STACK, "MES_THREAD1_STACK"},
|
||||
{FW_ID_RLX6, "RLX6"},
|
||||
{FW_ID_RLX6_DRAM_BOOT, "RLX6_DRAM_BOOT"},
|
||||
{FW_ID_RS64_ME, "RS64_ME"},
|
||||
{FW_ID_RS64_ME_P0_DATA, "RS64_ME_P0_DATA"},
|
||||
{FW_ID_RS64_ME_P1_DATA, "RS64_ME_P1_DATA"},
|
||||
{FW_ID_RS64_PFP, "RS64_PFP"},
|
||||
{FW_ID_RS64_PFP_P0_DATA, "RS64_PFP_P0_DATA"},
|
||||
{FW_ID_RS64_PFP_P1_DATA, "RS64_PFP_P1_DATA"},
|
||||
{FW_ID_RS64_MEC, "RS64_MEC"},
|
||||
{FW_ID_RS64_MEC_P0_DATA, "RS64_MEC_P0_DATA"},
|
||||
{FW_ID_RS64_MEC_P1_DATA, "RS64_MEC_P1_DATA"},
|
||||
{FW_ID_RS64_MEC_P2_DATA, "RS64_MEC_P2_DATA"},
|
||||
{FW_ID_RS64_MEC_P3_DATA, "RS64_MEC_P3_DATA"},
|
||||
{FW_ID_PPTABLE, "PPTABLE"},
|
||||
{FW_ID_PSP_SOC, "PSP_SOC"},
|
||||
{FW_ID_PSP_DBG, "PSP_DBG"},
|
||||
{FW_ID_PSP_INTF, "PSP_INTF"},
|
||||
{FW_ID_RLX6_CORE1, "RLX6_CORE1"},
|
||||
{FW_ID_RLX6_DRAM_BOOT_CORE1, "RLX6_DRAM_BOOT_CORE1"},
|
||||
{FW_ID_RLCV_LX7, "RLCV_LX7"},
|
||||
{FW_ID_RLC_SAVE_RESTORE_LIST, "RLC_SAVE_RESTORE_LIST"},
|
||||
{FW_ID_ASD, "ASD"},
|
||||
{FW_ID_TA_RAS, "TA_RAS"},
|
||||
{FW_ID_TA_XGMI, "TA_XGMI"},
|
||||
{FW_ID_RLC_SRLG, "RLC_SRLG"},
|
||||
{FW_ID_RLC_SRLS, "RLC_SRLS"},
|
||||
{FW_ID_PM, "PM"},
|
||||
{FW_ID_DMCU, "DMCU"},
|
||||
};
|
||||
|
||||
const char* NameFromFWEnum(rsmi_fw_block_t blk) { return kDevFWNameMap.at(blk); }
|
||||
const char* NameFromFWEnum(amdsmi_fw_block_t blk) { return kDevFWNameMap.at(blk); }
|
||||
|
||||
@@ -46,8 +46,8 @@
|
||||
#ifndef TESTS_RDC_TESTS_TEST_UTILS_H_
|
||||
#define TESTS_RDC_TESTS_TEST_UTILS_H_
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "amd_smi/amdsmi.h"
|
||||
|
||||
const char* NameFromFWEnum(rsmi_fw_block_t blk);
|
||||
const char* NameFromFWEnum(amdsmi_fw_block_t blk);
|
||||
|
||||
#endif // TESTS_RDC_TESTS_TEST_UTILS_H_
|
||||
|
||||
Verwijs in nieuw issue
Block a user