SWDEV-439576 - rocmsmi -> amdsmi

- Migrate to amdsmi library
- NOTE: raslib still uses rocmsmi
- Remove unused rocmsmi service
- Remove unused RDC client code
- Remove RSMI calls from protos/rdc.proto

Change-Id: Ifc34a264c506b0ec5792307ee56b34526268762d
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: 9702d0f2d7]
This commit is contained in:
Galantsev, Dmitrii
2024-01-22 18:56:42 -06:00
bovenliggende d1b8e1b484
commit 028355dff0
53 gewijzigde bestanden met toevoegingen van 774 en 2340 verwijderingen
+9 -10
Bestand weergeven
@@ -151,16 +151,16 @@ if(NOT EXISTS "${CMAKE_SOURCE_DIR}/raslib/.git" AND BUILD_RASLIB)
If you do not want to build raslib, use cmake -DBUILD_RASLIB=off")
endif()
find_package(RSMI
NAMES rocm_smi
find_package(SMI
NAMES amd_smi
HINTS ${ROCM_DIR}/lib/cmake
CONFIGURE REQUIRED)
set(RSMI_INC_DIR "${ROCM_SMI_INCLUDE_DIR}" CACHE INTERNAL "ROCm SMI include directory.")
set(RSMI_LIB_DIR "${ROCM_SMI_LIB_DIR}" CACHE INTERNAL "ROCm SMI library directory.")
set(SMI_INC_DIR "${AMD_SMI_INCLUDE_DIR}" CACHE INTERNAL "AMD SMI include directory.")
set(SMI_LIB_DIR "${AMD_SMI_LIB_DIR}" CACHE INTERNAL "AMD SMI library directory.")
if(NOT EXISTS "${RSMI_INC_DIR}" OR NOT EXISTS "${RSMI_LIB_DIR}")
message(FATAL_ERROR "rocm_smi not found in ${RSMI_INC_DIR}. Please
make sure rocm_smi is installed and present in ${RSMI_INC_DIR}.")
if(NOT EXISTS "${SMI_INC_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}")
message(FATAL_ERROR "amd_smi not found in ${SMI_INC_DIR}. Please
make sure amd_smi is installed and present in ${SMI_INC_DIR}.")
endif()
if(BUILD_RASLIB AND NOT DEFINED HSA_DIR)
@@ -301,7 +301,6 @@ if(BUILD_STANDALONE)
unset(OLD_CMAKE_INSTALL_MESSAGE)
add_subdirectory("server")
add_subdirectory("client")
add_subdirectory("rdci")
if(BUILD_TESTS)
@@ -458,7 +457,7 @@ set(CPACK_DEBIAN_RUNTIME_PACKAGE_CONTROL_EXTRA
option(ROCM_DEP_ROCMCORE "Add debian dependency on rocm-core" OFF)
mark_as_advanced(ROCM_DEP_ROCMCORE)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-smi-lib, libc6")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "amd-smi-lib, libc6")
if(ROCM_DEP_ROCMCORE)
string(APPEND CPACK_DEBIAN_PACKAGE_DEPENDS ", rocm-core")
endif()
@@ -485,7 +484,7 @@ endif()
set(CPACK_RPM_PACKAGE_AUTOREQ 0)
set(CPACK_RPM_PACKAGE_AUTOPROV 0)
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-smi-lib")
set(CPACK_RPM_PACKAGE_REQUIRES "amd-smi-lib")
# rdc-tests need rdc
set(CPACK_RPM_TESTS_PACKAGE_REQUIRES "${CPACK_PACKAGE_NAME}")
list(APPEND CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/lib"
+1 -1
Bestand weergeven
@@ -30,7 +30,7 @@ RDC can run on AMD ROCm supported platforms, please refer to the [List of Suppor
* It is recommended to install the complete AMD ROCm platform.
For installation instruction see https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html
* At the minimum, these two components are required
(i) AMD ROCm SMI Library (https://github.com/ROCm/rocm_smi_lib)
(i) AMDSMI Library (https://github.com/ROCm/amdsmi)
(ii) AMD ROCk Kernel driver (https://github.com/ROCm/ROCK-Kernel-Driver)
## Building gRPC and protoc
-126
Bestand weergeven
@@ -1,126 +0,0 @@
# Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Build script for the RDC client library (${CLIENT_LIB} = rdc_client_smi).
#
# Steps, in order:
#   1. print a banner and the build configuration,
#   2. derive the shared-object version from git tags (get_version_from_tag
#      comes from the "utils" module included below),
#   3. build the shared library from the client sources, the generated
#      protobuf sources and common/rdc_utils.cc,
#   4. strip the library for Release builds,
#   5. install the library and the authentication directory.
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Cmake Client Lib ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")

## Compiler flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -m64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2")
# Use this instead of above for 32 bit
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
else()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG")
endif()

# Required Defines first:
message("")
message("Build Configuration:")
message("-------------BuildType: " ${CMAKE_BUILD_TYPE})
message("--------------Compiler: " ${CMAKE_CXX_COMPILER})
message("---------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
message("----------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("----------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("----------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("----------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("----------RSMI Lib Dir: " ${RSMI_LIB_DIR})
message("----------RSMI Inc Dir: " ${RSMI_INC_DIR})
message("---------GRPC Root Dir: " ${GRPC_ROOT})
message("")

## Include common cmake modules
include(utils)

set(CLIENT_LIB "rdc_client_smi")
set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(INC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include/rdc")

################# Determine the library version #########################
## Setup the SO version based on git tags.
set(SO_VERSION_GIT_TAG_PREFIX "rdc_so_ver")

# provide git to utilities
find_program(GIT NAMES git)

# Debian package specific variables
# Set a default value for the package version
get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)

# VERSION_* variables should be set by get_version_from_tag
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
message("SOVERSION: ${SO_VERSION_STRING}")
set(CPACK_PACKAGE_FILE_NAME "${RDC_PACKAGE}-${VERSION_STRING}")

# TODO delete these if not used
file(GLOB PROTOBUF_GENERATED_INCLUDES "${PROTOB_OUT_DIR}/*.h")
file(GLOB PROTOBUF_GENERATED_SRCS "${PROTOB_OUT_DIR}/*.cc")

set(CLIENT_LIB_SRC_LIST "${SRC_DIR}/rdc_client.cc"
                        "${SRC_DIR}/rdc_client_main.cc"
                        "${SRC_DIR}/rdc_client_utils.cc"
                        "${PROTOBUF_GENERATED_SRCS}"
                        "${COMMON_DIR}/rdc_utils.cc")
message("CLIENT_LIB_SRC_LIST=${CLIENT_LIB_SRC_LIST}")

set(CLIENT_LIB_INC_LIST "${INC_DIR}/rdc_client.h"
                        "${INC_DIR}/rdc_exception.h"
                        "${INC_DIR}/rdc_client_main.h"
                        "${COMMON_DIR}/rdc_utils.h")

add_library(${CLIENT_LIB} SHARED ${CLIENT_LIB_SRC_LIST} ${CLIENT_LIB_INC_LIST})
# NOTE(review): the keyword-less signature is kept on purpose so the link
# interface seen by consumers does not change; consider PRIVATE/PUBLIC.
target_link_libraries(${CLIENT_LIB} pthread rt gRPC::grpc++ dl)
target_include_directories(${CLIENT_LIB} PRIVATE
                           "${PROJECT_SOURCE_DIR}"
                           "${PROJECT_SOURCE_DIR}/include"
                           "${CMAKE_CURRENT_SOURCE_DIR}/include"
                           "${PROTOB_OUT_DIR}"
                           "${RSMI_INC_DIR}")

# TODO: set the properties for the library once we have one
## Set the VERSION and SOVERSION values
set_property(TARGET ${CLIENT_LIB} PROPERTY
             SOVERSION "${VERSION_MAJOR}")
set_property(TARGET ${CLIENT_LIB} PROPERTY
             VERSION "${SO_VERSION_STRING}")

## If the library is a release, strip the target library
if("${CMAKE_BUILD_TYPE}" STREQUAL Release)
  # $<TARGET_FILE:...> expands to the full path of the file the linker
  # produced. The previous bare "lib${CLIENT_LIB}.so" was resolved relative to
  # the command's working directory and, with VERSION/SOVERSION set, names the
  # unversioned link rather than the library file itself.
  add_custom_command(
    TARGET ${CLIENT_LIB}
    POST_BUILD
    COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${CLIENT_LIB}>
    VERBATIM)
endif()

## Add the install directives for the runtime library.
install(TARGETS ${CLIENT_LIB}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${RDC}
        COMPONENT ${CLIENT_COMPONENT})
install(DIRECTORY ${PROJECT_SOURCE_DIR}/authentication
        DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${RDC}
        COMPONENT ${CLIENT_COMPONENT})

message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Finished Cmake Client Lib ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
@@ -1,382 +0,0 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
#include <grpcpp/grpcpp.h>
#include <memory>
#include <string>
#include "rocm_smi/rocm_smi.h"
/**
 * @brief Error codes returned by rdc functions
 *
 * The values fall into three numeric ranges:
 *  - 0 and up: ::RDC_STATUS_SUCCESS followed by the RDC_RSMI_STATUS_* codes
 *    (NOTE(review): presumably these mirror rsmi_status_t values reported by
 *    the server -- confirm against rocm_smi.h),
 *  - ::RDC_STATUS_GRPC_ERR_FIRST (1000) and up: gRPC transport status codes,
 *  - ::RDC_STATUS_CLIENT_ERR_FIRST (2000) and up: client-side errors.
 */
typedef enum {
RDC_STATUS_SUCCESS = 0x0, //!< Operation was successful
RDC_RSMI_STATUS_INVALID_ARGS, //!< Passed in arguments are not valid
RDC_RSMI_STATUS_NOT_SUPPORTED, //!< The requested information or
//!< action is not available for the
//!< given input, on the given system
RDC_RSMI_STATUS_FILE_ERROR, //!< Problem accessing a file. This
//!< may be because the operation is not
//!< supported by the Linux kernel
//!< version running on the executing
//!< machine
RDC_RSMI_STATUS_PERMISSION, //!< Permission denied/EACCESS file
//!< error. Many functions require
//!< root access to run.
RDC_RSMI_STATUS_OUT_OF_RESOURCES, //!< Unable to acquire memory or other
//!< resource
RDC_RSMI_STATUS_INTERNAL_EXCEPTION, //!< An internal exception was caught
RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS, //!< The provided input is out of
//!< allowable or safe range
RDC_RSMI_STATUS_INIT_ERROR, //!< An error occurred when creating
//!< a communications channel
RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED, //!< The requested function has not
//!< yet been implemented in the
//!< current system for the current
//!< devices
RDC_RSMI_STATUS_NOT_FOUND, //!< An item was searched for but not
//!< found
RDC_RSMI_STATUS_INSUFFICIENT_SIZE, //!< Not enough resources were
//!< available for the operation
RDC_RSMI_STATUS_INTERRUPT, //!< An interrupt occurred during
//!< execution of function
RDC_RSMI_STATUS_UNEXPECTED_SIZE, //!< An unexpected amount of data
//!< was read
RDC_RSMI_STATUS_NO_DATA, //!< No data was found for a given
//!< input
RDC_RSMI_STATUS_UNKNOWN_ERROR, //!< An unknown error occurred
/// First value of the gRPC status range.
RDC_STATUS_GRPC_ERR_FIRST = 1000,
/// Not an error; returned on success.
RDC_STATUS_GRPC_OK = RDC_STATUS_GRPC_ERR_FIRST,
/// The operation was cancelled (typically by the caller).
RDC_STATUS_GRPC_CANCELLED,
/// Unknown error. An example of where this error may be returned is if a
/// Status value received from another address space belongs to an error-space
/// that is not known in this address space. Also errors raised by APIs that
/// do not return enough error information may be converted to this error.
RDC_STATUS_GRPC_UNKNOWN,
/// Client specified an invalid argument. Note that this differs from
/// FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are
/// problematic regardless of the state of the system (e.g., a malformed file
/// name).
RDC_STATUS_GRPC_INVALID_ARG,
/// Deadline expired before operation could complete. For operations that
/// change the state of the system, this error may be returned even if the
/// operation has completed successfully. For example, a successful response
/// from a server could have been delayed long enough for the deadline to
/// expire.
RDC_STATUS_GRPC_DEADLINE_EXCEEDED,
/// Some requested entity (e.g., file or directory) was not found.
RDC_STATUS_GRPC_NOT_FOUND,
/// Some entity that we attempted to create (e.g., file or directory) already
/// exists.
RDC_STATUS_GRPC_ALREADY_EXISTS,
/// The caller does not have permission to execute the specified operation.
/// PERMISSION_DENIED must not be used for rejections caused by exhausting
/// some resource (use RESOURCE_EXHAUSTED instead for those errors).
/// PERMISSION_DENIED must not be used if the caller can not be identified
/// (use UNAUTHENTICATED instead for those errors).
RDC_STATUS_GRPC_PERM_DENIED,
/// The request does not have valid authentication credentials for the
/// operation.
RDC_STATUS_GRPC_UNAUTHENTICATED,
/// Some resource has been exhausted, perhaps a per-user quota, or perhaps the
/// entire file system is out of space.
RDC_STATUS_GRPC_RESOURCE_EXHAUSTED,
/// Operation was rejected because the system is not in a state required for
/// the operation's execution. For example, directory to be deleted may be
/// non-empty, an rmdir operation is applied to a non-directory, etc.
///
/// A litmus test that may help a service implementor in deciding
/// between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:
/// (a) Use UNAVAILABLE if the client can retry just the failing call.
/// (b) Use ABORTED if the client should retry at a higher-level
/// (e.g., restarting a read-modify-write sequence).
/// (c) Use FAILED_PRECONDITION if the client should not retry until
/// the system state has been explicitly fixed. E.g., if an "rmdir"
/// fails because the directory is non-empty, FAILED_PRECONDITION
/// should be returned since the client should not retry unless
/// they have first fixed up the directory by deleting files from it.
/// (d) Use FAILED_PRECONDITION if the client performs conditional
/// REST Get/Update/Delete on a resource and the resource on the
/// server does not match the condition. E.g., conflicting
/// read-modify-write on the same resource.
RDC_STATUS_GRPC_FAILED_PRECOND,
/// The operation was aborted, typically due to a concurrency issue like
/// sequencer check failures, transaction aborts, etc.
///
/// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED,
/// and UNAVAILABLE.
RDC_STATUS_GRPC_ABORTED,
/// Operation was attempted past the valid range. E.g., seeking or reading
/// past end of file.
///
/// Unlike INVALID_ARGUMENT, this error indicates a problem that may be fixed
/// if the system state changes. For example, a 32-bit file system will
/// generate INVALID_ARGUMENT if asked to read at an offset that is not in the
/// range [0,2^32-1], but it will generate OUT_OF_RANGE if asked to read from
/// an offset past the current file size.
///
/// There is a fair bit of overlap between FAILED_PRECONDITION and
/// OUT_OF_RANGE. We recommend using OUT_OF_RANGE (the more specific error)
/// when it applies so that callers who are iterating through a space can
/// easily look for an OUT_OF_RANGE error to detect when they are done.
RDC_STATUS_GRPC_OUT_OF_RANGE,
/// Operation is not implemented or not supported/enabled in this service.
RDC_STATUS_GRPC_UNIMPLEMENTED,
/// Internal errors. Means some invariants expected by underlying System has
/// been broken. If you see one of these errors, Something is very broken.
RDC_STATUS_GRPC_INTERNAL,
/// The service is currently unavailable. This is most likely a transient
/// condition and may be corrected by retrying with a backoff.
///
/// \warning Although data MIGHT not have been transmitted when this
/// status occurs, there is NOT A GUARANTEE that the server has not seen
/// anything. So in general it is unsafe to retry on this status code
/// if the call is non-idempotent.
///
/// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED,
/// and UNAVAILABLE.
RDC_STATUS_GRPC_UNAVAILABLE,
/// Unrecoverable data loss or corruption.
RDC_STATUS_GRPC_DATA_LOSS,
/// First value of the client-side error range.
RDC_STATUS_CLIENT_ERR_FIRST = 2000,
/// SSL authentication error occurred.
RDC_STATUS_CLIENT_ERR_SSL = RDC_STATUS_CLIENT_ERR_FIRST,
RDC_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
} rdc_status_t;
/**
 * @brief Handle to RDC server channel
 *
 * Callers obtain a handle from rdc_channel_create() and release it with
 * rdc_channel_destroy(); the value itself should be treated as opaque.
 */
typedef uintptr_t rdc_channel_t;
/// Port used by rdc_channel_create() when nullptr is passed for its port.
#define RDC_DEFAULT_SERVER_PORT 50051
/// Server address used by rdc_channel_create() when nullptr is passed for its ip.
#define RDC_DEFAULT_SERVER_IP "localhost"
/*****************************************************************************/
/** @defgroup RDCAdmin RDC Administration Functions
* These administrative functions are used to monitor and control, for
* example RDC connectivity.
* @{
*/
/**
 * @brief Check the connection status of a channel
 *
 * @details Given an ::rdc_channel_t @p channel and a boolean @p
 * try_to_connect, this function will return the grpc_connectivity_state for
 * that channel
 *
 * @param[in] channel The channel for which the status will be given
 *
 * @param[in] try_to_connect If the channel is currently IDLE and this argument
 * is true, transition to CONNECTING.
 *
 * @param[inout] state A pointer to caller provided memory to which
 * the grpc_connectivity_state will be written. grpc_connectivity_state has
 * the following possible values:
 * GRPC_CHANNEL_IDLE channel is idle
 * GRPC_CHANNEL_CONNECTING channel is connecting
 * GRPC_CHANNEL_READY channel is ready for work
 * GRPC_CHANNEL_TRANSIENT_FAILURE channel has seen a failure but expects to
 * recover
 * GRPC_CHANNEL_SHUTDOWN channel has seen a failure that it cannot
 * recover from
 *
 * @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
 *
 */
rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
grpc_connectivity_state* state);
/**
 * @brief Verify a channel's connection to the server
 *
 * @details Given an ::rdc_channel_t @p channel, this function will send a
 * random number to the server associated with @p channel. The server will send
 * the number back. Upon receiving the returned message from the server, the
 * number sent to the server is compared to the number received from the
 * server. If the 2 numbers are the same, the connection is verified.
 * Otherwise, an appropriate error code is returned.
 *
 * @param[in] channel The channel for which the connection will be verified
 *
 * @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
 * @retval ::RDC_STATUS_GRPC_DATA_LOSS is returned when the echoed number
 * differs from the number that was sent.
 *
 */
rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel);
/** @} */ // end of RDCAdmin
/*****************************************************************************/
/** @defgroup InitShutAdmin Initialization and Shutdown
* These functions are used for initialization of RDC and clean up when
* done.
* @{
*/
/**
 * @brief Create a communications channel to an RDC server
 *
 * @details Given a pointer to an ::rdc_channel_t @p channel, a string
 * containing the ip address of the server @p ip, a string containing
 * the port number on which the server is listening @p port and a bool
 * indicating whether the channel should use a secure link @p secure,
 * this function will attempt to create a new channel and write its
 * handle to the address pointed to by @p channel.
 *
 * @param[inout] channel A pointer to caller provided memory to which an
 * ::rdc_channel_t will be written
 *
 * @param[in] ip A pointer to a string containing the address of the server.
 * If nullptr is passed for this parameter, RDC_DEFAULT_SERVER_IP will be used.
 *
 * @param[in] port A pointer to a string containing the port on which the
 * RDC server is listening. If nullptr is passed for this parameter,
 * RDC_DEFAULT_SERVER_PORT will be used.
 *
 * @param[in] secure A bool indicating whether SSL should be used for
 * communications (not currently supported)
 *
 * @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
 *
 */
rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port,
bool secure);
/**
 * @brief Destroy a communications channel to an RDC server
 *
 * @details Given an ::rdc_channel_t @p channel, this function will free any
 * resources used by @p channel. The handle must not be used afterwards.
 *
 * @param[in] channel The ::rdc_channel_t to be freed
 *
 * @retval ::RDC_STATUS_SUCCESS is returned upon successful call.
 *
 */
rdc_status_t rdc_channel_destroy(rdc_channel_t channel);
/** @} */ // end of InitShutAdmin
/*****************************************************************************/
/** @defgroup RSMIAccess Remote ROCm SMI Calls
* These functions calls make ROCm SMI function calls on the remote server.
* Please refer to the
* [ROCm SMI documentation]
* (https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/docs) for
* information about the calls. Here, we will document any additional aspects
* of the calls introduced by RDC that are not covered in the ROCm SMI
* documentation.
*
* All of the functions in this section attempt to make an RSMI call on the
* server machine, given an ::rdc_channel_t associated with the server, and
* all the arguments that are required to make the RSMI call.
* @{
*/
/**
 * @brief Remote call to rsmi_num_monitor_devices()
 *
 * @param[in] channel The channel associated with the server to query
 *
 * @param[inout] num_gpu A pointer to caller provided memory to which the
 * value reported by the server will be written
 *
 */
rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu);
/** @} */ // end of RSMIAccess
/** @defgroup PhysQuer Physical State Queries
* These functions provide information about the physical characteristics of
* the device.
* @{
*/
/**
 * @brief Remote call to rsmi_dev_temp_metric_get()
 *
 * @param[in] channel The channel associated with the server to query
 * @param[in] dv_ind Device index forwarded to the server
 * @param[in] sensor_type Sensor type forwarded to the server
 * @param[in] metric Temperature metric forwarded to the server
 * @param[inout] temperature A pointer to caller provided memory to which the
 * temperature reported by the server will be written
 *
 */
rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type,
rsmi_temperature_metric_t metric, int64_t* temperature);
/**
 * @brief Remote call to rsmi_dev_fan_rpms_get()
 *
 * NOTE(review): presumably @p rpms receives the value reported by the server,
 * as with rdc_dev_temp_metric_get() -- confirm against the implementation.
 *
 */
rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* rpms);
/**
 * @brief Remote call to rsmi_dev_fan_speed_get()
 *
 * NOTE(review): presumably @p speed receives the value reported by the
 * server -- confirm against the implementation.
 *
 */
rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
int64_t* speed);
/**
 * @brief Remote call to rsmi_dev_fan_speed_max_get()
 *
 * NOTE(review): presumably @p max_speed receives the value reported by the
 * server -- confirm against the implementation.
 *
 */
rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
uint64_t* max_speed);
/** @} */ // end of PhysQuer
/**
 * @brief Get a description of a provided RDC error status
 *
 * @details Set the provided pointer to a const char *, @p status_string, to
 * a string containing a description of the provided error code @p status.
 *
 * @param[in] status The error status for which a description is desired
 *
 * @param[inout] status_string A pointer to a const char * which will be made
 * to point to a description of the provided error code
 *
 * @retval ::RDC_STATUS_SUCCESS is returned upon successful call
 *
 */
rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string);
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_H_
@@ -1,69 +0,0 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
#include <grpcpp/grpcpp.h>
#include <memory>
#include <string>
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {

/// Client-side state for one connection to an RDC server: the address it was
/// constructed with, the gRPC channel and credentials, and the service stubs
/// used to issue remote calls.
class RDCChannel {
 public:
  /// Store the server address, port and secure flag for this channel.
  /// NOTE(review): the constructor body is defined elsewhere; presumably the
  /// connection itself is set up by Initialize() -- confirm.
  explicit RDCChannel(std::string server_ip, std::string server_port, bool secure_channel);
  ~RDCChannel();

  /// Prepare the channel for use. Callers treat any value other than
  /// RDC_STATUS_SUCCESS as failure.
  rdc_status_t Initialize(void);

  // Getters and Setters
  // Don't have setter for server ip and ports; we don't want to change those
  // after construction
  std::string server_ip(void) const { return server_ip_; }
  std::string server_port(void) const { return server_port_; }
  bool secure_channel(void) const { return secure_channel_; }
  std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const { return rsmi_stub_; }
  std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const { return rdc_admin_stub_; }
  // const-qualified like every other getter so it can be called on a const
  // RDCChannel; the top-level const on the returned value was dropped because
  // it has no effect for callers.
  std::shared_ptr<grpc::Channel> channel(void) const { return channel_; }

 private:
  std::string server_ip_;
  std::string server_port_;
  bool secure_channel_;
  std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub_;
  std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub_;
  std::shared_ptr<grpc::Channel> channel_;
  std::shared_ptr<grpc::ChannelCredentials> channel_creds_;
};

}  // namespace rdc
}  // namespace amd
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_
@@ -1,34 +0,0 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
#define CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {
/// Map a gRPC status code to an rdc_status_t. The client wrappers return its
/// result when a remote call's grpc::Status is not ok().
/// NOTE(review): presumably the result lies in the RDC_STATUS_GRPC_* range --
/// confirm against the implementation.
rdc_status_t GrpcErrorToRdcError(::grpc::StatusCode grpc_err);
} // namespace rdc
} // namespace amd
#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_
@@ -1,50 +0,0 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_
#define CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_
#include <exception>
#include <string>
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {
/// @brief Exception type which carries an error code to return to the user.
class rdc_exception : public std::exception {
public:
rdc_exception(rdc_status_t error, const std::string description)
: err_(error), desc_(description) {}
rdc_status_t error_code() const noexcept { return err_; }
const char* what() const noexcept override { return desc_.c_str(); }
private:
rdc_status_t err_;
std::string desc_;
};
} // namespace rdc
} // namespace amd
#endif // CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_
@@ -1,547 +0,0 @@
/*
Copyright (c) 2019 - Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc_client.h"
#include <grpcpp/grpcpp.h>
#include <time.h>
#include <unistd.h>
#include <iostream>
#include <memory>
#include "common/rdc_utils.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client_main.h"
#include "rdc/rdc_client_utils.h"
#include "rdc/rdc_exception.h"
#include "rocm_smi/rocm_smi.h"
// Return RDC_RSMI_STATUS_INVALID_ARGS from the enclosing function when PTR is
// null. Expands to a bare if-statement and is used without a trailing
// semicolon at the call sites.
#define CHK_PTR_ARG(PTR) \
if ((PTR) == nullptr) { \
return RDC_RSMI_STATUS_INVALID_ARGS; \
}
// Declare a local `ch` (amd::rdc::RDCChannel*) obtained from the integer
// handle UPTR, and return RDC_STATUS_GRPC_INVALID_ARG from the enclosing
// function when it is null. Because it introduces `ch` into the caller's
// scope it can be used only once per scope.
#define UINTPTR_TO_RDC_CHAN(UPTR) \
amd::rdc::RDCChannel* ch = reinterpret_cast<amd::rdc::RDCChannel*>(UPTR); \
if (ch == nullptr) { \
return RDC_STATUS_GRPC_INVALID_ARG; \
}
// Translate the exception currently being handled into an rdc_status_t.
//
// Must be called from inside a catch handler: it re-throws the active
// exception with a bare `throw;` and dispatches on its type.
//   - std::bad_alloc          -> RDC_RSMI_STATUS_OUT_OF_RESOURCES
//   - amd::rdc::rdc_exception -> the status code carried by the exception
//   - other std::exception    -> asserts, then RDC_RSMI_STATUS_INTERNAL_EXCEPTION
//   - std::nested_exception   -> the nested exception is re-thrown to the caller
//   - anything else           -> asserts, then abort()
static rdc_status_t handleException() {
  try {
    throw;
  } catch (const std::bad_alloc&) {
    debug_print("RDC exception: BadAlloc\n");
    return RDC_RSMI_STATUS_OUT_OF_RESOURCES;
  } catch (const amd::rdc::rdc_exception& e) {
    debug_print("Exception caught: %s.\n", e.what());
    // The exception carries the status to report; the second, unreachable
    // return that used to follow this line was removed.
    return e.error_code();
  } catch (const std::exception& e) {
    debug_print("Unhandled exception: %s\n", e.what());
    assert(false && "Unhandled exception.");
    return RDC_RSMI_STATUS_INTERNAL_EXCEPTION;
  } catch (const std::nested_exception& e) {
    debug_print("Callback threw, forwarding.\n");
    e.rethrow_nested();
    // Not reached (rethrow_nested does not return); kept so every path
    // visibly yields a value.
    return RDC_RSMI_STATUS_INTERNAL_EXCEPTION;
  } catch (...) {
    assert(false && "Unhandled exception.");
    abort();
    // Not reached (abort does not return).
    return RDC_RSMI_STATUS_INTERNAL_EXCEPTION;
  }
}
// TRY / CATCH bracket the body of each public entry point in this file: TRY
// opens a try block, and CATCH closes it and converts any exception thrown
// inside into a status code through handleException().
#define TRY try {
#define CATCH \
} \
catch (...) { \
return handleException(); \
}
// Create an RDCChannel for the given server and hand it back through
// *channel as an opaque rdc_channel_t.
//
// A null `ip` selects RDC_DEFAULT_SERVER_IP and a null `port` selects
// RDC_DEFAULT_SERVER_PORT. Returns RDC_STATUS_GRPC_INVALID_ARG when `channel`
// is null, the status from RDCChannel::Initialize() when that fails, and
// RDC_STATUS_SUCCESS otherwise. Exceptions (including std::bad_alloc from the
// allocation) are converted to a status code by CATCH.
rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port,
                                bool secure) {
  TRY
  if (channel == nullptr) {
    return RDC_STATUS_GRPC_INVALID_ARG;
  }

  const std::string server_str = (ip != nullptr) ? ip : RDC_DEFAULT_SERVER_IP;
  const std::string port_str =
      (port != nullptr) ? std::string(port) : std::to_string(RDC_DEFAULT_SERVER_PORT);

  // `new` reports failure by throwing, never by returning nullptr, so the old
  // null check was dead code. Holding the object in a unique_ptr also frees it
  // if Initialize() throws, which previously leaked the channel.
  std::unique_ptr<amd::rdc::RDCChannel> ch(
      new amd::rdc::RDCChannel(server_str, port_str, secure));

  const rdc_status_t ret = ch->Initialize();
  if (ret != RDC_STATUS_SUCCESS) {
    return ret;  // ch is destroyed here
  }

  // Ownership passes to the caller, who releases it with rdc_channel_destroy().
  *channel = reinterpret_cast<rdc_channel_t>(ch.release());
  return RDC_STATUS_SUCCESS;
  CATCH
}
// Report the gRPC connectivity state of `channel` through *state.
// A null `state` is rejected before the channel handle is examined.
rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect,
                                   grpc_connectivity_state* state) {
  TRY
  CHK_PTR_ARG(state)
  UINTPTR_TO_RDC_CHAN(channel)

  const auto grpc_channel = ch->channel();
  *state = grpc_channel->GetState(try_to_connect);
  return RDC_STATUS_SUCCESS;
  CATCH
}
// Round-trip a pseudo-random number through the server behind `channel` and
// check that the same number comes back.
rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel) {
  TRY
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::VerifyConnectionResponse reply;
  ::rdc::VerifyConnectionRequest request;
  ::grpc::ClientContext ctx;

  // Seed from the current time and send one value drawn from it.
  unsigned int seed = time(NULL);
  const uint64_t magic = static_cast<uint64_t>(rand_r(&seed));
  request.set_magic_num(magic);

  ::grpc::Status rpc_status = ch->rdc_admin_stub()->VerifyConnection(&ctx, request, &reply);
  if (!rpc_status.ok()) {
    return amd::rdc::GrpcErrorToRdcError(rpc_status.error_code());
  }

  // A mismatching echo means the payload was altered or lost on the way.
  const bool echoed_back = (reply.echo_magic_num() == request.magic_num());
  return echoed_back ? RDC_STATUS_SUCCESS : RDC_STATUS_GRPC_DATA_LOSS;
  CATCH
}
// Destroys the channel object behind the opaque handle `channel`.
// The handle must not be used again after this call returns.
rdc_status_t rdc_channel_destroy(rdc_channel_t channel) {
  TRY
  UINTPTR_TO_RDC_CHAN(channel)

  delete ch;
  return RDC_STATUS_SUCCESS;
  CATCH
}
// Asks the server for its device count via the GetNumDevices RPC.
//
// On a successful RPC, *num_gpu receives the reply's `val` and the reply's
// `ret_val` is returned as the status. On transport failure the gRPC error is
// translated and *num_gpu is left untouched.
rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu) {
  TRY
  CHK_PTR_ARG(num_gpu)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetNumDevicesRequest request;  // carries no fields
  ::rdc::GetNumDevicesResponse reply;
  ::grpc::ClientContext ctx;
  const ::grpc::Status rpc = ch->rsmi_stub()->GetNumDevices(&ctx, request, &reply);
  if (rpc.ok()) {
    *num_gpu = reply.val();
    return static_cast<rdc_status_t>(reply.ret_val());
  }
  return amd::rdc::GrpcErrorToRdcError(rpc.error_code());
  CATCH
}
// Converts an rsmi temperature metric to the protobuf enum used on the wire.
// The two enumerations currently share numeric values, so this is a plain
// cast; the conversion lives in one place in case they ever diverge.
static ::rdc::GetTemperatureRequest_TemperatureMetric rsmi_temp2rdc_temp(
    rsmi_temperature_metric_t rsmi_temp) {
  using WireMetric = ::rdc::GetTemperatureRequest_TemperatureMetric;
  return static_cast<WireMetric>(rsmi_temp);
}
// Fetches one temperature reading from the server via the GetTemperature RPC.
//
// dv_ind, sensor_type and metric are copied into the request unchanged (metric
// through rsmi_temp2rdc_temp). On a successful RPC, *temperature receives the
// reply's value and the reply's `ret_val` is returned as the status; otherwise
// the gRPC error is translated and *temperature is left untouched.
rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type,
                                     rsmi_temperature_metric_t metric, int64_t* temperature) {
  TRY
  CHK_PTR_ARG(temperature)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetTemperatureRequest request;
  request.set_metric(rsmi_temp2rdc_temp(metric));
  request.set_dv_ind(dv_ind);
  request.set_sensor_type(sensor_type);

  ::rdc::GetTemperatureResponse reply;
  ::grpc::ClientContext ctx;
  const ::grpc::Status rpc = ch->rsmi_stub()->GetTemperature(&ctx, request, &reply);
  if (rpc.ok()) {
    *temperature = reply.temperature();
    return static_cast<rdc_status_t>(reply.ret_val());
  }
  return ::amd::rdc::GrpcErrorToRdcError(rpc.error_code());
  CATCH
}
// Fetches a fan RPM reading from the server via the GetFanRpms RPC.
//
// On a successful RPC, *rpms receives the reply's value and the reply's
// `ret_val` is returned as the status; otherwise the gRPC error is translated
// and *rpms is left untouched.
rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
                                  int64_t* rpms) {
  TRY
  CHK_PTR_ARG(rpms)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetFanRpmsRequest request;
  request.set_dv_ind(dv_ind);
  request.set_sensor_ind(sensor_ind);

  ::rdc::GetFanRpmsResponse reply;
  ::grpc::ClientContext ctx;
  const ::grpc::Status rpc = ch->rsmi_stub()->GetFanRpms(&ctx, request, &reply);
  if (rpc.ok()) {
    *rpms = reply.rpms();
    return static_cast<rdc_status_t>(reply.ret_val());
  }
  return ::amd::rdc::GrpcErrorToRdcError(rpc.error_code());
  CATCH
}
// Fetches a fan speed reading from the server via the GetFanSpeed RPC.
//
// On a successful RPC, *speed receives the reply's value and the reply's
// `ret_val` is returned as the status; otherwise the gRPC error is translated
// and *speed is left untouched.
rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
                                   int64_t* speed) {
  TRY
  CHK_PTR_ARG(speed)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetFanSpeedRequest request;
  request.set_dv_ind(dv_ind);
  request.set_sensor_ind(sensor_ind);

  ::rdc::GetFanSpeedResponse reply;
  ::grpc::ClientContext ctx;
  const ::grpc::Status rpc = ch->rsmi_stub()->GetFanSpeed(&ctx, request, &reply);
  if (rpc.ok()) {
    *speed = reply.speed();
    return static_cast<rdc_status_t>(reply.ret_val());
  }
  return ::amd::rdc::GrpcErrorToRdcError(rpc.error_code());
  CATCH
}
// Fetches the maximum fan speed from the server via the GetFanSpeedMax RPC.
//
// On a successful RPC, *max_speed receives the reply's value and the reply's
// `ret_val` is returned as the status; otherwise the gRPC error is translated
// and *max_speed is left untouched.
rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind,
                                       uint64_t* max_speed) {
  TRY
  CHK_PTR_ARG(max_speed)
  UINTPTR_TO_RDC_CHAN(channel)

  ::rdc::GetFanSpeedMaxRequest request;
  request.set_dv_ind(dv_ind);
  request.set_sensor_ind(sensor_ind);

  ::rdc::GetFanSpeedMaxResponse reply;
  ::grpc::ClientContext ctx;
  const ::grpc::Status rpc = ch->rsmi_stub()->GetFanSpeedMax(&ctx, request, &reply);
  if (rpc.ok()) {
    *max_speed = reply.max_speed();
    return static_cast<rdc_status_t>(reply.ret_val());
  }
  return ::amd::rdc::GrpcErrorToRdcError(rpc.error_code());
  CATCH
}
// Maps an rdc_status_t to a static, human-readable description.
//
// status:        the code to describe.
// status_string: out-parameter; receives a pointer to a string literal (never
//                freed by the caller). Must not be null.
//
// Returns RDC_STATUS_SUCCESS when `status` is a known code,
// RDC_RSMI_STATUS_INVALID_ARGS when `status_string` is null, and
// RDC_RSMI_STATUS_UNKNOWN_ERROR (with a generic message stored in
// *status_string) for any unrecognized code.
//
// Each message is assembled from adjacent string literals, so every fragment
// that continues a sentence must carry its own separating space.
rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string) {
  TRY
  if (status_string == nullptr) {
    return RDC_RSMI_STATUS_INVALID_ARGS;
  }

  // Switch on an integer so codes from different enum ranges can share one
  // switch statement.
  const size_t status_u = static_cast<size_t>(status);
  switch (status_u) {
    case RDC_STATUS_SUCCESS:
      *status_string =
          "RDC_STATUS_SUCCESS: The function has been executed"
          " successfully.";
      break;
    case RDC_RSMI_STATUS_INVALID_ARGS:
      *status_string =
          "RDC_RSMI_STATUS_INVALID_ARGS: The provided arguments do not"
          " meet the preconditions required for calling this function.";
      break;
    case RDC_RSMI_STATUS_NOT_SUPPORTED:
      *status_string =
          "RDC_RSMI_STATUS_NOT_SUPPORTED: This function is not"
          " supported in the current environment.";
      break;
    case RDC_RSMI_STATUS_FILE_ERROR:
      *status_string =
          "RDC_RSMI_STATUS_FILE_ERROR: There was an error in finding or"
          " opening a file or directory. The operation may not be supported by "
          "this Linux kernel version.";
      break;
    case RDC_RSMI_STATUS_PERMISSION:
      *status_string =
          "RDC_RSMI_STATUS_PERMISSION: The user ID of the calling"
          " process does not have sufficient permission to execute a command."
          " Often this is fixed by running as root (sudo).";
      break;
    case RDC_RSMI_STATUS_OUT_OF_RESOURCES:
      *status_string =
          "RDC_RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire "
          "memory or other resource";
      break;
    case RDC_RSMI_STATUS_INTERNAL_EXCEPTION:
      *status_string =
          "RDC_RSMI_STATUS_INTERNAL_EXCEPTION: An internal "
          "exception was caught";
      break;
    case RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS:
      *status_string =
          "RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided "
          "input is out of allowable or safe range";
      break;
    case RDC_RSMI_STATUS_INIT_ERROR:
      *status_string =
          "RDC_RSMI_STATUS_INIT_ERROR: An error occurred during "
          "initialization, during "
          "monitor discovery or when initializing internal data structures";
      break;
    case RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED:
      *status_string =
          "RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: The called "
          "function has not been implemented in this "
          "system for this device type";
      break;
    case RDC_RSMI_STATUS_NOT_FOUND:
      *status_string =
          "RDC_RSMI_STATUS_NOT_FOUND: An item required to "
          "complete the call was not found";
      break;
    case RDC_RSMI_STATUS_INSUFFICIENT_SIZE:
      *status_string =
          "RDC_RSMI_STATUS_INSUFFICIENT_SIZE: Not enough "
          "resources were available to fully execute"
          " the call";
      break;
    case RDC_RSMI_STATUS_UNKNOWN_ERROR:
      *status_string =
          "An unknown error prevented the call from completing"
          " successfully";
      break;
    case RDC_RSMI_STATUS_INTERRUPT:
      *status_string =
          "RDC_RSMI_STATUS_INTERRUPT An interrupt occurred while "
          "executing the function";
      break;
    case RDC_STATUS_GRPC_CANCELLED:
      *status_string =
          "RDC_STATUS_GRPC_CANCELLED The operation was cancelled (typically by "
          "the caller).";
      break;
    case RDC_STATUS_GRPC_UNKNOWN:
      *status_string =
          "RDC_STATUS_GRPC_UNKNOWN Unknown error. An example of where this error"
          " may be returned is if a "
          "Status value received from another address space belongs to an error-"
          "space that is not known in this address space. Also errors raised by "
          "APIs that do not return enough error information may be converted to "
          "this error.";
      break;
    case RDC_STATUS_GRPC_INVALID_ARG:
      *status_string =
          "RDC_STATUS_GRPC_INVALID_ARG Client specified an invalid argument. "
          "Note that this differs from "
          "FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are "
          "problematic regardless of the state of the system (e.g., a malformed "
          "file name).";
      break;
    case RDC_STATUS_GRPC_DEADLINE_EXCEEDED:
      *status_string =
          "RDC_STATUS_GRPC_DEADLINE_EXCEEDED Deadline expired before operation "
          "could complete. For operations that "
          "change the state of the system, this error may be returned even if "
          "the operation has completed successfully. For example, a successful "
          "response from a server could have been delayed long enough for the "
          "deadline to expire.";
      break;
    case RDC_STATUS_GRPC_NOT_FOUND:
      *status_string =
          "RDC_STATUS_GRPC_NOT_FOUND Some requested entity (e.g., file or "
          "directory) was not found.";
      break;
    case RDC_STATUS_GRPC_ALREADY_EXISTS:
      *status_string =
          "RDC_STATUS_GRPC_ALREADY_EXISTS Some entity that we "
          "attempted to create "
          "(e.g., file or directory) already exists.";
      break;
    case RDC_STATUS_GRPC_PERM_DENIED:
      *status_string =
          "RDC_STATUS_GRPC_PERM_DENIED The caller does not have permission to "
          "execute the specified operation. "
          "PERMISSION_DENIED must not be used for rejections caused by "
          "exhausting some resource (use RESOURCE_EXHAUSTED instead for those "
          "errors). PERMISSION_DENIED must not be used if the caller can not "
          "be identified (use UNAUTHENTICATED instead for those errors).";
      break;
    case RDC_STATUS_GRPC_UNAUTHENTICATED:
      *status_string =
          "RDC_STATUS_GRPC_UNAUTHENTICATED The request does not have valid "
          "authentication credentials for the operation.";
      break;
    case RDC_STATUS_GRPC_RESOURCE_EXHAUSTED:
      *status_string =
          "RDC_STATUS_GRPC_RESOURCE_EXHAUSTED Some resource has been exhausted, "
          "perhaps a per-user quota, or perhaps the "
          "entire file system is out of space.";
      break;
    case RDC_STATUS_GRPC_FAILED_PRECOND:
      *status_string =
          "RDC_STATUS_GRPC_FAILED_PRECOND Operation was rejected because the "
          "system is not in a state required for "
          "the operation's execution. For example, directory to be deleted may "
          "be non-empty, an rmdir operation is applied to a non-directory, etc.\n"
          "A litmus test that may help a service implementor in deciding "
          "between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:\n"
          " (a) Use UNAVAILABLE if the client can retry just the failing call.\n"
          " (b) Use ABORTED if the client should retry at a higher-level"
          " (e.g., restarting a read-modify-write sequence).\n"
          " (c) Use FAILED_PRECONDITION if the client should not retry until"
          " the system state has been explicitly fixed. E.g., if an \"rmdir\""
          " fails because the directory is non-empty, FAILED_PRECONDITION"
          " should be returned since the client should not retry unless"
          " they have first fixed up the directory by deleting files from it.\n"
          " (d) Use FAILED_PRECONDITION if the client performs conditional"
          " REST Get/Update/Delete on a resource and the resource on the"
          " server does not match the condition. E.g., conflicting"
          " read-modify-write on the same resource.";
      break;
    case RDC_STATUS_GRPC_ABORTED:
      *status_string =
          "RDC_STATUS_GRPC_ABORTED The operation was aborted, "
          "typically due to a concurrency issue like "
          "sequencer check failures, transaction aborts, etc.\n"
          "See litmus test above for deciding between "
          "FAILED_PRECONDITION, ABORTED, "
          "and UNAVAILABLE.";
      break;
    case RDC_STATUS_GRPC_OUT_OF_RANGE:
      *status_string =
          "RDC_STATUS_GRPC_OUT_OF_RANGE Operation was attempted "
          "past the valid range. E.g., seeking or reading "
          "past end of file.\n"
          "Unlike INVALID_ARGUMENT, this error indicates a "
          "problem that may be fixed "
          "if the system state changes. For example, a 32-bit file system will "
          "generate INVALID_ARGUMENT if asked to read "
          "at an offset that is not in the "
          "range [0,2^32-1], but it will generate "
          "OUT_OF_RANGE if asked to read from "
          "an offset past the current file size.\n"
          "There is a fair bit of overlap between FAILED_PRECONDITION and "
          "OUT_OF_RANGE. We recommend using OUT_OF_RANGE "
          "(the more specific error) "
          "when it applies so that callers who are "
          "iterating through a space can "
          "easily look for an OUT_OF_RANGE error to detect when they are done.";
      break;
    case RDC_STATUS_GRPC_UNIMPLEMENTED:
      *status_string =
          "RDC_STATUS_GRPC_UNIMPLEMENTED Operation is not "
          "implemented or not supported/enabled in this service.";
      break;
    case RDC_STATUS_GRPC_INTERNAL:
      *status_string =
          "RDC_STATUS_GRPC_INTERNAL Internal errors. This means "
          "some invariants expected by underlying System has "
          "been broken. If you see one of these errors, "
          "something is very broken.";
      break;
    case RDC_STATUS_GRPC_UNAVAILABLE:
      *status_string =
          "RDC_STATUS_GRPC_UNAVAILABLE The service is currently unavailable. "
          "This is most likely a transient "
          "condition and may be corrected by retrying with a backoff.\n"
          "Warning: Although data MIGHT not have been transmitted when this "
          "status occurs, there is NOT A GUARANTEE that the server has not seen "
          "anything. So in general it is unsafe to retry on this status code "
          "if the call is non-idempotent. "
          "See litmus test above for deciding between "
          "FAILED_PRECONDITION, ABORTED, "
          "and UNAVAILABLE.";
      break;
    case RDC_STATUS_GRPC_DATA_LOSS:
      *status_string = "RDC_STATUS_GRPC_DATA_LOSS Unrecoverable data loss or corruption.";
      break;
    case RDC_STATUS_UNKNOWN_ERROR:
      *status_string = "RDC_STATUS_UNKNOWN_ERROR An unknown RDC error occurred.";
      break;
    case RDC_STATUS_CLIENT_ERR_SSL:
      *status_string = "An error occurred when executing SSL authentication operations.";
      break;
    default:
      // Unrecognized code: still hand back a usable message, but report the
      // lookup itself as failed.
      *status_string =
          "RDC_RSMI_STATUS_UNKNOWN_ERROR An "
          "unknown error occurred";
      return RDC_RSMI_STATUS_UNKNOWN_ERROR;
  }
  return RDC_STATUS_SUCCESS;
  CATCH
}
@@ -1,177 +0,0 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc_client_main.h"
#include <assert.h>
#include <grpcpp/grpcpp.h>
#include <string>
#include "common/rdc_utils.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {
// Default on-disk locations of the TLS material used for a secure channel.
// Two alternative sets exist; the USE_PINNED_CERTS build flag decides which
// one RDCChannel::Initialize() loads.
#ifdef USE_PINNED_CERTS
// Pinned certificates
// Read by ConstructSSLOptsPin(): the server certificate is used as the root
// certificate, together with the client's private key and certificate.
static const char* kDefaultRDCServerCertPinPath = "/etc/rdc/server/rdc_server.crt";
static const char* kDefaultRDCClientKeyPinPath = "/etc/rdc/client/private/rdc_client.key";
static const char* kDefaultRDCClientCertPinPath = "/etc/rdc/client/rdc_client.crt";
#endif // USE_PINNED_CERTS
// PKI certificates
// Read by ConstructSSLOptsPKI(): client private key, client certificate chain
// and the CA certificate used as the root.
static const char* kDefaultRDCClientCertKeyPkiPath = "/etc/rdc/client/private/rdc_client_cert.key";
static const char* kDefaultRDCClientCertPemPkiPath = "/etc/rdc/client/certs/rdc_client_cert.pem";
static const char* kDefaultRDCClientCACertPemPkiPath = "/etc/rdc/client/certs/rdc_cacert.pem";
// Records the connection parameters only; no network activity happens until
// Initialize() is called.
RDCChannel::RDCChannel(std::string server_ip, std::string server_port, bool secure)
    : server_ip_(server_ip),
      server_port_(server_port),
      secure_channel_(secure) {
}

// Nothing to release explicitly; members clean up after themselves.
RDCChannel::~RDCChannel() {
}
#ifdef USE_PINNED_CERTS
// Fills `ssl_opts` from the pinned-certificate files at their default paths.
//
// Returns 0 on success, -EINVAL when `ssl_opts` is null, -ENOENT when any of
// the three files is missing, or the non-zero result of the first ReadFile()
// call that fails. `ssl_opts` is only written once all three reads succeeded.
static int ConstructSSLOptsPin(grpc::SslCredentialsOptions* ssl_opts) {
  assert(ssl_opts != nullptr);
  if (!ssl_opts) {
    return -EINVAL;
  }

  // Bail out early unless every required file is present.
  // TODO(cfreehil): override these defaults with values read from config
  // file
  const bool all_present = amd::rdc::FileExists(kDefaultRDCClientKeyPinPath) &&
                           amd::rdc::FileExists(kDefaultRDCServerCertPinPath) &&
                           amd::rdc::FileExists(kDefaultRDCClientCertPinPath);
  if (!all_present) {
    return -ENOENT;
  }

  std::string client_key;
  std::string server_cert;
  std::string client_cert;

  // Read in a fixed order and stop at the first failure.
  int rc = amd::rdc::ReadFile(kDefaultRDCClientKeyPinPath, &client_key);
  if (rc == 0) {
    rc = amd::rdc::ReadFile(kDefaultRDCServerCertPinPath, &server_cert);
  }
  if (rc == 0) {
    rc = amd::rdc::ReadFile(kDefaultRDCClientCertPinPath, &client_cert);
  }
  if (rc != 0) {
    return rc;
  }

  // With pinning, the server's own certificate serves as the trust root.
  ssl_opts->pem_root_certs = server_cert;
  ssl_opts->pem_private_key = client_key;
  ssl_opts->pem_cert_chain = client_cert;
  return 0;
}
#endif // USE_PINNED_CERTS
// Fills `ssl_opts` from the PKI certificate files at their default paths.
//
// Returns 0 on success, -EINVAL when `ssl_opts` is null, -ENOENT when any of
// the three files is missing, or the non-zero result of the first ReadFile()
// call that fails. `ssl_opts` is only written once all three reads succeeded.
static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions* ssl_opts) {
  assert(ssl_opts != nullptr);
  if (!ssl_opts) {
    return -EINVAL;
  }

  // Bail out early unless every required file is present.
  // TODO(cfreehil): override these defaults with values read from config
  // file
  const bool all_present = amd::rdc::FileExists(kDefaultRDCClientCertKeyPkiPath) &&
                           amd::rdc::FileExists(kDefaultRDCClientCertPemPkiPath) &&
                           amd::rdc::FileExists(kDefaultRDCClientCACertPemPkiPath);
  if (!all_present) {
    return -ENOENT;
  }

  std::string ca_cert;
  std::string client_key;
  std::string client_chain;

  // Read in a fixed order and stop at the first failure.
  int rc = amd::rdc::ReadFile(kDefaultRDCClientCACertPemPkiPath, &ca_cert);
  if (rc == 0) {
    rc = amd::rdc::ReadFile(kDefaultRDCClientCertKeyPkiPath, &client_key);
  }
  if (rc == 0) {
    rc = amd::rdc::ReadFile(kDefaultRDCClientCertPemPkiPath, &client_chain);
  }
  if (rc != 0) {
    return rc;
  }

  ssl_opts->pem_root_certs = ca_cert;
  ssl_opts->pem_private_key = client_key;
  ssl_opts->pem_cert_chain = client_chain;
  return 0;
}
// Builds the gRPC channel to "<server_ip>:<server_port>" and creates the Rsmi
// and RdcAdmin stubs on top of it.
//
// For a secure channel the SSL options are loaded from the pinned or PKI
// certificate files, depending on USE_PINNED_CERTS.
//
// Returns RDC_STATUS_SUCCESS on success, RDC_STATUS_CLIENT_ERR_SSL when the
// certificate material cannot be loaded, or RDC_STATUS_GRPC_RESOURCE_EXHAUSTED
// when a stub could not be created.
rdc_status_t RDCChannel::Initialize(void) {
  assert(!server_port_.empty());
  assert(!server_ip_.empty());

  const std::string target = server_ip() + ":" + server_port();

  if (!secure_channel_) {
    channel_ = ::grpc::CreateChannel(target, grpc::InsecureChannelCredentials());
  } else {
    grpc::SslCredentialsOptions ssl_opts;
#ifdef USE_PINNED_CERTS
    const int ssl_rc = ConstructSSLOptsPin(&ssl_opts);
#else
    const int ssl_rc = ConstructSSLOptsPKI(&ssl_opts);
#endif
    if (ssl_rc != 0) {
      std::cerr << "Failed to process OpenSSL keys and certificates." << std::endl;
      return RDC_STATUS_CLIENT_ERR_SSL;
    }
    channel_creds_ = grpc::SslCredentials(ssl_opts);
    channel_ = grpc::CreateChannel(target, channel_creds_);
  }

  rsmi_stub_ = ::rdc::Rsmi::NewStub(channel_);
  if (rsmi_stub_ == nullptr) {
    return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED;
  }

  rdc_admin_stub_ = ::rdc::RdcAdmin::NewStub(channel_);
  if (rdc_admin_stub_ == nullptr) {
    return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED;
  }

  // Note: no connectivity probe is performed here; success only means the
  // channel and stubs were constructed.
  return RDC_STATUS_SUCCESS;
}
} // namespace rdc
} // namespace amd
@@ -1,40 +0,0 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc_client_utils.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_client.h"
namespace amd {
namespace rdc {
// Converts a gRPC status code into the matching rdc_status_t by offsetting it
// with RDC_STATUS_GRPC_ERR_FIRST, the start of the gRPC range in rdc_status_t.
rdc_status_t GrpcErrorToRdcError(grpc::StatusCode grpc_err) {
  const uint32_t range_base = static_cast<uint32_t>(RDC_STATUS_GRPC_ERR_FIRST);
  const uint32_t code_offset = static_cast<uint32_t>(grpc_err);
  return static_cast<rdc_status_t>(code_offset + range_base);
}
} // namespace rdc
} // namespace amd
+2 -2
Bestand weergeven
@@ -12,9 +12,9 @@ The RDC includes the following libraries:
• librdc_client.so: Exposes RDC functionality using gRPC client.
• librdc.so: RDC API. This depends on librocm_smi.so.
• librdc.so: RDC API. This depends on libamd_smi.so.
• librocm_smi.so: Stateless low overhead access to GPU data.
• libamd_smi.so: Stateless low overhead access to GPU data.
![Libraries](../data/api_libs.png)
@@ -10,7 +10,7 @@ NOTE: The RDC tool is tested on the following software versions. Earlier version
• g++ (5.4.0)
• AMD ROCm, which includes AMD ROCm SMI Library
• AMD ROCm, which includes the AMD SMI library
• gRPC and protoc
@@ -68,7 +68,7 @@ RDC Command Line Tool (rdci)
A command-line tool to invoke all the features of the RDC tool. This CLI can be run locally or remotely.
ROCm-SMI Library
AMDSMI Library
A stateless system management library that provides low-level interfaces to access GPU information
+1 -1
Bestand weergeven
@@ -257,7 +257,7 @@ typedef enum {
//!< represents 32 bytes
// "Composite" events. These events have additional processing beyond
// the value provided by the rocm_smi library.
// the value provided by the amd_smi library.
RDC_EVNT_XGMI_0_THRPUT = 1500, //!< Transmit throughput to XGMI
//!< neighbor 0 in byes/sec
RDC_EVNT_XGMI_1_THRPUT, //!< Transmit throughput to XGMI
@@ -34,8 +34,8 @@ namespace rdc {
class RdcMetricFetcher {
public:
virtual rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t delete_rsmi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t acquire_smi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t delete_smi_handle(RdcFieldKey fk) = 0;
virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) = 0;
@@ -29,9 +29,9 @@ THE SOFTWARE.
#include <mutex> // NOLINT(build/c++11)
#include <queue>
#include "amd_smi/amdsmi.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/rdc_common.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
@@ -44,20 +44,20 @@ struct MetricValue {
rdc_field_value value;
};
// This union represents any RSMI handles require initialization and/or
// This union represents any SMI handles require initialization and/or
// shut down. There should only be one instance of this for each raw event
// used. For example, if a field group includes a pseudo-event and the
// underlying raw event, then only one FieldRSMIData should be created,
// underlying raw event, then only one FieldSMIData should be created,
// and it should be used by both events.
struct FieldRSMIData {
struct FieldSMIData {
union {
rsmi_event_handle_t evt_handle;
amdsmi_event_handle_t evt_handle;
};
union {
rsmi_counter_value_t counter_val;
amdsmi_counter_value_t counter_val;
};
~FieldRSMIData() {}
FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0} {}
~FieldSMIData() {}
FieldSMIData() : evt_handle(0), counter_val{0, 0, 0} {}
};
//!< The data structure to store the async fetch task
@@ -77,11 +77,11 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher {
RdcMetricFetcherImpl();
~RdcMetricFetcherImpl();
rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) override;
rdc_status_t delete_rsmi_handle(RdcFieldKey fk) override;
rdc_status_t acquire_smi_handle(RdcFieldKey fk) override;
rdc_status_t delete_smi_handle(RdcFieldKey fk) override;
private:
std::shared_ptr<FieldRSMIData> get_rsmi_data(RdcFieldKey key);
std::shared_ptr<FieldSMIData> get_smi_data(RdcFieldKey key);
uint64_t now();
void get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
@@ -92,7 +92,7 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher {
//!< Async metric retreive
std::map<RdcFieldKey, MetricValue> async_metrics_;
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>> rsmi_data_;
std::map<RdcFieldKey, std::shared_ptr<FieldSMIData>> smi_data_;
std::queue<MetricTask> updated_tasks_;
std::mutex task_mutex_;
std::future<void> updater_; // keep the future of updater
@@ -100,8 +100,6 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher {
std::atomic<bool> task_started_;
};
rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi);
} // namespace rdc
} // namespace amd
@@ -24,9 +24,8 @@ THE SOFTWARE.
#include <memory>
#include <string>
#include "amd_smi/amdsmi.h"
#include "rdc/rdc.h"
#include "rdc_lib/rdc_common.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
@@ -35,23 +34,23 @@ class RdcSmiDiagnosticImpl {
public:
RdcSmiDiagnosticImpl();
rdc_status_t check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_status_t check_smi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_smi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_smi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
rdc_status_t check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count,
rdc_diag_test_result_t* result);
private:
rdc_diag_result_t check_temperature_level(uint32_t gpu_index, rsmi_temperature_type_t type,
rdc_diag_result_t check_temperature_level(uint32_t gpu_index, amdsmi_temperature_type_t type,
char msg[MAX_DIAG_MSG_LENGTH],
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
std::string get_temperature_string(rsmi_temperature_type_t type) const;
std::string get_temperature_string(amdsmi_temperature_type_t type) const;
rdc_diag_result_t check_voltage_level(uint32_t gpu_index, rsmi_voltage_type_t type,
rdc_diag_result_t check_voltage_level(uint32_t gpu_index, amdsmi_voltage_type_t type,
char msg[MAX_DIAG_MSG_LENGTH],
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]);
std::string get_voltage_string(rsmi_voltage_type_t type) const;
std::string get_voltage_string(amdsmi_voltage_type_t type) const;
};
typedef std::shared_ptr<RdcSmiDiagnosticImpl> RdcSmiDiagnosticPtr;
@@ -30,13 +30,13 @@ THE SOFTWARE.
#include <utility>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "rdc_lib/RdcCacheManager.h"
#include "rdc_lib/RdcGroupSettings.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcNotification.h"
#include "rdc_lib/RdcWatchTable.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
@@ -23,12 +23,16 @@ THE SOFTWARE.
#ifndef INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
#define INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_
#include "rocm_smi/rocm_smi.h"
#include "amd_smi/amdsmi.h"
#include "rdc/rdc.h"
namespace amd {
namespace rdc {
rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi);
rdc_status_t Smi2RdcError(amdsmi_status_t rsmi);
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
amdsmi_processor_handle* processor_handle);
amdsmi_status_t get_processor_count(uint32_t& all_processor_count);
} // namespace rdc
} // namespace amd
@@ -63,7 +63,7 @@ class TestBase : public RdcRocrBase {
const std::string& get_per_gpu_info() const { return per_gpu_info_; }
hsa_status_t FindGPUIndex(hsa_agent_t agent, void* data);
// Return the agent by GPU index in rocm_smi
// Return the agent by GPU index in amd_smi
hsa_status_t get_agent_by_gpu_index(uint32_t gpu_index, hsa_agent_t* agent);
protected:
-82
Bestand weergeven
@@ -28,88 +28,6 @@ syntax = "proto3";
package rdc;
/****************************************************************************/
/********************************** Rsmi Service ****************************/
/****************************************************************************/
// Device-query service. Every method is a unary RPC: one request message in,
// one response message out.
service Rsmi {
// RSMI ID services
rpc GetNumDevices (GetNumDevicesRequest) returns(GetNumDevicesResponse) {}
// RSMI Physical Queries
rpc GetTemperature(GetTemperatureRequest) returns(GetTemperatureResponse){}
rpc GetFanRpms(GetFanRpmsRequest) returns(GetFanRpmsResponse){}
rpc GetFanSpeed(GetFanSpeedRequest) returns(GetFanSpeedResponse){}
rpc GetFanSpeedMax(GetFanSpeedMaxRequest) returns(GetFanSpeedMaxResponse){}
}
/* rsmi_num_monitor_devices() */
// The request carries no fields.
message GetNumDevicesRequest {
}
// val:     the value being returned (presumably the device count, per the
//          message name -- confirm against the server implementation).
// ret_val: status of the call on the server.
message GetNumDevicesResponse {
uint64 val = 1;
uint64 ret_val = 2;
}
/* GetTemperature */
/* rsmi_dev_temp_metric_get() */
// dv_ind:      presumably the device index -- confirm against the server.
// sensor_type: presumably selects the temperature sensor -- confirm.
// metric:      which temperature quantity to read.
message GetTemperatureRequest {
uint32 dv_ind = 1;
uint32 sensor_type = 2;
// NOTE(review): the names mirror rsmi_temperature_metric_t; if callers cast
// between the two enums, the numeric values here must stay aligned with it.
enum TemperatureMetric {
RSMI_TEMP_CURRENT = 0;
RSMI_TEMP_MAX = 1;
RSMI_TEMP_MIN = 2;
RSMI_TEMP_MAX_HYST = 3;
RSMI_TEMP_MIN_HYST = 4;
RSMI_TEMP_CRITICAL = 5;
RSMI_TEMP_CRITICAL_HYST = 6;
RSMI_TEMP_EMERGENCY = 7;
RSMI_TEMP_EMERGENCY_HYST = 8;
RSMI_TEMP_CRIT_MIN = 9;
RSMI_TEMP_CRIT_MIN_HYST = 10;
RSMI_TEMP_OFFSET = 11;
RSMI_TEMP_LOWEST = 12;
RSMI_TEMP_HIGHEST = 13;
}
TemperatureMetric metric = 3;
}
// temperature: the reading (units are not stated here -- confirm).
// ret_val:     status of the call on the server.
message GetTemperatureResponse {
int64 temperature = 1;
uint64 ret_val = 2;
}
/* GetFanRpms */
/* rsmi_dev_fan_rpms_get() */
// dv_ind:     presumably the device index -- confirm against the server.
// sensor_ind: presumably the fan sensor index -- confirm.
message GetFanRpmsRequest {
uint32 dv_ind = 1;
uint32 sensor_ind = 2;
}
// rpms:    the fan speed reading in RPM, per the field name.
// ret_val: status of the call on the server.
message GetFanRpmsResponse {
int64 rpms = 1;
uint64 ret_val = 2;
}
/* GetFanSpeed */
/* rsmi_dev_fan_speed_get() */
// dv_ind:     presumably the device index -- confirm against the server.
// sensor_ind: presumably the fan sensor index -- confirm.
message GetFanSpeedRequest {
uint32 dv_ind = 1;
uint32 sensor_ind = 2;
}
// speed:   the fan speed reading (scale is not stated here -- confirm).
// ret_val: status of the call on the server.
message GetFanSpeedResponse {
int64 speed = 1;
uint64 ret_val = 2;
}
/* GetFanSpeedMax */
/* rsmi_dev_fan_speed_max_get() */
// dv_ind:     presumably the device index -- confirm against the server.
// sensor_ind: presumably the fan sensor index -- confirm.
message GetFanSpeedMaxRequest {
uint32 dv_ind = 1;
uint32 sensor_ind = 2;
}
// max_speed: the maximum fan speed (scale is not stated here -- confirm).
// ret_val:   status of the call on the server.
message GetFanSpeedMaxResponse {
uint64 max_speed = 1;
uint64 ret_val = 2;
}
/****************************************************************************/
/********************************** RdcAdmin Service ************************/
/****************************************************************************/
@@ -37,13 +37,13 @@ class PrometheusReader(RdcReader):
if enable_pci_id == True:
try:
import sys, os
# Relaive path of rocm_smi to map gpu index to PCI id
# change smi_lib_path if the rocm_smi is installed in different folder
# Relative path of amd_smi to map gpu index to PCI id
# change smi_lib_path if the amd_smi is installed in different folder
smi_lib_relative_path = "../../bin"
smi_lib_path = os.path.join(sys.path[0], smi_lib_relative_path)
if os.path.exists(smi_lib_path+"/rocm_smi.py"):
if os.path.exists(smi_lib_path+"/amd_smi.py"):
sys.path.append(smi_lib_path)
from rocm_smi import getBus, initializeRsmi
from amd_smi import getBus, initializeRsmi
initializeRsmi()
# Map between gpu indexes and PCIe bus addresses
self.index_to_bus_addr = {}
+4 -4
Bestand weergeven
@@ -46,8 +46,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
message("---------SMI Lib Dir: " ${SMI_LIB_DIR})
message("---------SMI Inc Dir: " ${SMI_INC_DIR})
message("")
@@ -82,8 +82,8 @@ set(CPACK_PACKAGE_FILE_NAME "${RDC_PACKAGE}-${VERSION_STRING}")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core")
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core")
# link RSMI
link_directories(${RSMI_LIB_DIR})
# link SMI
link_directories(${SMI_LIB_DIR})
# add librdc_bootstrap.so
add_subdirectory(bootstrap)
@@ -28,7 +28,7 @@ target_include_directories(${BOOTSTRAP_LIB} PRIVATE
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${COMMON_DIR}"
"${RSMI_INC_DIR}"
"${SMI_INC_DIR}"
"${ROCM_DIR}/include")
target_include_directories(${BOOTSTRAP_LIB}
@@ -26,7 +26,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
"${SRC_DIR}/RdcSmiLib.cc"
"${SRC_DIR}/RdcTelemetryModule.cc"
"${SRC_DIR}/RdcWatchTableImpl.cc"
"${SRC_DIR}/RsmiUtils.cc")
"${SRC_DIR}/SmiUtils.cc")
# TODO: remove all headers? Will just dir be ok after install?
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
@@ -59,16 +59,16 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
"${INC_DIR}/impl/RdcSmiLib.h"
"${INC_DIR}/impl/RdcTelemetryModule.h"
"${INC_DIR}/impl/RdcWatchTableImpl.h"
"${INC_DIR}/impl/RsmiUtils.h")
"${INC_DIR}/impl/SmiUtils.h")
message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}")
add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST})
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap)
target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread amd_smi cap)
target_include_directories(${RDC_LIB} PRIVATE
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${RSMI_INC_DIR}")
"${SMI_INC_DIR}")
# Set the VERSION and SOVERSION values
set_property(TARGET ${RDC_LIB} PROPERTY
@@ -23,6 +23,7 @@ THE SOFTWARE.
#include <string.h>
#include "amd_smi/amdsmi.h"
#include "common/rdc_fields_supported.h"
#include "rdc_lib/RdcException.h"
#include "rdc_lib/RdcLogger.h"
@@ -35,30 +36,29 @@ THE SOFTWARE.
#include "rdc_lib/impl/RdcNotificationImpl.h"
#include "rdc_lib/impl/RdcWatchTableImpl.h"
#include "rdc_lib/rdc_common.h"
#include "rocm_smi/rocm_smi.h"
namespace {
// call the rsmi_init when load library
// and rsmi_shutdown when unload the library.
class rsmi_initializer {
rsmi_initializer() {
// Make sure rsmi will not be initialized multiple times
rsmi_shut_down();
rsmi_status_t rsmi_ret = rsmi_init(0);
if (rsmi_ret != RSMI_STATUS_SUCCESS) {
throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "RSMI initialize fail");
// call the smi_init when load library
// and smi_shutdown when unload the library.
class smi_initializer {
smi_initializer() {
// Make sure smi will not be initialized multiple times
amdsmi_shut_down();
amdsmi_status_t ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
if (ret != AMDSMI_STATUS_SUCCESS) {
throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail");
}
}
~rsmi_initializer() { rsmi_shut_down(); }
~smi_initializer() { amdsmi_shut_down(); }
public:
static rsmi_initializer& getInstance() {
static rsmi_initializer instance;
static smi_initializer& getInstance() {
static smi_initializer instance;
return instance;
}
};
static rsmi_initializer& in = rsmi_initializer::getInstance();
static smi_initializer& in = smi_initializer::getInstance();
} // namespace
amd::rdc::RdcHandler* make_handler(rdc_operation_mode_t op_mode) {
@@ -25,42 +25,39 @@ THE SOFTWARE.
#include <string.h>
#include <sys/time.h>
#include <algorithm>
#include <chrono> //NOLINT
#include <set>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "common/rdc_capabilities.h"
#include "common/rdc_fields_supported.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "rdc_lib/impl/SmiUtils.h"
#include "rdc_lib/rdc_common.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
static const std::unordered_map<rdc_field_t, rsmi_event_type_t> rdc_evnt_2_rsmi_field = {
{RDC_EVNT_XGMI_0_NOP_TX, RSMI_EVNT_XGMI_0_NOP_TX},
{RDC_EVNT_XGMI_0_REQ_TX, RSMI_EVNT_XGMI_0_REQUEST_TX},
{RDC_EVNT_XGMI_0_RESP_TX, RSMI_EVNT_XGMI_0_RESPONSE_TX},
{RDC_EVNT_XGMI_0_BEATS_TX, RSMI_EVNT_XGMI_0_BEATS_TX},
{RDC_EVNT_XGMI_1_NOP_TX, RSMI_EVNT_XGMI_1_NOP_TX},
{RDC_EVNT_XGMI_1_REQ_TX, RSMI_EVNT_XGMI_1_REQUEST_TX},
{RDC_EVNT_XGMI_1_RESP_TX, RSMI_EVNT_XGMI_1_RESPONSE_TX},
{RDC_EVNT_XGMI_1_BEATS_TX, RSMI_EVNT_XGMI_1_BEATS_TX},
static const std::unordered_map<rdc_field_t, amdsmi_event_type_t> rdc_evnt_2_smi_field = {
{RDC_EVNT_XGMI_0_NOP_TX, AMDSMI_EVNT_XGMI_0_NOP_TX},
{RDC_EVNT_XGMI_0_REQ_TX, AMDSMI_EVNT_XGMI_0_REQUEST_TX},
{RDC_EVNT_XGMI_0_RESP_TX, AMDSMI_EVNT_XGMI_0_RESPONSE_TX},
{RDC_EVNT_XGMI_0_BEATS_TX, AMDSMI_EVNT_XGMI_0_BEATS_TX},
{RDC_EVNT_XGMI_1_NOP_TX, AMDSMI_EVNT_XGMI_1_NOP_TX},
{RDC_EVNT_XGMI_1_REQ_TX, AMDSMI_EVNT_XGMI_1_REQUEST_TX},
{RDC_EVNT_XGMI_1_RESP_TX, AMDSMI_EVNT_XGMI_1_RESPONSE_TX},
{RDC_EVNT_XGMI_1_BEATS_TX, AMDSMI_EVNT_XGMI_1_BEATS_TX},
{RDC_EVNT_XGMI_0_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_0},
{RDC_EVNT_XGMI_1_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_1},
{RDC_EVNT_XGMI_2_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_2},
{RDC_EVNT_XGMI_3_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_3},
{RDC_EVNT_XGMI_4_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_4},
{RDC_EVNT_XGMI_5_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_5},
{RDC_EVNT_XGMI_0_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_0},
{RDC_EVNT_XGMI_1_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_1},
{RDC_EVNT_XGMI_2_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_2},
{RDC_EVNT_XGMI_3_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_3},
{RDC_EVNT_XGMI_4_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_4},
{RDC_EVNT_XGMI_5_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_5},
};
RdcMetricFetcherImpl::RdcMetricFetcherImpl() {
task_started_ = true;
RdcMetricFetcherImpl::RdcMetricFetcherImpl() : task_started_(true) {
// kick off another thread for async fetch
updater_ = std::async(std::launch::async, [this]() {
while (task_started_) {
@@ -95,37 +92,41 @@ uint64_t RdcMetricFetcherImpl::now() {
void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) {
rsmi_status_t err = RSMI_STATUS_SUCCESS;
uint64_t correctable_err = 0;
uint64_t uncorrectable_err = 0;
rsmi_ras_err_state_t err_state;
// Nothing can be reported without an output slot, so bail out before doing
// any SMI work.
if (!value) {
  return;
}

amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
uint64_t correctable_count = 0;
uint64_t uncorrectable_count = 0;
amdsmi_ras_err_state_t err_state;
// Value-initialised so a failed lookup never leaves an indeterminate handle.
amdsmi_processor_handle processor_handle = {};

// Translate the RDC GPU index into the handle the amdsmi ECC calls expect.
// If that fails, report the failure instead of querying every block with an
// invalid handle and then publishing a "successful" zero count.
err = get_processor_handle_from_id(gpu_index, &processor_handle);
if (err != AMDSMI_STATUS_SUCCESS) {
  RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_index << " error: " << err);
  value->status = err;
  return;
}
for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; b = b * 2) {
err = rsmi_dev_ecc_status_get(gpu_index, static_cast<rsmi_gpu_block_t>(b), &err_state);
if (err != RSMI_STATUS_SUCCESS) {
for (uint32_t b = AMDSMI_GPU_BLOCK_FIRST; b <= AMDSMI_GPU_BLOCK_LAST; b = b * 2) {
err =
amdsmi_get_gpu_ecc_status(processor_handle, static_cast<amdsmi_gpu_block_t>(b), &err_state);
if (err != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Get the ecc Status error " << b << ":" << err);
continue;
}
rsmi_error_count_t ec;
err = rsmi_dev_ecc_count_get(gpu_index, static_cast<rsmi_gpu_block_t>(b), &ec);
amdsmi_error_count_t ec;
err = amdsmi_get_gpu_ecc_count(processor_handle, static_cast<amdsmi_gpu_block_t>(b), &ec);
if (err == RSMI_STATUS_SUCCESS) {
correctable_err += ec.correctable_err;
uncorrectable_err += ec.uncorrectable_err;
if (err == AMDSMI_STATUS_SUCCESS) {
correctable_count += ec.correctable_count;
uncorrectable_count += ec.uncorrectable_count;
}
}
value->status = RSMI_STATUS_SUCCESS;
value->status = AMDSMI_STATUS_SUCCESS;
value->type = INTEGER;
if (field_id == RDC_FI_ECC_CORRECT_TOTAL) {
value->value.l_int = correctable_err;
value->value.l_int = correctable_count;
}
if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) {
value->value.l_int = uncorrectable_err;
value->value.l_int = uncorrectable_count;
}
}
@@ -166,7 +167,10 @@ bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, rdc_fie
void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
uint32_t gpu_index = key.first;
uint64_t sent, received, max_pkt_sz;
rsmi_status_t ret;
amdsmi_status_t ret;
amdsmi_processor_handle processor_handle;
ret = get_processor_handle_from_id(gpu_index, &processor_handle);
// Return if the cache does not expire yet
do {
@@ -178,7 +182,7 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
}
} while (0);
ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz);
ret = amdsmi_get_gpu_pci_throughput(processor_handle, &sent, &received, &max_pkt_sz);
uint64_t curTime = now();
MetricValue value;
@@ -207,12 +211,12 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) {
rx_metric->second.value.status = ret;
rx_metric->second.value.ts = curTime;
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
RDC_LOG(RDC_ERROR, "PCIe throughput not supported on GPU " << gpu_index);
return;
}
if (ret == RSMI_STATUS_SUCCESS) {
if (ret == AMDSMI_STATUS_SUCCESS) {
rx_metric->second.value.value.l_int = received;
tx_metric->second.value.value.l_int = sent;
RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":"
@@ -226,16 +230,16 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
std::vector<rdc_gpu_field_value_t>& results) { // NOLINT
const std::set<rdc_field_t> rdc_bulk_fields = {
RDC_FI_GPU_CLOCK, // current_gfxclk * 1000000
RDC_FI_MEMORY_TEMP, // temperature_mem * 1000
RDC_FI_GPU_TEMP, // temperature_edge * 1000
RDC_FI_POWER_USAGE, // average_socket_power * 1000000
RDC_FI_MEMORY_TEMP, // temperature_mem
RDC_FI_GPU_TEMP, // temperature_edge
RDC_FI_POWER_USAGE, // average_socket_power
RDC_FI_GPU_UTIL // average_gfx_activity
};
// To prevent always call the bulk API even if it is not supported,
// the static is used to cache last try.
static rsmi_status_t rs = RSMI_STATUS_SUCCESS;
if (rs != RSMI_STATUS_SUCCESS) {
static amdsmi_status_t rs = AMDSMI_STATUS_SUCCESS;
if (rs != AMDSMI_STATUS_SUCCESS) {
results.clear();
return RDC_ST_NOT_SUPPORTED;
}
@@ -248,13 +252,16 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
}
}
// Call the rocm_smi_lib API to bulk fetch the data
// Call the amd_smi_lib API to bulk fetch the data
auto cur_time = now();
auto ite = bulk_fields.begin();
for (; ite != bulk_fields.end(); ite++) {
rsmi_gpu_metrics_t gpu_metrics;
rs = rsmi_dev_gpu_metrics_info_get(ite->first, &gpu_metrics);
if (rs != RSMI_STATUS_SUCCESS) {
amdsmi_gpu_metrics_t gpu_metrics;
// Value-initialised so a failed lookup never leaves an indeterminate handle.
amdsmi_processor_handle processor_handle = {};

// Resolve the GPU index first and only query the metrics table when that
// succeeded; previously the lookup status was overwritten unchecked.
rs = get_processor_handle_from_id(ite->first, &processor_handle);
if (rs == AMDSMI_STATUS_SUCCESS) {
  rs = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics);
}

// Any failure aborts the bulk path: drop partial results so the caller does
// not see a half-filled batch.
if (rs != AMDSMI_STATUS_SUCCESS) {
  results.clear();
  return RDC_ST_NOT_SUPPORTED;
}
@@ -264,38 +271,46 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
value.gpu_index = ite->first;
value.field_value.field_id = field_id;
value.field_value.type = INTEGER;
value.field_value.status = RSMI_STATUS_SUCCESS;
value.field_value.status = AMDSMI_STATUS_SUCCESS;
value.field_value.ts = cur_time;
switch (field_id) {
case RDC_FI_GPU_CLOCK: // current_gfxclk * 1000000
value.field_value.value.l_int =
static_cast<int64_t>(gpu_metrics.current_gfxclk * 1000000);
static_cast<int64_t>(gpu_metrics.current_gfxclk) * 1000000;
break;
case RDC_FI_MEMORY_TEMP: // temperature_mem * 1000
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_mem * 1000);
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_mem) * 1000;
break;
case RDC_FI_GPU_TEMP: // temperature_edge * 1000
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_edge * 1000);
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.temperature_edge) * 1000;
break;
case RDC_FI_POWER_USAGE: // average_socket_power * 1000000
value.field_value.value.l_int =
static_cast<int64_t>(gpu_metrics.average_socket_power * 1000000);
case RDC_FI_POWER_USAGE: // average_socket_power
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.average_socket_power);
// Use current_socket_power if average_socket_power is not available
if (value.field_value.value.l_int == 65535) {
RDC_LOG(RDC_DEBUG, "Bulk fetch "
<< value.gpu_index << ":"
<< "RDC_FI_POWER_USAGE fallback to current_socket_power.");
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.current_socket_power);
}
// Ignore if the power is 0, which will fallback to non-bulk fetch.
if (value.field_value.value.l_int == 0) {
RDC_LOG(RDC_DEBUG, "Bulk fetch " << value.gpu_index << ":"
<< "RDC_FI_POWER_USAGE fallback to regular way.");
continue;
}
value.field_value.value.l_int *= 1000000;
break;
case RDC_FI_GPU_UTIL: // average_gfx_activity
value.field_value.value.l_int = static_cast<int64_t>(gpu_metrics.average_gfx_activity);
break;
default:
value.field_value.status = RSMI_STATUS_NOT_SUPPORTED;
value.field_value.status = AMDSMI_STATUS_NOT_SUPPORTED;
break;
}
if (value.field_value.status == RSMI_STATUS_SUCCESS) {
if (value.field_value.status == AMDSMI_STATUS_SUCCESS) {
results.push_back(value);
}
}
@@ -304,20 +319,23 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
return RDC_ST_OK;
}
static const uint64_t kGig = 1000000000;
constexpr double kGig = 1000000000.0;
rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) {
if (!value) {
return RDC_ST_BAD_PARAMETER;
}
uint64_t i64 = 0;
rsmi_temperature_type_t sensor_type;
rsmi_clk_type_t clk_type;
bool async_fetching = false;
RdcFieldKey f_key(gpu_index, field_id);
std::shared_ptr<FieldRSMIData> rsmi_data;
double coll_time_sec;
std::shared_ptr<FieldSMIData> smi_data;
amdsmi_processor_handle processor_handle = {};
amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle);
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_index << " error: " << ret);
return Smi2RdcError(ret);
}
if (!is_field_valid(field_id)) {
RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " which is not supported");
@@ -326,101 +344,121 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
value->ts = now();
value->field_id = field_id;
value->status = RSMI_STATUS_NOT_SUPPORTED;
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
auto read_rsmi_counter = [&](void) {
rsmi_data = get_rsmi_data(f_key);
if (rsmi_data == nullptr) {
value->status = RSMI_STATUS_NOT_SUPPORTED;
auto read_smi_counter = [&](void) {
RdcFieldKey f_key(gpu_index, field_id);
smi_data = get_smi_data(f_key);
if (smi_data == nullptr) {
value->status = AMDSMI_STATUS_NOT_SUPPORTED;
return;
}
value->status = rsmi_counter_read(rsmi_data->evt_handle, &rsmi_data->counter_val);
value->value.l_int = rsmi_data->counter_val.value;
value->status = amdsmi_gpu_read_counter(smi_data->evt_handle, &smi_data->counter_val);
value->value.l_int = smi_data->counter_val.value;
value->type = INTEGER;
};
switch (field_id) {
case RDC_FI_GPU_MEMORY_USAGE:
value->status = rsmi_dev_memory_usage_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64);
case RDC_FI_GPU_MEMORY_USAGE: {
uint64_t u64 = 0;
value->status = amdsmi_get_gpu_memory_usage(processor_handle, AMDSMI_MEM_TYPE_VRAM, &u64);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(i64);
}
break;
case RDC_FI_GPU_MEMORY_TOTAL:
value->status = rsmi_dev_memory_total_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(i64);
}
break;
case RDC_FI_GPU_COUNT:
uint32_t num_gpu;
value->status = rsmi_num_monitor_devices(&num_gpu);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(num_gpu);
}
break;
case RDC_FI_POWER_USAGE:
{
RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER;
// below call should handle both socket power and regular power
value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(i64);
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(u64);
}
break;
}
case RDC_FI_GPU_CLOCK:
case RDC_FI_MEM_CLOCK:
rsmi_frequencies_t f;
clk_type = RSMI_CLK_TYPE_SYS;
if (field_id == RDC_FI_MEM_CLOCK) {
clk_type = RSMI_CLK_TYPE_MEM;
}
value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, clk_type, &f);
case RDC_FI_GPU_MEMORY_TOTAL: {
uint64_t u64 = 0;
value->status = amdsmi_get_gpu_memory_total(processor_handle, AMDSMI_MEM_TYPE_VRAM, &u64);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(u64);
}
break;
}
case RDC_FI_GPU_COUNT: {
  // amdsmi is initialized in AMDSMI_INIT_AMD_GPUS mode, so the processors
  // counted here are GPUs.
  uint32_t gpu_total = 0;
  value->type = INTEGER;
  value->status = get_processor_count(gpu_total);
  if (value->status == AMDSMI_STATUS_SUCCESS) {
    value->value.l_int = static_cast<int64_t>(gpu_total);
  }
  break;
}
case RDC_FI_POWER_USAGE: {
  amdsmi_power_info_t power_info = {};
  value->type = INTEGER;
  value->status = amdsmi_get_power_info(processor_handle, &power_info);
  if (value->status == AMDSMI_STATUS_SUCCESS) {
    // An average_socket_power of 65535 is treated as "not available"; in that
    // case the current socket power is reported instead.
    const bool has_average = power_info.average_socket_power != 65535;
    const int64_t socket_power =
        has_average ? static_cast<int64_t>(power_info.average_socket_power)
                    : static_cast<int64_t>(power_info.current_socket_power);
    // Scale by 10^6 before publishing.
    value->value.l_int = socket_power * 1000 * 1000;
  }
  break;
}
case RDC_FI_GPU_CLOCK:
case RDC_FI_MEM_CLOCK: {
amdsmi_clk_type_t clk_type = CLK_TYPE_SYS;
if (field_id == RDC_FI_MEM_CLOCK) {
clk_type = CLK_TYPE_MEM;
}
amdsmi_frequencies_t f = {};
value->status = amdsmi_get_clk_freq(processor_handle, clk_type, &f);
value->type = INTEGER;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = f.frequency[f.current];
}
break;
case RDC_FI_GPU_UTIL:
uint32_t busy_percent;
value->status = rsmi_dev_busy_percent_get(gpu_index, &busy_percent);
}
case RDC_FI_GPU_UTIL: {
amdsmi_engine_usage_t engine_usage;
value->status = amdsmi_get_gpu_activity(processor_handle, &engine_usage);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(busy_percent);
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(engine_usage.gfx_activity);
}
break;
case RDC_FI_DEV_NAME:
value->status = rsmi_dev_name_get(gpu_index, value->value.str, RDC_MAX_STR_LENGTH);
}
case RDC_FI_DEV_NAME: {
amdsmi_asic_info_t asic_info;
value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info);
value->type = STRING;
break;
case RDC_FI_GPU_TEMP:
case RDC_FI_MEMORY_TEMP:
int64_t val_i64;
sensor_type = RSMI_TEMP_TYPE_EDGE;
if (field_id == RDC_FI_MEMORY_TEMP) {
sensor_type = RSMI_TEMP_TYPE_MEMORY;
if (value->status == AMDSMI_STATUS_SUCCESS) {
  // Bound the copy by BOTH buffers: never read past market_name and never
  // write past the destination, then force NUL termination.
  // NOTE(review): assumes value->value.str holds RDC_MAX_STR_LENGTH bytes, as
  // other string copies into this field do - confirm against rdc.h.
  const size_t copy_len = std::min(sizeof(asic_info.market_name),
                                   static_cast<size_t>(RDC_MAX_STR_LENGTH - 1));
  memcpy(value->value.str, asic_info.market_name, copy_len);
  value->value.str[copy_len] = '\0';
}
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64);
break;
}
case RDC_FI_GPU_TEMP:
case RDC_FI_MEMORY_TEMP: {
int64_t i64 = 0;
amdsmi_temperature_type_t sensor_type = TEMPERATURE_TYPE_EDGE;
if (field_id == RDC_FI_MEMORY_TEMP) {
sensor_type = TEMPERATURE_TYPE_VRAM;
}
value->status =
amdsmi_get_temp_metric(processor_handle, sensor_type, AMDSMI_TEMP_CURRENT, &i64);
// fallback to hotspot temperature as some card may not have edge temperature.
if (sensor_type == RSMI_TEMP_TYPE_EDGE
&& value->status == RSMI_STATUS_NOT_SUPPORTED) {
sensor_type = RSMI_TEMP_TYPE_JUNCTION;
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type,
RSMI_TEMP_CURRENT, &val_i64);
if (sensor_type == TEMPERATURE_TYPE_EDGE && value->status == AMDSMI_STATUS_NOT_SUPPORTED) {
sensor_type = TEMPERATURE_TYPE_JUNCTION;
value->status =
amdsmi_get_temp_metric(processor_handle, sensor_type, AMDSMI_TEMP_CURRENT, &i64);
}
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = val_i64;
if (value->status == AMDSMI_STATUS_SUCCESS) {
value->value.l_int = i64 * 1000;
}
break;
}
case RDC_FI_ECC_CORRECT_TOTAL:
case RDC_FI_ECC_UNCORRECT_TOTAL:
get_ecc_error(gpu_index, field_id, value);
@@ -437,31 +475,33 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
case RDC_EVNT_XGMI_1_REQ_TX:
case RDC_EVNT_XGMI_1_RESP_TX:
case RDC_EVNT_XGMI_1_BEATS_TX:
read_rsmi_counter();
read_smi_counter();
break;
case RDC_EVNT_XGMI_0_THRPUT:
case RDC_EVNT_XGMI_1_THRPUT:
case RDC_EVNT_XGMI_2_THRPUT:
case RDC_EVNT_XGMI_3_THRPUT:
case RDC_EVNT_XGMI_4_THRPUT:
case RDC_EVNT_XGMI_5_THRPUT:
read_rsmi_counter();
case RDC_EVNT_XGMI_5_THRPUT: {
double coll_time_sec = 0;
read_smi_counter();
if (value->status == RDC_ST_OK) {
if (rsmi_data->counter_val.time_running > 0) {
coll_time_sec = static_cast<float>(rsmi_data->counter_val.time_running) / kGig;
if (smi_data->counter_val.time_running > 0) {
coll_time_sec = static_cast<double>(smi_data->counter_val.time_running) / kGig;
value->value.l_int = (value->value.l_int * 32) / coll_time_sec;
} else {
value->value.l_int = 0;
}
}
break;
}
default:
break;
}
int64_t latency = now() - value->ts;
if (value->status != RSMI_STATUS_SUCCESS) {
if (value->status != AMDSMI_STATUS_SUCCESS) {
if (async_fetching) { //!< Async fetching is not an error
RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id));
} else {
@@ -480,42 +520,45 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
<< value->value.str << ", latency " << latency);
}
return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR;
}
std::shared_ptr<FieldRSMIData> RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) {
std::map<RdcFieldKey, std::shared_ptr<FieldRSMIData>>::iterator r_info = rsmi_data_.find(key);
std::shared_ptr<FieldSMIData> RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) {
std::map<RdcFieldKey, std::shared_ptr<FieldSMIData>>::iterator r_info = smi_data_.find(key);
if (r_info != rsmi_data_.end()) {
if (r_info != smi_data_.end()) {
return r_info->second;
}
return nullptr;
}
static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp,
rsmi_event_handle_t* handle) {
rsmi_status_t ret;
static rdc_status_t init_smi_counter(RdcFieldKey fk, amdsmi_event_group_t grp,
amdsmi_event_handle_t* handle) {
amdsmi_status_t ret;
uint32_t counters_available;
uint32_t dv_ind = fk.first;
rdc_field_t f = fk.second;
assert(handle != nullptr);
ret = rsmi_dev_counter_group_supported(dv_ind, grp);
amdsmi_processor_handle processor_handle;
ret = get_processor_handle_from_id(dv_ind, &processor_handle);
if (ret != RSMI_STATUS_SUCCESS) {
return Rsmi2RdcError(ret);
ret = amdsmi_gpu_counter_group_supported(processor_handle, grp);
if (ret != AMDSMI_STATUS_SUCCESS) {
return Smi2RdcError(ret);
}
ret = rsmi_counter_available_counters_get(dv_ind, grp, &counters_available);
if (ret != RSMI_STATUS_SUCCESS) {
return Rsmi2RdcError(ret);
ret = amdsmi_get_gpu_available_counters(processor_handle, grp, &counters_available);
if (ret != AMDSMI_STATUS_SUCCESS) {
return Smi2RdcError(ret);
}
if (counters_available == 0) {
return RDC_ST_INSUFF_RESOURCES;
}
rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f);
amdsmi_event_type_t evt = rdc_evnt_2_smi_field.at(f);
// Temporarily get DAC capability
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
@@ -525,12 +568,12 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp,
return RDC_ST_PERM_ERROR;
}
ret = rsmi_dev_counter_create(dv_ind, evt, handle);
if (ret != RSMI_STATUS_SUCCESS) {
return Rsmi2RdcError(ret);
ret = amdsmi_gpu_create_counter(processor_handle, evt, handle);
if (ret != AMDSMI_STATUS_SUCCESS) {
return Smi2RdcError(ret);
}
ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr);
ret = amdsmi_gpu_control_counter(*handle, AMDSMI_CNTR_CMD_START, nullptr);
// Release DAC capability
sc.Relinquish();
@@ -540,11 +583,11 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp,
return RDC_ST_PERM_ERROR;
}
return Rsmi2RdcError(ret);
return Smi2RdcError(ret);
}
rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
rsmi_status_t ret;
rdc_status_t RdcMetricFetcherImpl::delete_smi_handle(RdcFieldKey fk) {
amdsmi_status_t ret;
switch (fk.second) {
case RDC_EVNT_XGMI_0_NOP_TX:
@@ -561,52 +604,53 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) {
case RDC_EVNT_XGMI_3_THRPUT:
case RDC_EVNT_XGMI_4_THRPUT:
case RDC_EVNT_XGMI_5_THRPUT: {
rsmi_event_handle_t h;
if (rsmi_data_.find(fk) == rsmi_data_.end()) {
amdsmi_event_handle_t h;
if (smi_data_.find(fk) == smi_data_.end()) {
return RDC_ST_NOT_SUPPORTED;
}
h = rsmi_data_[fk]->evt_handle;
h = smi_data_[fk]->evt_handle;
// Stop counting.
ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr);
if (ret != RSMI_STATUS_SUCCESS) {
rsmi_data_.erase(fk);
ret = amdsmi_gpu_control_counter(h, AMDSMI_CNTR_CMD_STOP, nullptr);
if (ret != AMDSMI_STATUS_SUCCESS) {
smi_data_.erase(fk);
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Rsmi2RdcError(ret));
return Rsmi2RdcError(ret);
RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Smi2RdcError(ret));
return Smi2RdcError(ret);
}
// Release all resources (e.g., counter and memory resources) associated
// with evnt_handle.
ret = rsmi_dev_counter_destroy(h);
ret = amdsmi_gpu_destroy_counter(h);
rsmi_data_.erase(fk);
return Rsmi2RdcError(ret);
smi_data_.erase(fk);
return Smi2RdcError(ret);
}
default:
return RDC_ST_NOT_SUPPORTED;
}
return RDC_ST_OK;
}
rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
rdc_status_t RdcMetricFetcherImpl::acquire_smi_handle(RdcFieldKey fk) {
rdc_status_t ret = RDC_ST_OK;
auto get_evnt_handle = [&](rsmi_event_group_t grp) {
rsmi_event_handle_t handle;
auto get_evnt_handle = [&](amdsmi_event_group_t grp) {
amdsmi_event_handle_t handle;
rdc_status_t result;
if (get_rsmi_data(fk) != nullptr) {
if (get_smi_data(fk) != nullptr) {
// This event has already been initialized.
return RDC_ST_ALREADY_EXIST;
}
result = init_rsmi_counter(fk, grp, &handle);
result = init_smi_counter(fk, grp, &handle);
if (result != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result);
RDC_LOG(RDC_ERROR, "Failed to init SMI counter. Return:" << result);
return result;
}
auto fsh = std::shared_ptr<FieldRSMIData>(new FieldRSMIData);
auto fsh = std::shared_ptr<FieldSMIData>(new FieldSMIData);
if (fsh == nullptr) {
return RDC_ST_INSUFF_RESOURCES;
@@ -614,7 +658,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
fsh->evt_handle = handle;
rsmi_data_[fk] = fsh;
smi_data_[fk] = fsh;
return RDC_ST_OK;
};
@@ -628,7 +672,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
case RDC_EVNT_XGMI_1_REQ_TX:
case RDC_EVNT_XGMI_1_RESP_TX:
case RDC_EVNT_XGMI_1_BEATS_TX:
ret = get_evnt_handle(RSMI_EVNT_GRP_XGMI);
ret = get_evnt_handle(AMDSMI_EVNT_GRP_XGMI);
break;
case RDC_EVNT_XGMI_0_THRPUT:
@@ -637,7 +681,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) {
case RDC_EVNT_XGMI_3_THRPUT:
case RDC_EVNT_XGMI_4_THRPUT:
case RDC_EVNT_XGMI_5_THRPUT:
ret = get_evnt_handle(RSMI_EVNT_GRP_XGMI_DATA_OUT);
ret = get_evnt_handle(AMDSMI_EVNT_GRP_XGMI_DATA_OUT);
break;
default:
@@ -21,6 +21,7 @@ THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
#include <cassert>
#include <memory>
#include <type_traits>
@@ -24,35 +24,34 @@ THE SOFTWARE.
#include <assert.h>
#include <sys/time.h>
#include <cstdint>
#include <ctime>
#include <mutex> // NOLINT
#include <unordered_map>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "common/rdc_capabilities.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rdc_lib/impl/RdcTelemetryModule.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "rocm_smi/rocm_smi.h"
#include "rdc_lib/impl/SmiUtils.h"
namespace amd {
namespace rdc {
static std::unordered_map<rdc_field_t, rsmi_evt_notification_type_t> rdc_2_rsmi_event_notif_map = {
{RDC_EVNT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_VMFAULT},
{RDC_EVNT_NOTIF_FIRST, RSMI_EVT_NOTIF_FIRST},
{RDC_EVNT_NOTIF_THERMAL_THROTTLE, RSMI_EVT_NOTIF_THERMAL_THROTTLE},
{RDC_EVNT_NOTIF_PRE_RESET, RSMI_EVT_NOTIF_GPU_PRE_RESET},
{RDC_EVNT_NOTIF_POST_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET},
static std::unordered_map<rdc_field_t, amdsmi_evt_notification_type_t> rdc_2_smi_event_notif_map = {
{RDC_EVNT_NOTIF_VMFAULT, AMDSMI_EVT_NOTIF_VMFAULT},
{RDC_EVNT_NOTIF_FIRST, AMDSMI_EVT_NOTIF_FIRST},
{RDC_EVNT_NOTIF_THERMAL_THROTTLE, AMDSMI_EVT_NOTIF_THERMAL_THROTTLE},
{RDC_EVNT_NOTIF_PRE_RESET, AMDSMI_EVT_NOTIF_GPU_PRE_RESET},
{RDC_EVNT_NOTIF_POST_RESET, AMDSMI_EVT_NOTIF_GPU_POST_RESET},
};
static std::unordered_map<rsmi_evt_notification_type_t, rdc_field_t> rsmi_event_notif_2_rdc_map = {
{RSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT},
{RSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST},
{RSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE},
{RSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET},
{RSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET},
static std::unordered_map<amdsmi_evt_notification_type_t, rdc_field_t> smi_event_notif_2_rdc_map = {
{AMDSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT},
{AMDSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST},
{AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE},
{AMDSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET},
{AMDSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET},
};
// This const determines space allocated on stack for notification events.
@@ -63,22 +62,22 @@ RdcNotificationImpl::RdcNotificationImpl() {}
RdcNotificationImpl::~RdcNotificationImpl() {}
bool RdcNotificationImpl::is_notification_event(rdc_field_t field) const {
if (rdc_2_rsmi_event_notif_map.find(field) == rdc_2_rsmi_event_notif_map.end()) {
if (rdc_2_smi_event_notif_map.find(field) == rdc_2_smi_event_notif_map.end()) {
return false;
}
return true;
}
rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKey> fk_arr) {
rsmi_status_t ret;
amdsmi_status_t ret;
std::map<uint32_t, uint64_t> new_masks;
for (uint32_t i = 0; i < fk_arr.size(); ++i) {
if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) == rdc_2_rsmi_event_notif_map.end()) {
if (rdc_2_smi_event_notif_map.find(fk_arr[i].second) == rdc_2_smi_event_notif_map.end()) {
continue;
}
new_masks[fk_arr[i].first] |=
RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]);
AMDSMI_EVENT_MASK_FROM_INDEX(rdc_2_smi_event_notif_map[fk_arr[i].second]);
}
std::map<uint32_t, uint64_t>::iterator it = new_masks.begin();
@@ -90,6 +89,15 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKe
continue;
}
// Get processor handle from GPU id
amdsmi_processor_handle processor_handle;
ret = get_processor_handle_from_id(it->first, &processor_handle);
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"Failed to get processor handle for GPU " << it->first << " error: " << ret);
return Smi2RdcError(ret);
}
// Temporarily get DAC capability
ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE);
@@ -98,15 +106,15 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKe
return RDC_ST_PERM_ERROR;
}
ret = rsmi_event_notification_init(it->first);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "rsmi_event_notification_init() returned "
ret = amdsmi_init_gpu_event_notification(processor_handle);
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "amdsmi_init_gpu_event_notification() returned "
<< ret << " for device " << it->first << ". " << std::endl
<< " Will not listen for events on this device");
continue;
}
ret = rsmi_event_notification_mask_set(it->first, it->second);
ret = amdsmi_set_gpu_event_notification_mask(processor_handle, it->second);
// Release DAC capability
sc.Relinquish();
@@ -115,14 +123,14 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector<RdcFieldKe
return RDC_ST_PERM_ERROR;
}
if (ret == RSMI_STATUS_SUCCESS) {
if (ret == AMDSMI_STATUS_SUCCESS) {
gpu_evnt_notif_masks_[it->first] = it->second;
// Log the mask that is now active for this GPU (printed in hex).
RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first << " is set to 0x"
                                                     << std::hex << it->second);
} else {
RDC_LOG(RDC_INFO,
"rsmi_event_notification_mask_set() returned " << ret << " for device " << it->first);
return Rsmi2RdcError(ret);
RDC_LOG(RDC_INFO, "amdsmi_set_gpu_event_notification_mask() returned "
<< ret << " for device " << it->first);
return Smi2RdcError(ret);
}
}
return RDC_ST_OK;
@@ -136,12 +144,12 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32
}
uint32_t f_cnt = std::min(*num_events, kMaxRSMIEvents);
rsmi_evt_notification_data_t rsmi_events[kMaxRSMIEvents];
amdsmi_evt_notification_data_t smi_events[kMaxRSMIEvents];
rsmi_status_t ret = rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events);
amdsmi_status_t ret = amdsmi_get_gpu_event_notification(timeout_ms, &f_cnt, smi_events);
if (ret != RSMI_STATUS_SUCCESS) {
return Rsmi2RdcError(ret);
if (ret != AMDSMI_STATUS_SUCCESS) {
return Smi2RdcError(ret);
}
struct timeval tv;
gettimeofday(&tv, NULL);
@@ -149,35 +157,44 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32
*num_events = f_cnt;
for (uint32_t i = 0; i < f_cnt; ++i) {
assert(rsmi_event_notif_2_rdc_map.find(rsmi_events[i].event) !=
rsmi_event_notif_2_rdc_map.end());
events[i].gpu_id = rsmi_events[i].dv_ind;
events[i].field.field_id = rsmi_event_notif_2_rdc_map[rsmi_events[i].event];
assert(smi_event_notif_2_rdc_map.find(smi_events[i].event) != smi_event_notif_2_rdc_map.end());
uint64_t bdfid;
amdsmi_get_gpu_bdf_id(smi_events[i].processor_handle, &bdfid);
events[i].gpu_id = bdfid;
events[i].field.field_id = smi_event_notif_2_rdc_map[smi_events[i].event];
events[i].field.status = RDC_ST_OK;
events[i].field.ts = now;
events[i].field.type = STRING;
strncpy_with_null(events[i].field.value.str, rsmi_events[i].message, RDC_MAX_STR_LENGTH);
strncpy_with_null(events[i].field.value.str, smi_events[i].message, RDC_MAX_STR_LENGTH);
}
return RDC_ST_OK;
}
rdc_status_t RdcNotificationImpl::stop_listening(uint32_t gpu_id) {
rsmi_status_t ret;
amdsmi_status_t ret;
ret = rsmi_event_notification_mask_set(gpu_id, 0);
if (ret != RSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR,
"rsmi_event_notification_mask_set() returned " << ret << " for device " << gpu_id);
// Get processor handle from GPU id
amdsmi_processor_handle processor_handle;
ret = get_processor_handle_from_id(gpu_id, &processor_handle);
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_id << " error: " << ret);
return Smi2RdcError(ret);
}
ret = rsmi_event_notification_stop(gpu_id);
if (ret == RSMI_STATUS_SUCCESS) {
ret = amdsmi_set_gpu_event_notification_mask(processor_handle, 0);
if (ret != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "amdsmi_set_gpu_event_notification_mask() returned " << ret << " for device "
<< gpu_id);
}
ret = amdsmi_stop_gpu_event_notification(processor_handle);
if (ret == AMDSMI_STATUS_SUCCESS) {
std::lock_guard<std::mutex> guard(notif_mutex_);
gpu_evnt_notif_masks_[gpu_id] = 0;
} else {
RDC_LOG(RDC_ERROR,
"rsmi_event_notification_stop() returned " << ret << " for device " << gpu_id);
"amdsmi_stop_gpu_event_notification() returned " << ret << " for device " << gpu_id);
}
return RDC_ST_OK;
}
@@ -21,21 +21,24 @@ THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcSmiDiagnosticImpl.h"
#include <amd_smi/amdsmi.h>
#include <map>
#include <string>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/impl/RsmiUtils.h"
#include "rdc_lib/impl/SmiUtils.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
RdcSmiDiagnosticImpl::RdcSmiDiagnosticImpl() {}
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) {
rdc_status_t RdcSmiDiagnosticImpl::check_smi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -43,14 +46,14 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
result->test_case = RDC_DIAG_COMPUTE_PROCESS;
result->status = RDC_DIAG_RESULT_SKIP;
result->per_gpu_result_count = 0;
rsmi_status_t err = RSMI_STATUS_SUCCESS;
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
uint32_t num_items = 0;
err = rsmi_compute_process_info_get(nullptr, &num_items);
if (err != RSMI_STATUS_SUCCESS) {
err = amdsmi_get_gpu_compute_process_info(nullptr, &num_items);
if (err != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_ERROR, "Fail to get process information: " << err);
strncpy_with_null(result->info, "Fail to retreive process information from rocm_smi_lib",
strncpy_with_null(result->info, "Fail to retreive process information from amd_smi_lib",
MAX_DIAG_MSG_LENGTH);
return Rsmi2RdcError(err);
return Smi2RdcError(err);
}
// No process found
@@ -63,13 +66,13 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
std::string info;
// Find details of the process running on each GPU
std::vector<rsmi_process_info_t> procs(num_items);
err =
rsmi_compute_process_info_get(reinterpret_cast<rsmi_process_info_t*>(&procs[0]), &num_items);
if (err != RSMI_STATUS_SUCCESS) {
std::vector<amdsmi_process_info_t> procs(num_items);
err = amdsmi_get_gpu_compute_process_info(reinterpret_cast<amdsmi_process_info_t*>(&procs[0]),
&num_items);
if (err != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Fail to get process detail information: " << err);
strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH);
return Rsmi2RdcError(err);
return Smi2RdcError(err);
}
std::map<uint32_t, std::vector<uint32_t>> pids_per_gpu;
@@ -85,17 +88,18 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
// Get the num_devices the process is running
uint32_t num_devices = 0;
err = rsmi_compute_process_gpus_get(procs[i].process_id, nullptr, &num_devices);
if (err != RSMI_STATUS_SUCCESS || num_devices == 0) {
amdsmi_status_t err;
err = amdsmi_get_gpu_compute_process_gpus(procs[i].process_id, nullptr, &num_devices);
if (err != AMDSMI_STATUS_SUCCESS || num_devices == 0) {
RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err);
continue;
}
// Get the details of devices
std::vector<uint32_t> device_details(num_devices);
err = rsmi_compute_process_gpus_get(
err = amdsmi_get_gpu_compute_process_gpus(
procs[i].process_id, reinterpret_cast<uint32_t*>(&device_details[0]), &num_devices);
if (err != RSMI_STATUS_SUCCESS) {
if (err != AMDSMI_STATUS_SUCCESS) {
RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err);
continue;
}
@@ -147,22 +151,22 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD
return RDC_ST_OK;
}
std::string RdcSmiDiagnosticImpl::get_temperature_string(rsmi_temperature_type_t type) const {
std::string RdcSmiDiagnosticImpl::get_temperature_string(amdsmi_temperature_type_t type) const {
switch (type) {
case RSMI_TEMP_TYPE_EDGE:
case TEMPERATURE_TYPE_EDGE:
return "Edge";
case RSMI_TEMP_TYPE_JUNCTION:
case TEMPERATURE_TYPE_JUNCTION:
return "Junction";
case RSMI_TEMP_TYPE_MEMORY:
case TEMPERATURE_TYPE_VRAM:
return "Memory";
default:
return "Unknown";
}
}
std::string RdcSmiDiagnosticImpl::get_voltage_string(rsmi_voltage_type_t type) const {
std::string RdcSmiDiagnosticImpl::get_voltage_string(amdsmi_voltage_type_t type) const {
switch (type) {
case RSMI_VOLT_TYPE_VDDGFX:
case AMDSMI_VOLT_TYPE_VDDGFX:
return "Vddgfx voltage";
default:
return "Unknown";
@@ -170,46 +174,49 @@ std::string RdcSmiDiagnosticImpl::get_voltage_string(rsmi_voltage_type_t type) c
}
// Show topology type
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) {
rdc_status_t RdcSmiDiagnosticImpl::check_smi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
*result = {};
result->test_case = RDC_DIAG_NODE_TOPOLOGY;
const std::map<RSMI_IO_LINK_TYPE, std::string> link_to_string = {
{RSMI_IOLINK_TYPE_UNDEFINED, "Undefined"},
{RSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"},
{RSMI_IOLINK_TYPE_XGMI, "XGMI"},
{RSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"}};
const std::map<amdsmi_io_link_type_t, std::string> link_to_string = {
{AMDSMI_IOLINK_TYPE_UNDEFINED, "Undefined"},
{AMDSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"},
{AMDSMI_IOLINK_TYPE_XGMI, "XGMI"},
{AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"}};
result->status = RDC_DIAG_RESULT_SKIP;
result->per_gpu_result_count = 0;
rsmi_status_t err = RSMI_STATUS_SUCCESS;
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
std::string info = "";
for (uint32_t i = 0; i < gpu_count; i++) {
for (uint32_t j = 0; j < gpu_count; j++) {
if (gpu_index[i] == gpu_index[j]) continue;
std::pair<amdsmi_processor_handle, amdsmi_processor_handle> ph;
// Resolve both endpoints of the link: ph.first is the source GPU (index i),
// ph.second is the destination GPU (index j). Looking up gpu_index[i] twice
// would query the link weight from a GPU to itself.
err = get_processor_handle_from_id(gpu_index[i], &ph.first);
err = get_processor_handle_from_id(gpu_index[j], &ph.second);
uint64_t weight;
err = rsmi_topo_get_link_weight(gpu_index[i], gpu_index[j], &weight);
if (err != RSMI_STATUS_SUCCESS) {
err = amdsmi_topo_get_link_weight(ph.first, ph.second, &weight);
if (err != AMDSMI_STATUS_SUCCESS) {
result->status = RDC_DIAG_RESULT_FAIL;
result->details.code = err;
std::string err_info = "rsmi_topo_get_link_weight(";
err_info += std::to_string(gpu_index[i]) + ",";
err_info += std::to_string(gpu_index[j]) + ", &weight)";
err_info += std::to_string(i) + ",";
err_info += std::to_string(j) + ", &weight)";
err_info += " fail";
strncpy_with_null(result->details.msg, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
strncpy_with_null(result->info, err_info.c_str(), MAX_DIAG_MSG_LENGTH);
return RDC_ST_MSI_ERROR;
}
info += std::to_string(gpu_index[i]) + "=>";
info += std::to_string(gpu_index[j]) + " weight:";
info += std::to_string(i) + "=>";
info += std::to_string(j) + " weight:";
info += std::to_string(weight) + " ";
}
}
@@ -223,9 +230,9 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(uint32_t gpu_index[RDC_M
return RDC_ST_OK;
}
rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) {
rdc_status_t RdcSmiDiagnosticImpl::check_smi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES],
uint32_t gpu_count,
rdc_diag_test_result_t* result) {
if (result == nullptr) {
return RDC_ST_BAD_PARAMETER;
}
@@ -237,27 +244,27 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_
for (uint32_t i = 0; i < gpu_count; i++) {
// temperature
for (rsmi_temperature_type_t sensor_type = RSMI_TEMP_TYPE_FIRST;
sensor_type != RSMI_TEMP_TYPE_LAST;) {
for (amdsmi_temperature_type_t sensor_type = TEMPERATURE_TYPE_FIRST;
sensor_type < TEMPERATURE_TYPE__MAX;) {
auto status = check_temperature_level(gpu_index[i], sensor_type, result->info,
result->gpu_results[i].gpu_result.msg);
// Set to higher error level
if (status > result->status) {
result->status = status;
}
sensor_type = static_cast<rsmi_temperature_type_t>(sensor_type + 1);
sensor_type = static_cast<amdsmi_temperature_type_t>(sensor_type + 1);
}
// Voltage
for (rsmi_voltage_type_t sensor_type = RSMI_VOLT_TYPE_FIRST;
sensor_type != RSMI_VOLT_TYPE_LAST;) {
for (amdsmi_voltage_type_t sensor_type = AMDSMI_VOLT_TYPE_FIRST;
sensor_type < AMDSMI_VOLT_TYPE_LAST;) {
auto status = check_voltage_level(gpu_index[i], sensor_type, result->info,
result->gpu_results[i].gpu_result.msg);
// Set to higher error level
if (status > result->status) {
result->status = status;
}
sensor_type = static_cast<rsmi_voltage_type_t>(sensor_type + 1);
sensor_type = static_cast<amdsmi_voltage_type_t>(sensor_type + 1);
}
// Record the GPU index in the same per-GPU slot that received the temperature
// and voltage messages above (gpu_results[i]); writing through the bare array
// pointer would always overwrite slot 0.
result->gpu_results[i].gpu_index = gpu_index[i];
result->per_gpu_result_count++;
@@ -266,24 +273,25 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_
}
rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
uint32_t gpu_index, rsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH],
uint32_t gpu_index, amdsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH],
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
rdc_diag_result_t result = RDC_DIAG_RESULT_PASS;
rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT;
rsmi_status_t err = RSMI_STATUS_SUCCESS;
amdsmi_temperature_metric_t met = AMDSMI_TEMP_CURRENT;
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
int64_t current_temp = 0;
std::string info = msg;
std::string per_gpu_info = per_gpu_msg;
amdsmi_processor_handle processor_handle;
get_processor_handle_from_id(gpu_index, &processor_handle);
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &current_temp);
if (err != RSMI_STATUS_SUCCESS) return result;
err = amdsmi_get_temp_metric(processor_handle, type, met, &current_temp);
if (err != AMDSMI_STATUS_SUCCESS) return result;
// Max temperature
met = RSMI_TEMP_MAX;
met = AMDSMI_TEMP_MAX;
int64_t max_temp = 0;
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &max_temp);
if (err == RSMI_STATUS_SUCCESS) {
err = amdsmi_get_temp_metric(processor_handle, type, met, &max_temp);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_temp >= max_temp) {
result = RDC_DIAG_RESULT_WARN;
per_gpu_info += "Max ";
@@ -305,10 +313,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
}
}
met = RSMI_TEMP_MIN;
met = AMDSMI_TEMP_MIN;
int64_t min_temp = 0;
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &min_temp);
if (err == RSMI_STATUS_SUCCESS) {
err = amdsmi_get_temp_metric(processor_handle, type, met, &min_temp);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_temp <= min_temp) {
result = RDC_DIAG_RESULT_WARN;
per_gpu_info += "Min ";
@@ -329,10 +337,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
}
}
met = RSMI_TEMP_CRITICAL;
met = AMDSMI_TEMP_CRITICAL;
int64_t critical_temp = 0;
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_temp);
if (err == RSMI_STATUS_SUCCESS) {
err = amdsmi_get_temp_metric(processor_handle, type, met, &critical_temp);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_temp >= critical_temp) {
result = RDC_DIAG_RESULT_FAIL;
per_gpu_info += "Critical ";
@@ -353,10 +361,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
}
}
met = RSMI_TEMP_EMERGENCY;
met = AMDSMI_TEMP_EMERGENCY;
int64_t emergency_temp = 0;
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &emergency_temp);
if (err == RSMI_STATUS_SUCCESS) {
err = amdsmi_get_temp_metric(processor_handle, type, met, &emergency_temp);
if (err == AMDSMI_STATUS_SUCCESS) {
// Compare against the emergency threshold that was just fetched, not the
// critical threshold read by the previous check.
if (current_temp >= emergency_temp) {
result = RDC_DIAG_RESULT_FAIL;
per_gpu_info += "Emergency ";
@@ -377,10 +385,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
}
}
met = RSMI_TEMP_CRIT_MIN;
met = AMDSMI_TEMP_CRIT_MIN;
int64_t critical_min_temp = 0;
err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_min_temp);
if (err == RSMI_STATUS_SUCCESS) {
err = amdsmi_get_temp_metric(processor_handle, type, met, &critical_min_temp);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_temp <= critical_min_temp) {
result = RDC_DIAG_RESULT_FAIL;
per_gpu_info += "Critical Min ";
@@ -408,24 +416,26 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level(
}
rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
rsmi_voltage_type_t type,
amdsmi_voltage_type_t type,
char msg[MAX_DIAG_MSG_LENGTH],
char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) {
rdc_diag_result_t result = RDC_DIAG_RESULT_PASS;
rsmi_voltage_metric_t met = RSMI_VOLT_CURRENT;
rsmi_status_t err = RSMI_STATUS_SUCCESS;
amdsmi_voltage_metric_t met = AMDSMI_VOLT_CURRENT;
amdsmi_status_t err = AMDSMI_STATUS_SUCCESS;
int64_t current_voltage = 0;
std::string info = msg;
std::string per_gpu_info = per_gpu_msg;
amdsmi_processor_handle processor_handle;
get_processor_handle_from_id(gpu_index, &processor_handle);
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &current_voltage);
if (err != RSMI_STATUS_SUCCESS) return result;
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &current_voltage);
if (err != AMDSMI_STATUS_SUCCESS) return result;
// Max voltage
met = RSMI_VOLT_MAX;
met = AMDSMI_VOLT_MAX;
int64_t max_volt = 0;
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &max_volt);
if (err == RSMI_STATUS_SUCCESS) {
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &max_volt);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_voltage >= max_volt) {
result = RDC_DIAG_RESULT_WARN;
per_gpu_info += "Max ";
@@ -448,10 +458,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
}
// Min voltage
met = RSMI_VOLT_MIN;
met = AMDSMI_VOLT_MIN;
int64_t min_volt = 0;
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &min_volt);
if (err == RSMI_STATUS_SUCCESS) {
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &min_volt);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_voltage <= min_volt) {
result = RDC_DIAG_RESULT_WARN;
per_gpu_info += "Min ";
@@ -474,10 +484,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
}
// Max Critical voltage
met = RSMI_VOLT_MAX_CRIT;
met = AMDSMI_VOLT_MAX_CRIT;
int64_t critical_max_volt = 0;
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_max_volt);
if (err == RSMI_STATUS_SUCCESS) {
// Query the same device as the current/max/min reads above; a null handle
// cannot identify the GPU, so the critical-max check would never run.
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &critical_max_volt);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_voltage >= critical_max_volt) {
result = RDC_DIAG_RESULT_FAIL;
per_gpu_info += "Critical Max ";
@@ -500,10 +510,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index,
}
// Min Critical voltage
met = RSMI_VOLT_MIN_CRIT;
met = AMDSMI_VOLT_MIN_CRIT;
int64_t critical_min_volt = 0;
err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_min_volt);
if (err == RSMI_STATUS_SUCCESS) {
// Query the same device as the current/max/min reads above; a null handle
// cannot identify the GPU, so the critical-min check would never run.
err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &critical_min_volt);
if (err == AMDSMI_STATUS_SUCCESS) {
if (current_voltage <= critical_min_volt) {
result = RDC_DIAG_RESULT_FAIL;
per_gpu_info += "Critical Min ";
@@ -42,8 +42,8 @@ RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf)
}
}
// Bulk fetch wrapper for the rocm_smi_lib. This will be replaced after
// rocm_smi_lib can support bulk fetch.
// Bulk fetch wrapper for the amd_smi_lib. This will be replaced after
// amd_smi_lib can support bulk fetch.
rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
uint32_t fields_count,
rdc_field_value_f callback,
@@ -52,7 +52,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
return RDC_ST_BAD_PARAMETER;
}
RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocm_smi_lib.");
RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from amd_smi_lib.");
// Bulk fetch fields
std::vector<rdc_gpu_field_value_t> bulk_results;
@@ -60,7 +60,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields,
rdc_status_t status =
metric_fetcher_->bulk_fetch_smi_fields(fields, fields_count, bulk_results);
RDC_LOG(RDC_DEBUG, "Bulk fetched " << bulk_results.size()
<< " fields from rocm_smi_lib which return " << status);
<< " fields from amd_smi_lib which return " << status);
if (bulk_results.size() > 0) {
rdc_status_t status = callback(&bulk_results[0], bulk_results.size(), user_data);
if (status != RDC_ST_OK) {
@@ -116,12 +116,12 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint
}
for (uint32_t i = 0; i < fields_count; i++) {
ret = metric_fetcher_->acquire_rsmi_handle({fields[i].gpu_index, fields[i].field_id});
ret = metric_fetcher_->acquire_smi_handle({fields[i].gpu_index, fields[i].field_id});
if (ret != RDC_ST_OK) {
RDC_LOG(RDC_ERROR, "Failed to acquire rocm_smi handle for field.");
RDC_LOG(RDC_ERROR, "Failed to acquire amd_smi handle for field.");
}
}
RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from rocm_smi_lib");
RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from amd_smi_lib");
return RDC_ST_OK;
}
@@ -133,9 +133,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields,
}
for (uint32_t i = 0; i < fields_count; i++) {
metric_fetcher_->delete_rsmi_handle({fields[i].gpu_index, fields[i].field_id});
metric_fetcher_->delete_smi_handle({fields[i].gpu_index, fields[i].field_id});
}
RDC_LOG(RDC_DEBUG, "delete " << fields_count << " field handles from rocm_smi_lib");
RDC_LOG(RDC_DEBUG, "delete " << fields_count << " field handles from amd_smi_lib");
return RDC_ST_OK;
}
@@ -146,7 +146,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
return RDC_ST_BAD_PARAMETER;
}
// List of fields supported by rocm_smi_lib
// List of fields supported by amd_smi_lib
const std::vector<uint32_t> fields{
RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME,
RDC_FI_GPU_CLOCK, RDC_FI_MEM_CLOCK,
@@ -192,11 +192,11 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case,
}
switch (test_case) {
case RDC_DIAG_COMPUTE_PROCESS:
return smi_diag_->check_rsmi_process_info(gpu_index, gpu_count, result);
return smi_diag_->check_smi_process_info(gpu_index, gpu_count, result);
case RDC_DIAG_NODE_TOPOLOGY:
return smi_diag_->check_rsmi_topo_info(gpu_index, gpu_count, result);
return smi_diag_->check_smi_topo_info(gpu_index, gpu_count, result);
case RDC_DIAG_GPU_PARAMETERS:
return smi_diag_->check_rsmi_param_info(gpu_index, gpu_count, result);
return smi_diag_->check_smi_param_info(gpu_index, gpu_count, result);
default:
return RDC_ST_NOT_SUPPORTED;
}
@@ -1,72 +0,0 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
// Translates a rocm_smi status code into the corresponding RDC status code.
// Codes without a dedicated RDC equivalent collapse to RDC_ST_UNKNOWN_ERROR.
rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi) {
  // Start from the catch-all value; only codes with a direct RDC counterpart
  // overwrite it below.
  rdc_status_t converted = RDC_ST_UNKNOWN_ERROR;

  switch (rsmi) {
    case RSMI_STATUS_SUCCESS:
      converted = RDC_ST_OK;
      break;
    case RSMI_STATUS_INVALID_ARGS:
      converted = RDC_ST_BAD_PARAMETER;
      break;
    case RSMI_STATUS_NOT_SUPPORTED:
      converted = RDC_ST_NOT_SUPPORTED;
      break;
    case RSMI_STATUS_NOT_FOUND:
      converted = RDC_ST_NOT_FOUND;
      break;
    case RSMI_STATUS_OUT_OF_RESOURCES:
      converted = RDC_ST_INSUFF_RESOURCES;
      break;
    case RSMI_STATUS_FILE_ERROR:
      converted = RDC_ST_FILE_ERROR;
      break;
    case RSMI_STATUS_NO_DATA:
      converted = RDC_ST_NO_DATA;
      break;
    case RSMI_STATUS_PERMISSION:
      converted = RDC_ST_PERM_ERROR;
      break;

    // Listed explicitly to show they were considered; all of them are
    // reported as an unknown error.
    case RSMI_STATUS_BUSY:
    case RSMI_STATUS_UNKNOWN_ERROR:
    case RSMI_STATUS_INTERNAL_EXCEPTION:
    case RSMI_STATUS_INPUT_OUT_OF_BOUNDS:
    case RSMI_STATUS_INIT_ERROR:
    case RSMI_STATUS_NOT_YET_IMPLEMENTED:
    case RSMI_STATUS_INSUFFICIENT_SIZE:
    case RSMI_STATUS_INTERRUPT:
    case RSMI_STATUS_UNEXPECTED_SIZE:
    case RSMI_STATUS_UNEXPECTED_DATA:
    case RSMI_STATUS_REFCOUNT_OVERFLOW:
    default:
      break;
  }

  return converted;
}
} // namespace rdc
} // namespace amd
@@ -0,0 +1,142 @@
/*
Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/SmiUtils.h"
#include <cstdint>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
namespace amd {
namespace rdc {
// Translates an amd_smi status code into the corresponding RDC status code.
// Codes without a dedicated RDC equivalent collapse to RDC_ST_UNKNOWN_ERROR.
rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) {
  // Start from the catch-all value; only codes with a direct RDC counterpart
  // overwrite it below.
  rdc_status_t converted = RDC_ST_UNKNOWN_ERROR;

  switch (rsmi) {
    case AMDSMI_STATUS_SUCCESS:
      converted = RDC_ST_OK;
      break;
    case AMDSMI_STATUS_INVAL:
      converted = RDC_ST_BAD_PARAMETER;
      break;
    case AMDSMI_STATUS_NOT_SUPPORTED:
      converted = RDC_ST_NOT_SUPPORTED;
      break;
    case AMDSMI_STATUS_NOT_FOUND:
      converted = RDC_ST_NOT_FOUND;
      break;
    case AMDSMI_STATUS_OUT_OF_RESOURCES:
      converted = RDC_ST_INSUFF_RESOURCES;
      break;
    case AMDSMI_STATUS_FILE_ERROR:
      converted = RDC_ST_FILE_ERROR;
      break;
    case AMDSMI_STATUS_NO_DATA:
      converted = RDC_ST_NO_DATA;
      break;
    case AMDSMI_STATUS_NO_PERM:
      converted = RDC_ST_PERM_ERROR;
      break;

    // Listed explicitly to show they were considered; all of them are
    // reported as an unknown error.
    case AMDSMI_STATUS_BUSY:
    case AMDSMI_STATUS_UNKNOWN_ERROR:
    case AMDSMI_STATUS_INTERNAL_EXCEPTION:
    case AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS:
    case AMDSMI_STATUS_INIT_ERROR:
    case AMDSMI_STATUS_NOT_YET_IMPLEMENTED:
    case AMDSMI_STATUS_INSUFFICIENT_SIZE:
    case AMDSMI_STATUS_INTERRUPT:
    case AMDSMI_STATUS_UNEXPECTED_SIZE:
    case AMDSMI_STATUS_UNEXPECTED_DATA:
    case AMDSMI_STATUS_REFCOUNT_OVERFLOW:
    default:
      break;
  }

  return converted;
}
// Looks up the amd_smi processor handle for a flat GPU index.
//
// Every socket is enumerated, the processors of each socket are appended in
// enumeration order, and gpu_id is used as an index into that flattened list.
//
// Parameters:
//   gpu_id            zero-based index into the flattened processor list.
//   processor_handle  output; written only on success.
//
// Returns:
//   AMDSMI_STATUS_SUCCESS              handle stored in *processor_handle.
//   AMDSMI_STATUS_INVAL                processor_handle is null.
//   AMDSMI_STATUS_NOT_SUPPORTED        an enumerated processor is not an AMD_GPU.
//   AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS  gpu_id is past the last processor.
//   any other value                    error propagated from an amd_smi call.
amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
                                             amdsmi_processor_handle* processor_handle) {
  if (processor_handle == nullptr) {
    return AMDSMI_STATUS_INVAL;
  }

  // First call with a null buffer only reports how many sockets exist.
  uint32_t socket_count = 0;
  auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  std::vector<amdsmi_socket_handle> sockets(socket_count);
  std::vector<amdsmi_processor_handle> all_processors{};
  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Without this check the loop below would walk unfilled socket handles.
    return ret;
  }
  // NOTE(review): assumes socket_count is updated to the number of handles
  // actually written; drop any trailing unfilled slots. Confirm against the
  // amd_smi API reference.
  if (socket_count < sockets.size()) {
    sockets.resize(socket_count);
  }

  for (auto& socket : sockets) {
    uint32_t processor_count = 0;
    ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }

    std::vector<amdsmi_processor_handle> processors(processor_count);
    ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data());
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }
    // NOTE(review): same assumption as for socket_count above.
    if (processor_count < processors.size()) {
      processors.resize(processor_count);
    }

    for (auto& processor : processors) {
      processor_type_t processor_type = {};
      ret = amdsmi_get_processor_type(processor, &processor_type);
      if (ret != AMDSMI_STATUS_SUCCESS) {
        // Do not judge the device type from a value the call never filled in.
        return ret;
      }
      if (processor_type != AMD_GPU) {
        RDC_LOG(RDC_ERROR, "Expect AMD_GPU device type!");
        return AMDSMI_STATUS_NOT_SUPPORTED;
      }
      all_processors.push_back(processor);
    }
  }

  if (gpu_id >= all_processors.size()) {
    return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS;
  }

  // Get processor handle from GPU id
  *processor_handle = all_processors[gpu_id];

  return AMDSMI_STATUS_SUCCESS;
}
// Counts the processors reported by amd_smi across all sockets.
//
// Parameters:
//   all_processor_count  output; set to the sum of the per-socket processor
//                        counts. Written only on success.
//
// Returns AMDSMI_STATUS_SUCCESS, or the first error returned by an amd_smi
// enumeration call.
amdsmi_status_t get_processor_count(uint32_t& all_processor_count) {
  uint32_t total_processor_count = 0;

  // First call with a null buffer only reports how many sockets exist.
  uint32_t socket_count = 0;
  auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
  if (ret != AMDSMI_STATUS_SUCCESS) {
    return ret;
  }

  std::vector<amdsmi_socket_handle> sockets(socket_count);
  ret = amdsmi_get_socket_handles(&socket_count, sockets.data());
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Without this check the loop below would walk unfilled socket handles.
    return ret;
  }
  // NOTE(review): assumes socket_count is updated to the number of handles
  // actually written; drop any trailing unfilled slots. Confirm against the
  // amd_smi API reference.
  if (socket_count < sockets.size()) {
    sockets.resize(socket_count);
  }

  for (auto& socket : sockets) {
    // Null buffer: only the per-socket count is requested.
    uint32_t processor_count = 0;
    ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr);
    if (ret != AMDSMI_STATUS_SUCCESS) {
      return ret;
    }
    total_processor_count += processor_count;
  }

  all_processor_count = total_processor_count;

  return AMDSMI_STATUS_SUCCESS;
}
} // namespace rdc
} // namespace amd
@@ -34,7 +34,7 @@ if(BUILD_ROCPTEST)
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${COMMON_DIR}"
"${RSMI_INC_DIR}"
"${SMI_INC_DIR}"
"${ROCM_DIR}/include"
"${ROCM_DIR}/include/hsa")
@@ -45,7 +45,7 @@ if(BUILD_ROCRTEST)
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${COMMON_DIR}"
"${RSMI_INC_DIR}"
"${SMI_INC_DIR}"
"${ROCM_DIR}/include")
# Set the VERSION and SOVERSION values
@@ -32,20 +32,18 @@ if(BUILD_RVS)
find_package(hsa-runtime64 REQUIRED)
find_package(rvs REQUIRED
HINTS ${ROCM_DIR}/lib/cmake)
find_library(rvslib REQUIRED
NAMES rvslib)
## additional libraries
set(COMBINED_LIBS rocblas hsakmt hsa-runtime64 hip::amdhip64 yaml-cpp)
set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_RVS_LIB} PARENT_SCOPE)
add_library(${RDC_RVS_LIB} SHARED ${RDC_RVS_LIB_SRC_LIST} ${RDC_RVS_LIB_INC_LIST})
target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvslib} pthread dl ${COMBINED_LIBS})
target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvs} pthread dl ${COMBINED_LIBS})
target_include_directories(${RDC_RVS_LIB} PRIVATE
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${COMMON_DIR}"
"${RSMI_INC_DIR}"
"${SMI_INC_DIR}"
"${ROCM_DIR}/include"
"${ROCM_DIR}/include/hsa"
"${ROCM_VALIDATION_SUITE_INCLUDE_DIR}")
+2 -2
Bestand weergeven
@@ -34,8 +34,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
message("--------SMI Lib Dir: " ${SMI_LIB_DIR})
message("--------SMI Inc Dir: " ${SMI_INC_DIR})
message("-------GRPC ROOT Dir: " ${GRPC_ROOT})
message("")
+5 -6
Bestand weergeven
@@ -31,8 +31,8 @@ message("----------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("----------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("----------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("----------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("----------RSMI Lib Dir: " ${RSMI_LIB_DIR})
message("----------RSMI Inc Dir: " ${RSMI_INC_DIR})
message("----------SMI Lib Dir: " ${SMI_LIB_DIR})
message("----------SMI Inc Dir: " ${SMI_INC_DIR})
message("---------GRPC Root Dir: " ${GRPC_ROOT})
message("")
@@ -59,7 +59,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include
"${PROJECT_SOURCE_DIR}/include"
"${GRPC_ROOT}/include"
"${PROTOB_OUT_DIR}"
"${RSMI_INC_DIR}"
"${SMI_INC_DIR}"
"${PROJECT_SOURCE_DIR}")
set(SERVER_SRC_LIST
@@ -68,7 +68,6 @@ set(SERVER_SRC_LIST
"${PROTOBUF_GENERATED_SRCS}"
"${SRC_DIR}/rdc_admin_service.cc"
"${SRC_DIR}/rdc_api_service.cc"
"${SRC_DIR}/rdc_rsmi_service.cc"
"${SRC_DIR}/rdc_server_main.cc")
message("SERVER_SRC_LIST=${SERVER_SRC_LIST}")
@@ -76,7 +75,7 @@ set(SERVER_DAEMON_EXE "rdcd")
configure_file("rdc.service.in" "${PROJECT_BINARY_DIR}/rdc.service" @ONLY)
set(SERVICE_FILE_NAME "rdc.service")
link_directories(${RSMI_LIB_DIR})
link_directories(${SMI_LIB_DIR})
add_executable(${SERVER_DAEMON_EXE} "${SERVER_SRC_LIST}")
@@ -85,7 +84,7 @@ set_target_properties(${SERVER_DAEMON_EXE}
PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib")
target_link_libraries(${SERVER_DAEMON_EXE} pthread rt gRPC::grpc++
cap dl rocm_smi64 rdc_bootstrap)
cap dl amd_smi rdc_bootstrap)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${SERVER_DAEMON_EXE}
PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE GROUP_READ
@@ -22,9 +22,9 @@ THE SOFTWARE.
#ifndef SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_
#define SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_
#include "amd_smi/amdsmi.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_admin_service.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
@@ -1,65 +0,0 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_
#define SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_rsmi_service.h"
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
class RsmiServiceImpl final : public ::rdc::Rsmi::Service {
public:
RsmiServiceImpl();
~RsmiServiceImpl();
rsmi_status_t Initialize(uint64_t rsmi_init_flags = 0);
::grpc::Status GetNumDevices(::grpc::ServerContext* context,
const ::rdc::GetNumDevicesRequest* request,
::rdc::GetNumDevicesResponse* reply) override;
::grpc::Status GetTemperature(::grpc::ServerContext* context,
const ::rdc::GetTemperatureRequest* request,
::rdc::GetTemperatureResponse* response) override;
::grpc::Status GetFanRpms(::grpc::ServerContext* context, const ::rdc::GetFanRpmsRequest* request,
::rdc::GetFanRpmsResponse* response) override;
::grpc::Status GetFanSpeed(::grpc::ServerContext* context,
const ::rdc::GetFanSpeedRequest* request,
::rdc::GetFanSpeedResponse* response) override;
::grpc::Status GetFanSpeedMax(::grpc::ServerContext* context,
const ::rdc::GetFanSpeedMaxRequest* request,
::rdc::GetFanSpeedMaxResponse* response) override;
private:
bool rsmi_initialized_;
};
} // namespace rdc
} // namespace amd
#endif // SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_
@@ -29,7 +29,6 @@ THE SOFTWARE.
#include "rdc/rdc_admin_service.h"
#include "rdc/rdc_api_service.h"
#include "rdc/rdc_rsmi_service.h"
typedef struct {
std::string listen_address;
@@ -49,9 +48,6 @@ class RDCServer {
void Run(void);
void ShutDown(void);
bool start_rsmi_service(void) const { return start_rsmi_service_; }
void set_start_rsmi_service(bool s) { start_rsmi_service_ = s; }
bool start_rdc_admin_service(void) const { return start_rdc_admin_service_; }
void set_start_rdc_admin_service(bool s) { start_rdc_admin_service_ = s; }
@@ -68,8 +64,6 @@ class RDCServer {
bool secure_creds_;
bool use_pinned_certs_;
bool log_debug_;
bool start_rsmi_service_;
amd::rdc::RsmiServiceImpl* rsmi_service_;
RdcdCmdLineOpts* cmd_line_;
bool start_rdc_admin_service_;
@@ -1,175 +0,0 @@
/*
Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc/rdc_rsmi_service.h"
#include <assert.h>
#include <grpcpp/grpcpp.h>
#include <csignal>
#include <iostream>
#include <memory>
#include <string>
#include "rdc.grpc.pb.h" // NOLINT
#include "rocm_smi/rocm_smi.h"
namespace amd {
namespace rdc {
// Starts out uninitialized; Initialize() must succeed before the destructor
// will attempt to shut RSMI down.
RsmiServiceImpl::RsmiServiceImpl() : rsmi_initialized_(false) {}

// Shuts the RSMI library down, but only if this object initialized it.
RsmiServiceImpl::~RsmiServiceImpl() {
  if (!rsmi_initialized_) {
    return;
  }

  const rsmi_status_t rsmi_ret = rsmi_shut_down();
  rsmi_initialized_ = false;

  // Report the failure explicitly: the assert below compiles away under
  // NDEBUG, which previously left rsmi_ret unused and the error silent.
  if (rsmi_ret != RSMI_STATUS_SUCCESS) {
    std::cout << "rsmi_shut_down() returned error" << std::endl;
  }
  assert(rsmi_ret == RSMI_STATUS_SUCCESS);
}
// Converts the temperature-metric enum used in RDC requests into the RSMI
// enum. This is a plain value-preserving cast; the conversion is kept in one
// place so it can grow real mapping logic if the two enums ever diverge.
static rsmi_temperature_metric_t rdc_temp2rsmi_temp(
    ::rdc::GetTemperatureRequest_TemperatureMetric rdc_temp) {
  const auto rsmi_metric = static_cast<rsmi_temperature_metric_t>(rdc_temp);
  return rsmi_metric;
}
// Initializes the RSMI library with the supplied flags.
//
// On success the object records that RSMI is up so the destructor will shut
// it down. On failure a message is printed and the recorded state is left
// untouched. The status from rsmi_init() is returned in both cases.
rsmi_status_t RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) {
  const rsmi_status_t status = rsmi_init(rsmi_init_flags);

  if (status == RSMI_STATUS_SUCCESS) {
    rsmi_initialized_ = true;
    return status;
  }

  std::cout << "rsmi_init() returned error" << std::endl;
  return status;
}
// Handles the GetNumDevices RPC.
//
// Fills the reply with the device count obtained from
// rsmi_num_monitor_devices() and the RSMI status of that call. The gRPC
// status is always OK; RSMI failures are conveyed through ret_val.
::grpc::Status RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context,
                                              const ::rdc::GetNumDevicesRequest* request,
                                              ::rdc::GetNumDevicesResponse* reply) {
  (void)context;  // Quiet warning for now;
  (void)request;
  assert(reply != nullptr);

  // Zero-initialize so a failed RSMI call never leaks an indeterminate value
  // into the reply.
  uint32_t num_devices = 0;
  const rsmi_status_t ret = rsmi_num_monitor_devices(&num_devices);

  // TODO(cfreehil) replace below with macro
  if (ret != RSMI_STATUS_SUCCESS) {
    std::cout << "rsmi_num_monitor_devices() returned error" << std::endl;
  }

  reply->set_val(num_devices);
  reply->set_ret_val(ret);
  return ::grpc::Status::OK;
}
// Handles the GetTemperature RPC.
//
// Queries rsmi_dev_temp_metric_get() using the device index, sensor type and
// metric taken from the request, then stores the reading and the RSMI status
// in the response. The gRPC status is always OK; RSMI failures are conveyed
// through ret_val.
::grpc::Status RsmiServiceImpl::GetTemperature(::grpc::ServerContext* context,
                                               const ::rdc::GetTemperatureRequest* request,
                                               ::rdc::GetTemperatureResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(request != nullptr);
  assert(response != nullptr);

  // Zero-initialize so a failed RSMI call never leaks an indeterminate value
  // into the response.
  int64_t temperature = 0;
  const rsmi_status_t ret =
      rsmi_dev_temp_metric_get(request->dv_ind(), request->sensor_type(),
                               rdc_temp2rsmi_temp(request->metric()), &temperature);

  response->set_temperature(temperature);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
// Handles the GetFanRpms RPC.
//
// Queries rsmi_dev_fan_rpms_get() for the device and sensor named in the
// request, then stores the reading and the RSMI status in the response. The
// gRPC status is always OK; RSMI failures are conveyed through ret_val.
::grpc::Status RsmiServiceImpl::GetFanRpms(::grpc::ServerContext* context,
                                           const ::rdc::GetFanRpmsRequest* request,
                                           ::rdc::GetFanRpmsResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(request != nullptr);
  assert(response != nullptr);

  // Zero-initialize so a failed RSMI call never leaks an indeterminate value
  // into the response.
  int64_t rpms = 0;
  const rsmi_status_t ret =
      rsmi_dev_fan_rpms_get(request->dv_ind(), request->sensor_ind(), &rpms);

  response->set_rpms(rpms);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
// Handles the GetFanSpeed RPC.
//
// Queries rsmi_dev_fan_speed_get() for the device and sensor named in the
// request, then stores the reading and the RSMI status in the response. The
// gRPC status is always OK; RSMI failures are conveyed through ret_val.
::grpc::Status RsmiServiceImpl::GetFanSpeed(::grpc::ServerContext* context,
                                            const ::rdc::GetFanSpeedRequest* request,
                                            ::rdc::GetFanSpeedResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(request != nullptr);
  assert(response != nullptr);

  // Zero-initialize so a failed RSMI call never leaks an indeterminate value
  // into the response.
  int64_t speed = 0;
  const rsmi_status_t ret =
      rsmi_dev_fan_speed_get(request->dv_ind(), request->sensor_ind(), &speed);

  response->set_speed(speed);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
// Handles the GetFanSpeedMax RPC.
//
// Queries rsmi_dev_fan_speed_max_get() for the device and sensor named in the
// request, then stores the reading and the RSMI status in the response. The
// gRPC status is always OK; RSMI failures are conveyed through ret_val.
::grpc::Status RsmiServiceImpl::GetFanSpeedMax(::grpc::ServerContext* context,
                                               const ::rdc::GetFanSpeedMaxRequest* request,
                                               ::rdc::GetFanSpeedMaxResponse* response) {
  (void)context;  // Quiet warning for now;
  assert(request != nullptr);
  assert(response != nullptr);

  // Zero-initialize so a failed RSMI call never leaks an indeterminate value
  // into the response.
  uint64_t max_speed = 0;
  const rsmi_status_t ret =
      rsmi_dev_fan_speed_max_get(request->dv_ind(), request->sensor_ind(), &max_speed);

  response->set_max_speed(max_speed);
  response->set_ret_val(ret);
  return ::grpc::Status::OK;
}
// TODO(cfreehil): read server config from YAML file. Config can include things
// like server address, Secure/Insecure creds, rsmi_init flags, etc.
//
// Runs a stand-alone, insecure gRPC server that exposes RsmiServiceImpl on
// 0.0.0.0:50051 and blocks until the server is shut down.
//
// RSMI is initialized through the service object *before* the server starts
// accepting requests, so that (a) no RPC can reach an uninitialized library
// and (b) the service destructor shuts RSMI down again on exit.
void RunServer() {
  const std::string server_address("0.0.0.0:50051");
  RsmiServiceImpl service;

  const uint64_t flags = 0;  // TODO(cfreehil) Read this from config file
  if (service.Initialize(flags) != RSMI_STATUS_SUCCESS) {
    std::cout << "rsmi_init() returned error. Exiting" << std::endl;
    return;
  }

  ::grpc::ServerBuilder builder;
  // Listen on the given address without any authentication mechanism.
  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
  // Register "service" as the instance through which we'll communicate with
  // clients. In this case it corresponds to a *synchronous* service.
  builder.RegisterService(&service);

  // Finally assemble the server. BuildAndStart() yields a null pointer when
  // the server could not be started (e.g. the port cannot be bound).
  std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
  if (!server) {
    std::cout << "Failed to start server on " << server_address << ". Exiting" << std::endl;
    return;
  }
  std::cout << "Server listening on " << server_address << std::endl;

  // Wait for the server to shutdown. Note that some other thread must be
  // responsible for shutting down the server for this call to ever return.
  server->Wait();
}
} // namespace rdc
} // namespace amd
@@ -38,12 +38,11 @@ THE SOFTWARE.
#include <memory>
#include <string>
#include "amd_smi/amdsmi.h"
#include "common/rdc_capabilities.h"
#include "common/rdc_utils.h"
#include "rdc.grpc.pb.h" // NOLINT
#include "rdc/rdc_api_service.h"
#include "rdc/rdc_rsmi_service.h"
#include "rocm_smi/rocm_smi.h"
// TODO(cfreehil):
// The following need to be made configurable (e.g., from YAML):
@@ -76,8 +75,7 @@ static const char* kDefaultListenAddress = "0.0.0.0";
static const char* kDefaultListenPort = "50051";
static const uint32_t kRSMIUMask = 027;
RDCServer::RDCServer()
: secure_creds_(false), rsmi_service_(nullptr), rdc_admin_service_(nullptr) {}
RDCServer::RDCServer() : secure_creds_(false), rdc_admin_service_(nullptr) {}
RDCServer::~RDCServer() {}
@@ -195,18 +193,6 @@ void RDCServer::Run() {
builder.RegisterService(rdc_admin_service_);
}
if (start_rsmi_service()) {
rsmi_service_ = new amd::rdc::RsmiServiceImpl();
builder.RegisterService(rsmi_service_);
rsmi_status_t ret = rsmi_service_->Initialize(0);
if (ret != RSMI_STATUS_SUCCESS) {
std::cerr << "Failed to start RSMI service. ret = " << ret << std::endl;
return;
}
}
if (start_api_service()) {
api_service_ = new amd::rdc::RdcAPIServiceImpl();
builder.RegisterService(api_service_);
@@ -287,11 +273,6 @@ static int FileOwner(const char* fn, std::string* owner) {
void RDCServer::ShutDown(void) {
server_->Shutdown();
if (rsmi_service_) {
delete rsmi_service_;
rsmi_service_ = nullptr;
}
if (rdc_admin_service_) {
delete rdc_admin_service_;
rdc_admin_service_ = nullptr;
@@ -673,7 +654,6 @@ int main(int argc, char** argv) {
}
// TODO(cfreehil): Eventually, set these by reading a config file
rdc_server.set_start_rsmi_service(true);
rdc_server.set_start_rdc_admin_service(true);
rdc_server.set_start_api_service(true);
+4 -4
Bestand weergeven
@@ -26,9 +26,9 @@ THE SOFTWARE.
// This file is generated on build.
#define rocm_smi_VERSION_MAJOR @rocm_smi_VERSION_MAJOR@
#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@
#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
#define amd_smi_VERSION_MAJOR @amd_smi_VERSION_MAJOR@
#define amd_smi_VERSION_MINOR @amd_smi_VERSION_MINOR@
#define amd_smi_VERSION_PATCH @amd_smi_VERSION_PATCH@
#define amd_smi_VERSION_BUILD "@amd_smi_VERSION_BUILD@"
#endif // INCLUDE_RDC_RDC64CONFIG_H_
@@ -45,7 +45,7 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
message("--------SMI Inc Dir: " ${SMI_INC_DIR})
message("")
set(SRC_DIR "${PROJECT_SOURCE_DIR}/tests/example")
@@ -69,7 +69,7 @@ add_executable(${TEST_CLIENT_EXE} "${EXAMPLE_SRC_LIST}")
target_include_directories(${TEST_CLIENT_EXE} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}/../../client/include"
"${RSMI_INC_DIR}")
"${SMI_INC_DIR}")
target_link_libraries(${TEST_CLIENT_EXE} rdc_client)
@@ -28,7 +28,7 @@ THE SOFTWARE.
#include <iostream>
#include "rocm_smi/rocm_smi.h"
#include "amd_smi/amdsmi.h"
#define CHK_RET_STATUS(RET) \
if ((RET) != RDC_STATUS_SUCCESS) { \
@@ -41,8 +41,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
message("--------SMI Lib Dir: " ${SMI_LIB_DIR})
message("--------SMI Inc Dir: " ${SMI_INC_DIR})
message("")
set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
@@ -63,7 +63,7 @@ aux_source_directory(${SRC_DIR} rdctstSources)
# Other source directories
aux_source_directory(${SRC_DIR}/functional functionalSources)
link_directories(${ROCM_INSTALL_DIR} ${RSMI_LIB_DIR})
link_directories(${ROCM_INSTALL_DIR} ${SMI_LIB_DIR})
# Build rules
add_executable(${RDCTST} ${rdctstSources} ${functionalSources})
@@ -72,7 +72,7 @@ add_executable(${RDCTST} ${rdctstSources} ${functionalSources})
target_include_directories(
${RDCTST}
PUBLIC ${PROJECT_SOURCE_DIR}/include
PUBLIC ${RSMI_INC_DIR}
PUBLIC ${SMI_INC_DIR}
PUBLIC ${SRC_DIR}/..)
target_link_libraries(${RDCTST}
+1 -1
Bestand weergeven
@@ -29,6 +29,7 @@ THE SOFTWARE.
#include <string>
#include <vector>
#include "amd_smi/amdsmi.h"
#include "functional/rdci_discovery.h"
#include "functional/rdci_dmon.h"
#include "functional/rdci_fieldgroup.h"
@@ -37,7 +38,6 @@ THE SOFTWARE.
#include "rdc/rdc.h"
#include "rdc_tests/test_base.h"
#include "rdc_tests/test_common.h"
#include "rocm_smi/rocm_smi.h"
static RDCTstGlobals* sRDCGlvalues = nullptr;
@@ -24,8 +24,8 @@ THE SOFTWARE.
#include <assert.h>
#include <gtest/gtest.h>
#include "amd_smi/amdsmi.h"
#include "rdc_tests/test_common.h"
#include "rocm_smi/rocm_smi.h"
static const int kOutputLineLength = 80;
static const char kLabelDelimiter[] = "####";
@@ -30,8 +30,8 @@ THE SOFTWARE.
#include <map>
#include <string>
#include "amd_smi/amdsmi.h"
#include "rdc_tests/test_base.h"
#include "rocm_smi/rocm_smi.h"
/*static const std::map<grpc_connectivity_state, const char *> kGRPCChanState =
{
@@ -47,40 +47,40 @@ THE SOFTWARE.
},
};
*/
static const std::map<rsmi_gpu_block_t, const char*> kBlockNameMap = {
{RSMI_GPU_BLOCK_UMC, "UMC"}, {RSMI_GPU_BLOCK_SDMA, "SDMA"},
{RSMI_GPU_BLOCK_GFX, "GFX"}, {RSMI_GPU_BLOCK_MMHUB, "MMHUB"},
{RSMI_GPU_BLOCK_ATHUB, "ATHUB"}, {RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"},
{RSMI_GPU_BLOCK_HDP, "HDP"}, {RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"},
{RSMI_GPU_BLOCK_DF, "DF"}, {RSMI_GPU_BLOCK_SMN, "SMN"},
{RSMI_GPU_BLOCK_SEM, "SEM"}, {RSMI_GPU_BLOCK_MP0, "MP0"},
{RSMI_GPU_BLOCK_MP1, "MP1"}, {RSMI_GPU_BLOCK_FUSE, "FUSE"},
static const std::map<amdsmi_gpu_block_t, const char*> kBlockNameMap = {
{AMDSMI_GPU_BLOCK_UMC, "UMC"}, {AMDSMI_GPU_BLOCK_SDMA, "SDMA"},
{AMDSMI_GPU_BLOCK_GFX, "GFX"}, {AMDSMI_GPU_BLOCK_MMHUB, "MMHUB"},
{AMDSMI_GPU_BLOCK_ATHUB, "ATHUB"}, {AMDSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"},
{AMDSMI_GPU_BLOCK_HDP, "HDP"}, {AMDSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"},
{AMDSMI_GPU_BLOCK_DF, "DF"}, {AMDSMI_GPU_BLOCK_SMN, "SMN"},
{AMDSMI_GPU_BLOCK_SEM, "SEM"}, {AMDSMI_GPU_BLOCK_MP0, "MP0"},
{AMDSMI_GPU_BLOCK_MP1, "MP1"}, {AMDSMI_GPU_BLOCK_FUSE, "FUSE"},
};
static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated");
static_assert(AMDSMI_GPU_BLOCK_LAST == AMDSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated");
// Human-readable names for amdsmi_ras_err_state_t, indexed by enumerator.
// kErrStateNameMap looks entries up as kRasErrStateStrings[<enumerator>], and
// a static_assert requires exactly AMDSMI_RAS_ERR_STATE_LAST + 1 entries.
//
// Each entry must end with a comma: without one, adjacent string literals are
// silently concatenated (previously "Poison" "Off" became "PoisonOff"), which
// mislabels the state while keeping the element count unchanged.
// NOTE(review): entry order assumes the enumerators are consecutive from 0 in
// the order listed here — confirm against amd_smi/amdsmi.h.
static const char* kRasErrStateStrings[] = {
    "None",                     // AMDSMI_RAS_ERR_STATE_NONE
    "Disabled",                 // AMDSMI_RAS_ERR_STATE_DISABLED
    "Error Unknown",            // AMDSMI_RAS_ERR_STATE_PARITY
    "Single, Correctable",      // AMDSMI_RAS_ERR_STATE_SING_C
    "Multiple, Uncorrectable",  // AMDSMI_RAS_ERR_STATE_MULT_UC
    "Poison",                   // AMDSMI_RAS_ERR_STATE_POISON
    "On",                       // AMDSMI_RAS_ERR_STATE_ENABLED
};
static_assert(sizeof(kRasErrStateStrings) / sizeof(char*) == (RSMI_RAS_ERR_STATE_LAST + 1),
static_assert(sizeof(kRasErrStateStrings) / sizeof(char*) == (AMDSMI_RAS_ERR_STATE_LAST + 1),
"kErrStateNameMap needs to be updated");
static const std::map<rsmi_ras_err_state_t, const char*> kErrStateNameMap = {
{RSMI_RAS_ERR_STATE_NONE, kRasErrStateStrings[RSMI_RAS_ERR_STATE_NONE]},
{RSMI_RAS_ERR_STATE_DISABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_DISABLED]},
{RSMI_RAS_ERR_STATE_PARITY, kRasErrStateStrings[RSMI_RAS_ERR_STATE_PARITY]},
{RSMI_RAS_ERR_STATE_SING_C, kRasErrStateStrings[RSMI_RAS_ERR_STATE_SING_C]},
{RSMI_RAS_ERR_STATE_MULT_UC, kRasErrStateStrings[RSMI_RAS_ERR_STATE_MULT_UC]},
{RSMI_RAS_ERR_STATE_POISON, kRasErrStateStrings[RSMI_RAS_ERR_STATE_POISON]},
{RSMI_RAS_ERR_STATE_ENABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_ENABLED]},
static const std::map<amdsmi_ras_err_state_t, const char*> kErrStateNameMap = {
{AMDSMI_RAS_ERR_STATE_NONE, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_NONE]},
{AMDSMI_RAS_ERR_STATE_DISABLED, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_DISABLED]},
{AMDSMI_RAS_ERR_STATE_PARITY, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_PARITY]},
{AMDSMI_RAS_ERR_STATE_SING_C, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_SING_C]},
{AMDSMI_RAS_ERR_STATE_MULT_UC, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_MULT_UC]},
{AMDSMI_RAS_ERR_STATE_POISON, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_POISON]},
{AMDSMI_RAS_ERR_STATE_ENABLED, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_ENABLED]},
};
static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_ENABLED,
static_assert(AMDSMI_RAS_ERR_STATE_LAST == AMDSMI_RAS_ERR_STATE_ENABLED,
"kErrStateNameMap needs to be updated");
static const struct option long_options[] = {
@@ -207,25 +207,35 @@ uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list) {
return 1;
}
const char* GetBlockNameStr(rsmi_gpu_block_t id) { return kBlockNameMap.at(id); }
const char* GetErrStateNameStr(rsmi_ras_err_state_t st) { return kErrStateNameMap.at(st); }
const char* GetBlockNameStr(amdsmi_gpu_block_t id) { return kBlockNameMap.at(id); }
const char* GetErrStateNameStr(amdsmi_ras_err_state_t st) { return kErrStateNameMap.at(st); }
/*const char *GetGRPCChanStateStr(grpc_connectivity_state st) {
return kGRPCChanState.at(st);
}*/
const char* FreqEnumToStr(rsmi_clk_type rsmi_clk) {
static_assert(RSMI_CLK_TYPE_LAST == RSMI_CLK_TYPE_MEM, "FreqEnumToStr() needs to be updated");
const char* FreqEnumToStr(amdsmi_clk_type_t rsmi_clk) {
static_assert(CLK_TYPE__MAX == CLK_TYPE_DCLK1, "FreqEnumToStr() needs to be updated");
switch (rsmi_clk) {
case RSMI_CLK_TYPE_SYS:
case CLK_TYPE_SYS:
return "System clock";
case RSMI_CLK_TYPE_DF:
case CLK_TYPE_DF:
return "Data Fabric clock";
case RSMI_CLK_TYPE_DCEF:
case CLK_TYPE_DCEF:
return "Display Controller Engine clock";
case RSMI_CLK_TYPE_SOC:
case CLK_TYPE_SOC:
return "SOC clock";
case RSMI_CLK_TYPE_MEM:
case CLK_TYPE_MEM:
return "Memory clock";
case CLK_TYPE_PCIE:
return "PCIe clock";
case CLK_TYPE_VCLK0:
return "VCLK0 clock";
case CLK_TYPE_VCLK1:
return "VCLK1 clock";
case CLK_TYPE_DCLK0:
return "DCLK0 clock";
case CLK_TYPE_DCLK1:
return "DCLK1 clock";
default:
return "Invalid Clock ID";
}
@@ -27,7 +27,7 @@ THE SOFTWARE.
#include <string>
#include <vector>
#include "rocm_smi/rocm_smi.h"
#include "amd_smi/amdsmi.h"
struct RDCTstGlobals {
uint32_t verbosity;
@@ -45,10 +45,10 @@ struct RDCTstGlobals {
uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list);
void PrintTestHeader(uint32_t dv_ind);
const char* GetBlockNameStr(rsmi_gpu_block_t id);
const char* GetErrStateNameStr(rsmi_ras_err_state_t st);
const char* GetBlockNameStr(amdsmi_gpu_block_t id);
const char* GetErrStateNameStr(amdsmi_ras_err_state_t st);
// const char *GetGRPCChanStateStr(grpc_connectivity_state st);
const char* FreqEnumToStr(rsmi_clk_type rsmi_clk);
const char* FreqEnumToStr(amdsmi_clk_type_t rsmi_clk);
#if ENABLE_SMI
void DumpMonitorInfo(const TestBase* test);
@@ -47,30 +47,88 @@
#include <map>
#include "rocm_smi/rocm_smi.h"
#include "amd_smi/amdsmi.h"
static const std::map<rsmi_fw_block_t, const char*> kDevFWNameMap = {
{RSMI_FW_BLOCK_ASD, "asd"},
{RSMI_FW_BLOCK_CE, "ce"},
{RSMI_FW_BLOCK_DMCU, "dmcu"},
{RSMI_FW_BLOCK_MC, "mc"},
{RSMI_FW_BLOCK_ME, "me"},
{RSMI_FW_BLOCK_MEC, "mec"},
{RSMI_FW_BLOCK_MEC2, "mec2"},
{RSMI_FW_BLOCK_PFP, "pfp"},
{RSMI_FW_BLOCK_RLC, "rlc"},
{RSMI_FW_BLOCK_RLC_SRLC, "rlc_srlc"},
{RSMI_FW_BLOCK_RLC_SRLG, "rlc_srlg"},
{RSMI_FW_BLOCK_RLC_SRLS, "rlc_srls"},
{RSMI_FW_BLOCK_SDMA, "sdma"},
{RSMI_FW_BLOCK_SDMA2, "sdma2"},
{RSMI_FW_BLOCK_SMC, "smc"},
{RSMI_FW_BLOCK_SOS, "sos"},
{RSMI_FW_BLOCK_TA_RAS, "ta_ras"},
{RSMI_FW_BLOCK_TA_XGMI, "ta_xgmi"},
{RSMI_FW_BLOCK_UVD, "uvd"},
{RSMI_FW_BLOCK_VCE, "vce"},
{RSMI_FW_BLOCK_VCN, "vcn"},
static const std::map<amdsmi_fw_block_t, const char*> kDevFWNameMap = {
{FW_ID_SMU, "SMU"},
{FW_ID_FIRST, "FIRST"},
{FW_ID_CP_CE, "CP_CE"},
{FW_ID_CP_PFP, "CP_PFP"},
{FW_ID_CP_ME, "CP_ME"},
{FW_ID_CP_MEC_JT1, "CP_MEC_JT1"},
{FW_ID_CP_MEC_JT2, "CP_MEC_JT2"},
{FW_ID_CP_MEC1, "CP_MEC1"},
{FW_ID_CP_MEC2, "CP_MEC2"},
{FW_ID_RLC, "RLC"},
{FW_ID_SDMA0, "SDMA0"},
{FW_ID_SDMA1, "SDMA1"},
{FW_ID_SDMA2, "SDMA2"},
{FW_ID_SDMA3, "SDMA3"},
{FW_ID_SDMA4, "SDMA4"},
{FW_ID_SDMA5, "SDMA5"},
{FW_ID_SDMA6, "SDMA6"},
{FW_ID_SDMA7, "SDMA7"},
{FW_ID_VCN, "VCN"},
{FW_ID_UVD, "UVD"},
{FW_ID_VCE, "VCE"},
{FW_ID_ISP, "ISP"},
{FW_ID_DMCU_ERAM, "DMCU_ERAM"},
{FW_ID_DMCU_ISR, "DMCU_ISR"},
{FW_ID_RLC_RESTORE_LIST_GPM_MEM, "RLC_RESTORE_LIST_GPM_MEM"},
{FW_ID_RLC_RESTORE_LIST_SRM_MEM, "RLC_RESTORE_LIST_SRM_MEM"},
{FW_ID_RLC_RESTORE_LIST_CNTL, "RLC_RESTORE_LIST_CNTL"},
{FW_ID_RLC_V, "RLC_V"},
{FW_ID_MMSCH, "MMSCH"},
{FW_ID_PSP_SYSDRV, "PSP_SYSDRV"},
{FW_ID_PSP_SOSDRV, "PSP_SOSDRV"},
{FW_ID_PSP_TOC, "PSP_TOC"},
{FW_ID_PSP_KEYDB, "PSP_KEYDB"},
{FW_ID_DFC, "DFC"},
{FW_ID_PSP_SPL, "PSP_SPL"},
{FW_ID_DRV_CAP, "DRV_CAP"},
{FW_ID_MC, "MC"},
{FW_ID_PSP_BL, "PSP_BL"},
{FW_ID_CP_PM4, "CP_PM4"},
{FW_ID_RLC_P, "RLC_P"},
{FW_ID_SEC_POLICY_STAGE2, "SEC_POLICY_STAGE2"},
{FW_ID_REG_ACCESS_WHITELIST, "REG_ACCESS_WHITELIST"},
{FW_ID_IMU_DRAM, "IMU_DRAM"},
{FW_ID_IMU_IRAM, "IMU_IRAM"},
{FW_ID_SDMA_TH0, "SDMA_TH0"},
{FW_ID_SDMA_TH1, "SDMA_TH1"},
{FW_ID_CP_MES, "CP_MES"},
{FW_ID_MES_KIQ, "MES_KIQ"},
{FW_ID_MES_STACK, "MES_STACK"},
{FW_ID_MES_THREAD1, "MES_THREAD1"},
{FW_ID_MES_THREAD1_STACK, "MES_THREAD1_STACK"},
{FW_ID_RLX6, "RLX6"},
{FW_ID_RLX6_DRAM_BOOT, "RLX6_DRAM_BOOT"},
{FW_ID_RS64_ME, "RS64_ME"},
{FW_ID_RS64_ME_P0_DATA, "RS64_ME_P0_DATA"},
{FW_ID_RS64_ME_P1_DATA, "RS64_ME_P1_DATA"},
{FW_ID_RS64_PFP, "RS64_PFP"},
{FW_ID_RS64_PFP_P0_DATA, "RS64_PFP_P0_DATA"},
{FW_ID_RS64_PFP_P1_DATA, "RS64_PFP_P1_DATA"},
{FW_ID_RS64_MEC, "RS64_MEC"},
{FW_ID_RS64_MEC_P0_DATA, "RS64_MEC_P0_DATA"},
{FW_ID_RS64_MEC_P1_DATA, "RS64_MEC_P1_DATA"},
{FW_ID_RS64_MEC_P2_DATA, "RS64_MEC_P2_DATA"},
{FW_ID_RS64_MEC_P3_DATA, "RS64_MEC_P3_DATA"},
{FW_ID_PPTABLE, "PPTABLE"},
{FW_ID_PSP_SOC, "PSP_SOC"},
{FW_ID_PSP_DBG, "PSP_DBG"},
{FW_ID_PSP_INTF, "PSP_INTF"},
{FW_ID_RLX6_CORE1, "RLX6_CORE1"},
{FW_ID_RLX6_DRAM_BOOT_CORE1, "RLX6_DRAM_BOOT_CORE1"},
{FW_ID_RLCV_LX7, "RLCV_LX7"},
{FW_ID_RLC_SAVE_RESTORE_LIST, "RLC_SAVE_RESTORE_LIST"},
{FW_ID_ASD, "ASD"},
{FW_ID_TA_RAS, "TA_RAS"},
{FW_ID_TA_XGMI, "TA_XGMI"},
{FW_ID_RLC_SRLG, "RLC_SRLG"},
{FW_ID_RLC_SRLS, "RLC_SRLS"},
{FW_ID_PM, "PM"},
{FW_ID_DMCU, "DMCU"},
};
const char* NameFromFWEnum(rsmi_fw_block_t blk) { return kDevFWNameMap.at(blk); }
const char* NameFromFWEnum(amdsmi_fw_block_t blk) { return kDevFWNameMap.at(blk); }
@@ -46,8 +46,8 @@
#ifndef TESTS_RDC_TESTS_TEST_UTILS_H_
#define TESTS_RDC_TESTS_TEST_UTILS_H_
#include "rocm_smi/rocm_smi.h"
#include "amd_smi/amdsmi.h"
const char* NameFromFWEnum(rsmi_fw_block_t blk);
const char* NameFromFWEnum(amdsmi_fw_block_t blk);
#endif // TESTS_RDC_TESTS_TEST_UTILS_H_