diff --git a/projects/rdc/CMakeLists.txt b/projects/rdc/CMakeLists.txt index e018edb264..1df192ed69 100755 --- a/projects/rdc/CMakeLists.txt +++ b/projects/rdc/CMakeLists.txt @@ -151,16 +151,16 @@ if(NOT EXISTS "${CMAKE_SOURCE_DIR}/raslib/.git" AND BUILD_RASLIB) If you do not want to build raslib, use cmake -DBUILD_RASLIB=off") endif() -find_package(RSMI - NAMES rocm_smi +find_package(SMI + NAMES amd_smi HINTS ${ROCM_DIR}/lib/cmake CONFIGURE REQUIRED) -set(RSMI_INC_DIR "${ROCM_SMI_INCLUDE_DIR}" CACHE INTERNAL "ROCm SMI include directory.") -set(RSMI_LIB_DIR "${ROCM_SMI_LIB_DIR}" CACHE INTERNAL "ROCm SMI library directory.") +set(SMI_INC_DIR "${AMD_SMI_INCLUDE_DIR}" CACHE INTERNAL "AMD SMI include directory.") +set(SMI_LIB_DIR "${AMD_SMI_LIB_DIR}" CACHE INTERNAL "AMD SMI library directory.") -if(NOT EXISTS "${RSMI_INC_DIR}" OR NOT EXISTS "${RSMI_LIB_DIR}") - message(FATAL_ERROR "rocm_smi not found in ${RSMI_INC_DIR}. Please - make sure rocm_smi is installed and present in ${RSMI_INC_DIR}.") +if(NOT EXISTS "${SMI_INC_DIR}" OR NOT EXISTS "${SMI_LIB_DIR}") + message(FATAL_ERROR "amd_smi not found (include dir: ${SMI_INC_DIR}, library dir: ${SMI_LIB_DIR}). 
Please + make sure amd_smi is installed and that both directories exist.") endif() if(BUILD_RASLIB AND NOT DEFINED HSA_DIR) @@ -301,7 +301,6 @@ if(BUILD_STANDALONE) unset(OLD_CMAKE_INSTALL_MESSAGE) add_subdirectory("server") - add_subdirectory("client") add_subdirectory("rdci") if(BUILD_TESTS) @@ -458,7 +457,7 @@ set(CPACK_DEBIAN_RUNTIME_PACKAGE_CONTROL_EXTRA option(ROCM_DEP_ROCMCORE "Add debian dependency on rocm-core" OFF) mark_as_advanced(ROCM_DEP_ROCMCORE) -set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-smi-lib, libc6") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "amd-smi-lib, libc6") if(ROCM_DEP_ROCMCORE) string(APPEND CPACK_DEBIAN_PACKAGE_DEPENDS ", rocm-core") endif() @@ -485,7 +484,7 @@ endif() set(CPACK_RPM_PACKAGE_AUTOREQ 0) set(CPACK_RPM_PACKAGE_AUTOPROV 0) -set(CPACK_RPM_PACKAGE_REQUIRES "rocm-smi-lib") +set(CPACK_RPM_PACKAGE_REQUIRES "amd-smi-lib") # rdc-tests need rdc set(CPACK_RPM_TESTS_PACKAGE_REQUIRES "${CPACK_PACKAGE_NAME}") list(APPEND CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/lib" diff --git a/projects/rdc/README.md b/projects/rdc/README.md index 105e108e63..1ae13c39a9 100644 --- a/projects/rdc/README.md +++ b/projects/rdc/README.md @@ -30,7 +30,7 @@ RDC can run on AMD ROCm supported platforms, please refer to the [List of Suppor * It is recommended to install the complete AMD ROCm platform. For installation instruction see https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html * At the minimum, these two components are required - (i) AMD ROCm SMI Library (https://github.com/ROCm/rocm_smi_lib) + (i) AMDSMI Library (https://github.com/ROCm/amdsmi) (ii) AMD ROCk Kernel driver (https://github.com/ROCm/ROCK-Kernel-Driver) ## Building gRPC and protoc diff --git a/projects/rdc/client/CMakeLists.txt b/projects/rdc/client/CMakeLists.txt deleted file mode 100755 index fa0f3b1deb..0000000000 --- a/projects/rdc/client/CMakeLists.txt +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2019 - present Advanced Micro Devices, Inc. 
All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
- -message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") -message(" Cmake Client Lib ") -message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") - -## Compiler flags -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -m64") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2") -# Use this instead of above for 32 bit -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") - -if("${CMAKE_BUILD_TYPE}" STREQUAL Release) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") -endif() - - -# Required Defines first: - -message("") -message("Build Configuration:") -message("-------------BuildType: " ${CMAKE_BUILD_TYPE}) -message("--------------Compiler: " ${CMAKE_CXX_COMPILER}) -message("---------------Version: " ${CMAKE_CXX_COMPILER_VERSION}) -message("----------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) -message("----------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) -message("----------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) -message("----------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -message("----------RSMI Lib Dir: " ${RSMI_LIB_DIR}) -message("----------RSMI Inc Dir: " ${RSMI_INC_DIR}) -message("---------GRPC Root Dir: " ${GRPC_ROOT}) -message("") - -## Include common cmake modules -include(utils) - -set(CLIENT_LIB "rdc_client_smi") -set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src") -set(INC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include/rdc") - -################# Determine the library version ######################### -## Setup the SO version based on git tags. 
-set(SO_VERSION_GIT_TAG_PREFIX "rdc_so_ver") - -# provide git to utilities -find_program(GIT NAMES git) - -# Debian package specific variables -# Set a default value for the package version -get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) - -# VERSION_* variables should be set by get_version_from_tag -set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") -message("SOVERSION: ${SO_VERSION_STRING}") - -set(CPACK_PACKAGE_FILE_NAME "${RDC_PACKAGE}-${VERSION_STRING}") - -# TODO delete these if not used -file(GLOB PROTOBUF_GENERATED_INCLUDES "${PROTOB_OUT_DIR}/*.h") -file(GLOB PROTOBUF_GENERATED_SRCS "${PROTOB_OUT_DIR}/*.cc") - -set(CLIENT_LIB_SRC_LIST "${SRC_DIR}/rdc_client.cc" - "${SRC_DIR}/rdc_client_main.cc" - "${SRC_DIR}/rdc_client_utils.cc" - "${PROTOBUF_GENERATED_SRCS}" - "${COMMON_DIR}/rdc_utils.cc") -message("CLIENT_LIB_SRC_LIST=${CLIENT_LIB_SRC_LIST}") - -set(CLIENT_LIB_INC_LIST "${INC_DIR}/rdc_client.h" - "${INC_DIR}/rdc_exception.h" - "${INC_DIR}/rdc_client_main.h" - "${COMMON_DIR}/rdc_utils.h") - -add_library(${CLIENT_LIB} SHARED ${CLIENT_LIB_SRC_LIST} ${CLIENT_LIB_INC_LIST}) -target_link_libraries(${CLIENT_LIB} pthread rt gRPC::grpc++ dl) -target_include_directories(${CLIENT_LIB} PRIVATE - "${PROJECT_SOURCE_DIR}" - "${PROJECT_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/include" - "${PROTOB_OUT_DIR}" - "${RSMI_INC_DIR}") -# TODO: set the properties for the library once we have one -## Set the VERSION and SOVERSION values -set_property(TARGET ${CLIENT_LIB} PROPERTY - SOVERSION "${VERSION_MAJOR}") -set_property(TARGET ${CLIENT_LIB} PROPERTY - VERSION "${SO_VERSION_STRING}") - -## If the library is a release, strip the target library -if("${CMAKE_BUILD_TYPE}" STREQUAL Release) - add_custom_command( - TARGET ${CLIENT_LIB} - POST_BUILD COMMAND ${CMAKE_STRIP} lib${CLIENT_LIB}.so) -endif() - -## Add the install directives for the runtime library. 
-install(TARGETS ${CLIENT_LIB} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${RDC} - COMPONENT ${CLIENT_COMPONENT}) -install(DIRECTORY ${PROJECT_SOURCE_DIR}/authentication - DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${RDC} - COMPONENT ${CLIENT_COMPONENT}) - -message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") -message(" Finished Cmake Client Lib ") -message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") diff --git a/projects/rdc/client/include/rdc/rdc_client.h b/projects/rdc/client/include/rdc/rdc_client.h deleted file mode 100644 index 8813f2c43f..0000000000 --- a/projects/rdc/client/include/rdc/rdc_client.h +++ /dev/null @@ -1,382 +0,0 @@ - -/* -Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_H_ -#define CLIENT_INCLUDE_RDC_RDC_CLIENT_H_ - -#include - -#include -#include - -#include "rocm_smi/rocm_smi.h" - -/** - * @brief Error codes retured by rdc functions - */ -typedef enum { - RDC_STATUS_SUCCESS = 0x0, //!< Operation was successful - RDC_RSMI_STATUS_INVALID_ARGS, //!< Passed in arguments are not valid - RDC_RSMI_STATUS_NOT_SUPPORTED, //!< The requested information or - //!< action is not available for the - //!< given input, on the given system - RDC_RSMI_STATUS_FILE_ERROR, //!< Problem accessing a file. This - //!< may because the operation is not - //!< supported by the Linux kernel - //!< version running on the executing - //!< machine - RDC_RSMI_STATUS_PERMISSION, //!< Permission denied/EACCESS file - //!< error. Many functions require - //!< root access to run. - RDC_RSMI_STATUS_OUT_OF_RESOURCES, //!< Unable to acquire memory or other - //!< resource - RDC_RSMI_STATUS_INTERNAL_EXCEPTION, //!< An internal exception was caught - RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS, //!< The provided input is out of - //!< allowable or safe range - RDC_RSMI_STATUS_INIT_ERROR, //!< An error occurred when creating - //!< a communications channel - RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED, //!< The requested function has not - //!< yet been implemented in the - //!< current system for the current - //!< devices - RDC_RSMI_STATUS_NOT_FOUND, //!< An item was searched for but not - //!< found - RDC_RSMI_STATUS_INSUFFICIENT_SIZE, //!< Not enough resources were - //!< available for the operation - RDC_RSMI_STATUS_INTERRUPT, //!< An interrupt occurred during - //!< execution of function - RDC_RSMI_STATUS_UNEXPECTED_SIZE, //!< An unexpected amount of data - //!< was read - RDC_RSMI_STATUS_NO_DATA, //!< No data was found for a given - //!< input - RDC_RSMI_STATUS_UNKNOWN_ERROR, //!< An unknown error occurred - RDC_STATUS_GRPC_ERR_FIRST = 1000, - - /// Not an error; returned on success. 
- RDC_STATUS_GRPC_OK = RDC_STATUS_GRPC_ERR_FIRST, - - /// The operation was cancelled (typically by the caller). - RDC_STATUS_GRPC_CANCELLED, - - /// Unknown error. An example of where this error may be returned is if a - /// Status value received from another address space belongs to an error-space - /// that is not known in this address space. Also errors raised by APIs that - /// do not return enough error information may be converted to this error. - RDC_STATUS_GRPC_UNKNOWN, - - /// Client specified an invalid argument. Note that this differs from - /// FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are - /// problematic regardless of the state of the system (e.g., a malformed file - /// name). - RDC_STATUS_GRPC_INVALID_ARG, - - /// Deadline expired before operation could complete. For operations that - /// change the state of the system, this error may be returned even if the - /// operation has completed successfully. For example, a successful response - /// from a server could have been delayed long enough for the deadline to - /// expire. - RDC_STATUS_GRPC_DEADLINE_EXCEEDED, - - /// Some requested entity (e.g., file or directory) was not found. - RDC_STATUS_GRPC_NOT_FOUND, - - /// Some entity that we attempted to create (e.g., file or directory) already - /// exists. - RDC_STATUS_GRPC_ALREADY_EXISTS, - - /// The caller does not have permission to execute the specified operation. - /// PERMISSION_DENIED must not be used for rejections caused by exhausting - /// some resource (use RESOURCE_EXHAUSTED instead for those errors). - /// PERMISSION_DENIED must not be used if the caller can not be identified - /// (use UNAUTHENTICATED instead for those errors). - RDC_STATUS_GRPC_PERM_DENIED, - - /// The request does not have valid authentication credentials for the - /// operation. - RDC_STATUS_GRPC_UNAUTHENTICATED, - - /// Some resource has been exhausted, perhaps a per-user quota, or perhaps the - /// entire file system is out of space. 
- RDC_STATUS_GRPC_RESOURCE_EXHAUSTED, - - /// Operation was rejected because the system is not in a state required for - /// the operation's execution. For example, directory to be deleted may be - /// non-empty, an rmdir operation is applied to a non-directory, etc. - /// - /// A litmus test that may help a service implementor in deciding - /// between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE: - /// (a) Use UNAVAILABLE if the client can retry just the failing call. - /// (b) Use ABORTED if the client should retry at a higher-level - /// (e.g., restarting a read-modify-write sequence). - /// (c) Use FAILED_PRECONDITION if the client should not retry until - /// the system state has been explicitly fixed. E.g., if an "rmdir" - /// fails because the directory is non-empty, FAILED_PRECONDITION - /// should be returned since the client should not retry unless - /// they have first fixed up the directory by deleting files from it. - /// (d) Use FAILED_PRECONDITION if the client performs conditional - /// REST Get/Update/Delete on a resource and the resource on the - /// server does not match the condition. E.g., conflicting - /// read-modify-write on the same resource. - RDC_STATUS_GRPC_FAILED_PRECOND, - - /// The operation was aborted, typically due to a concurrency issue like - /// sequencer check failures, transaction aborts, etc. - /// - /// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED, - /// and UNAVAILABLE. - RDC_STATUS_GRPC_ABORTED, - - /// Operation was attempted past the valid range. E.g., seeking or reading - /// past end of file. - /// - /// Unlike INVALID_ARGUMENT, this error indicates a problem that may be fixed - /// if the system state changes. For example, a 32-bit file system will - /// generate INVALID_ARGUMENT if asked to read at an offset that is not in the - /// range [0,2^32-1], but it will generate OUT_OF_RANGE if asked to read from - /// an offset past the current file size. 
- /// - /// There is a fair bit of overlap between FAILED_PRECONDITION and - /// OUT_OF_RANGE. We recommend using OUT_OF_RANGE (the more specific error) - /// when it applies so that callers who are iterating through a space can - /// easily look for an OUT_OF_RANGE error to detect when they are done. - RDC_STATUS_GRPC_OUT_OF_RANGE, - - /// Operation is not implemented or not supported/enabled in this service. - RDC_STATUS_GRPC_UNIMPLEMENTED, - - /// Internal errors. Means some invariants expected by underlying System has - /// been broken. If you see one of these errors, Something is very broken. - RDC_STATUS_GRPC_INTERNAL, - - /// The service is currently unavailable. This is a most likely a transient - /// condition and may be corrected by retrying with a backoff. - /// - /// \warning Although data MIGHT not have been transmitted when this - /// status occurs, there is NOT A GUARANTEE that the server has not seen - /// anything. So in general it is unsafe to retry on this status code - /// if the call is non-idempotent. - /// - /// See litmus test above for deciding between FAILED_PRECONDITION, ABORTED, - /// and UNAVAILABLE. - RDC_STATUS_GRPC_UNAVAILABLE, - - /// Unrecoverable data loss or corruption. - RDC_STATUS_GRPC_DATA_LOSS, - - RDC_STATUS_CLIENT_ERR_FIRST = 2000, - - /// SSL authentication error occurred. - RDC_STATUS_CLIENT_ERR_SSL = RDC_STATUS_CLIENT_ERR_FIRST, - - RDC_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred -} rdc_status_t; - -/** - * @brief Handle to RDC server channel - */ -typedef uintptr_t rdc_channel_t; - -#define RDC_DEFAULT_SERVER_PORT 50051 -#define RDC_DEFAULT_SERVER_IP "localhost" - -/*****************************************************************************/ -/** @defgroup RDCAdmin RDC Administration Functions - * These administrative functions are used to monitor and control, for - * example RDC connectivity. 
- * @{ - */ - -/** - * @brief Check the connection status of a channel - * - * @details Given an ::rdc_channel_t @p channel and a boolean @p - * try_to_connect, this function will return the grpc_connectivity_state for - * that channel - * - * @p channel[in] The channel for which the status will be given - * - * @param[in] try_to_connect If the channel is currently IDLE, if the argument - * is true, transition to CONNECTING. - * - * @param[inout] state A pointer to caller provided memory to which an - * the grpc_connectivity_state will be written. grpc_connectivity_state has - * the following possible values: - * GRPC_CHANNEL_IDLE channel is idle - * GRPC_CHANNEL_CONNECTING channel is connecting - * GRPC_CHANNEL_READY channel is ready for work - * GRPC_CHANNEL_TRANSIENT_FAILURE channel has seen a failure but expects to - * recover - * GRPC_CHANNEL_SHUTDOWN channel has seen a failure that it cannot - * recover from - * - * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. - * - */ -rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, - grpc_connectivity_state* state); - -/** - * @brief Verify a channel's connection to the server - * - * @details Given an ::rdc_channel_t @p channel, this function will send a - * random number to the server associated with @p channel. The server will send - * the number back. Upon receiving the returned message from the server, the - * number sent to the server is compared to the number received from the - * server. If the 2 numbers are the same, the connection is verified. - * Otherwise, an appropriate error code is returned. - * - * @p channel[in] The channel for which the connection will be verified - * - * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. 
- * - */ -rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel); - -/** @} */ // end of RDCAdmin - -/*****************************************************************************/ -/** @defgroup InitShutAdmin Initialization and Shutdown - * These functions are used for initialization of RDC and clean up when - * done. - * @{ - */ - -/** - * @brief Create a communications channel to an RDC server - * - * @details Given a pointer to an ::rdc_channel_t @p channel, a string - * containing the ip address of the server @p ip, a string containing - * the port number on which the server is listening @p port and a bool - * indicating whether the channel should use a secure link @p secure, - * this function will attempt to create a new channel and write its - * location to address pointed to by @p channel. - * - * @p channel[inout] A pointer to caller provided memory to which an - * ::rdc_channel_t will be written - * - * @param[in] ip A pointer to a string containing the address of the server. - * If nullptr is passed for this parameter, RDC_DEFAULT_SERVER_IP will be used. - * - * @param[in] port A pointer to string containing the port on which the - * RDC server is listening. If nullptr is passed for this parameter, - * RDC_DEFAULT_SERVER_PORT will be used. - * - * @param[in] secure A bool indicating whether SSL should be used for - * communications (not currently supported) - * - * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. - * - */ -rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port, - bool secure); - -/** - * @brief Destroy a communications channel to an RDC server - * - * @details Given an ::rdc_channel_t @p channel, this function will free any - * resources used by @p channel - * - * @p channel[inout] An ::rdc_channel_t will be freed - * - * @retval ::RDC_STATUS_SUCCESS is returned upon successful call. 
- * - */ -rdc_status_t rdc_channel_destroy(rdc_channel_t channel); - -/** @} */ // end of InitShutAdmin - -/*****************************************************************************/ -/** @defgroup RSMIAccess Remote ROCm SMI Calls - * These functions calls make ROCm SMI function calls on the remote server. - * Please refer to the - * [ROCm SMI documentation] - * (https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/docs) for - * information about the calls. Here, we will document any additional aspects - * of the calls introduced by RDC that are not covered in the ROCm SMI - * documentation. - * - * All of the functions in this section attempt to make an RSMI call on the - * server machine, given an ::rdc_channel_t associated with the server, and - * all the arguments that are required to make the RSMI call. - * @{ - */ - -/** - * @brief Remote call to rsmi_num_monitor_devices() - * - */ -rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu); - -/** @} */ // end of RSMIAccess - -/** @defgroup PhysQuer Physical State Queries - * These functions provide information about the physical characteristics of - * the device. 
- * @{ - */ -/** - * @brief Remote call to rsmi_dev_temp_metric_get() - * - */ -rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type, - rsmi_temperature_metric_t metric, int64_t* temperature); - -/** - * @brief Remote call to rsmi_dev_fan_rpms_get() - * - */ -rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, - int64_t* rpms); - -/** - * @brief Remote call to rsmi_dev_fan_speed_get() - * - */ -rdc_status_t rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, - int64_t* speed); - -/** - * @brief Remote call to rsmi_dev_fan_speed_max_get() - * - */ -rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, - uint64_t* max_speed); -/** @} */ // end of PhysQuer - -/** - * @brief Get a description of a provided RDC error status - * - * @details Set the provided pointer to a const char *, @p status_string, to - * a string containing a description of the provided error code @p status. - * - * @param[in] status The error status for which a description is desired - * - * @param[inout] status_string A pointer to a const char * which will be made - * to point to a description of the provided error code - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call - * - */ -rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string); - -#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_H_ diff --git a/projects/rdc/client/include/rdc/rdc_client_main.h b/projects/rdc/client/include/rdc/rdc_client_main.h deleted file mode 100644 index 9e8a9a841a..0000000000 --- a/projects/rdc/client/include/rdc/rdc_client_main.h +++ /dev/null @@ -1,69 +0,0 @@ - -/* -Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_ -#define CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_ - -#include - -#include -#include - -#include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_client.h" - -namespace amd { -namespace rdc { - -class RDCChannel { - public: - explicit RDCChannel(std::string server_ip, std::string server_port, bool secure_channel); - ~RDCChannel(); - - rdc_status_t Initialize(void); - - // Getters and Setters - - // Don't have setter for server ip and ports; we don't want to change those - // after construction - std::string server_ip(void) const { return server_ip_; } - std::string server_port(void) const { return server_port_; } - bool secure_channel(void) const { return secure_channel_; } - std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub(void) const { return rsmi_stub_; } - std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub(void) const { return rdc_admin_stub_; } - std::shared_ptr const channel(void) { return channel_; } - - private: - std::string server_ip_; - std::string server_port_; - bool secure_channel_; - std::shared_ptr<::rdc::Rsmi::Stub> rsmi_stub_; - std::shared_ptr<::rdc::RdcAdmin::Stub> rdc_admin_stub_; - std::shared_ptr channel_; - std::shared_ptr channel_creds_; -}; - -} // namespace rdc -} // namespace amd - -#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_MAIN_H_ diff --git a/projects/rdc/client/include/rdc/rdc_client_utils.h b/projects/rdc/client/include/rdc/rdc_client_utils.h deleted file mode 100644 index 641a652caf..0000000000 --- a/projects/rdc/client/include/rdc/rdc_client_utils.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -#ifndef CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_ -#define CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_ - -#include "rdc/rdc_client.h" -namespace amd { -namespace rdc { - -rdc_status_t GrpcErrorToRdcError(::grpc::StatusCode grpc_err); - -} // namespace rdc -} // namespace amd - -#endif // CLIENT_INCLUDE_RDC_RDC_CLIENT_UTILS_H_ diff --git a/projects/rdc/client/include/rdc/rdc_exception.h b/projects/rdc/client/include/rdc/rdc_exception.h deleted file mode 100644 index 8d2b990332..0000000000 --- a/projects/rdc/client/include/rdc/rdc_exception.h +++ /dev/null @@ -1,50 +0,0 @@ -/* -Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#ifndef CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_ -#define CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_ - -#include -#include - -#include "rdc/rdc_client.h" - -namespace amd { -namespace rdc { - -/// @brief Exception type which carries an error code to return to the user. 
-class rdc_exception : public std::exception { - public: - rdc_exception(rdc_status_t error, const std::string description) - : err_(error), desc_(description) {} - rdc_status_t error_code() const noexcept { return err_; } - const char* what() const noexcept override { return desc_.c_str(); } - - private: - rdc_status_t err_; - std::string desc_; -}; - -} // namespace rdc -} // namespace amd - -#endif // CLIENT_INCLUDE_RDC_RDC_EXCEPTION_H_ diff --git a/projects/rdc/client/src/rdc_client.cc b/projects/rdc/client/src/rdc_client.cc deleted file mode 100644 index 57ee7dead1..0000000000 --- a/projects/rdc/client/src/rdc_client.cc +++ /dev/null @@ -1,547 +0,0 @@ -/* -Copyright (c) 2019 - Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "rdc/rdc_client.h" - -#include -#include -#include - -#include - -#include "common/rdc_utils.h" -#include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_client_main.h" -#include "rdc/rdc_client_utils.h" -#include "rdc/rdc_exception.h" -#include "rocm_smi/rocm_smi.h" - -#define CHK_PTR_ARG(PTR) \ - if ((PTR) == nullptr) { \ - return RDC_RSMI_STATUS_INVALID_ARGS; \ - } - -#define UINTPTR_TO_RDC_CHAN(UPTR) \ - amd::rdc::RDCChannel* ch = reinterpret_cast(UPTR); \ - if (ch == nullptr) { \ - return RDC_STATUS_GRPC_INVALID_ARG; \ - } - -static rdc_status_t handleException() { - try { - throw; - } catch (const std::bad_alloc& e) { - debug_print("RDC exception: BadAlloc\n"); - return RDC_RSMI_STATUS_OUT_OF_RESOURCES; - } catch (const amd::rdc::rdc_exception& e) { - debug_print("Exception caught: %s.\n", e.what()); - return e.error_code(); - return RDC_RSMI_STATUS_INTERNAL_EXCEPTION; - } catch (const std::exception& e) { - debug_print("Unhandled exception: %s\n", e.what()); - assert(false && "Unhandled exception."); - return RDC_RSMI_STATUS_INTERNAL_EXCEPTION; - } catch (const std::nested_exception& e) { - debug_print("Callback threw, forwarding.\n"); - e.rethrow_nested(); - return RDC_RSMI_STATUS_INTERNAL_EXCEPTION; - } catch (...) { - assert(false && "Unhandled exception."); - abort(); - return RDC_RSMI_STATUS_INTERNAL_EXCEPTION; - } -} - -#define TRY try { -#define CATCH \ - } \ - catch (...) 
{ \ - return handleException(); \ - } - -rdc_status_t rdc_channel_create(rdc_channel_t* channel, const char* ip, const char* port, - bool secure) { - TRY std::string server_str; - std::string port_str; - - if (channel == nullptr) { - return RDC_STATUS_GRPC_INVALID_ARG; - } - if (ip != nullptr) { - server_str = ip; - } else { - server_str = RDC_DEFAULT_SERVER_IP; - } - - if (port != nullptr) { - port_str = port; - } else { - port_str = std::to_string(RDC_DEFAULT_SERVER_PORT); - } - - amd::rdc::RDCChannel* ch = new amd::rdc::RDCChannel(server_str, port_str, secure); - - if (ch == nullptr) { - return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED; - } - - rdc_status_t ret = ch->Initialize(); - - if (ret != 0) { - delete ch; - return ret; - } - - *channel = reinterpret_cast(ch); - - return RDC_STATUS_SUCCESS; - - CATCH -} -rdc_status_t rdc_channel_state_get(rdc_channel_t channel, bool try_to_connect, - grpc_connectivity_state* state) { - TRY CHK_PTR_ARG(state) UINTPTR_TO_RDC_CHAN(channel) - - * state = ch->channel()->GetState(try_to_connect); - return RDC_STATUS_SUCCESS; - - CATCH -} - -rdc_status_t rdc_channel_connection_verify(rdc_channel_t channel) { - TRY UINTPTR_TO_RDC_CHAN(channel) - - ::rdc::VerifyConnectionResponse resp; - ::rdc::VerifyConnectionRequest req; - ::grpc::ClientContext context; - unsigned int seed = time(NULL); - - req.set_magic_num(static_cast(rand_r(&seed))); - ::grpc::Status status = ch->rdc_admin_stub()->VerifyConnection(&context, req, &resp); - - if (!status.ok()) { - return amd::rdc::GrpcErrorToRdcError(status.error_code()); - } - - if (resp.echo_magic_num() != req.magic_num()) { - return RDC_STATUS_GRPC_DATA_LOSS; - } - - return RDC_STATUS_SUCCESS; - - CATCH -} - -rdc_status_t rdc_channel_destroy(rdc_channel_t channel) { - TRY UINTPTR_TO_RDC_CHAN(channel) - - delete ch; - - return RDC_STATUS_SUCCESS; - - CATCH -} - -rdc_status_t rdc_num_gpus_get(rdc_channel_t channel, uint64_t* num_gpu) { - TRY CHK_PTR_ARG(num_gpu) UINTPTR_TO_RDC_CHAN(channel) - - 
::rdc::GetNumDevicesResponse resp; - ::rdc::GetNumDevicesRequest empty; - ::grpc::ClientContext context; - ::grpc::Status status = ch->rsmi_stub()->GetNumDevices(&context, empty, &resp); - - if (!status.ok()) { - return amd::rdc::GrpcErrorToRdcError(status.error_code()); - } - - *num_gpu = resp.val(); - return static_cast(resp.ret_val()); - - CATCH -} - -// rsmi and rdc currently happen to have a 1-to-1 mapping, but -// have this function in case that changes -static ::rdc::GetTemperatureRequest_TemperatureMetric rsmi_temp2rdc_temp( - rsmi_temperature_metric_t rsmi_temp) { - return static_cast<::rdc::GetTemperatureRequest_TemperatureMetric>(rsmi_temp); -} - -rdc_status_t rdc_dev_temp_metric_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_type, - rsmi_temperature_metric_t metric, int64_t* temperature) { - TRY CHK_PTR_ARG(temperature) UINTPTR_TO_RDC_CHAN(channel) - - ::rdc::GetTemperatureResponse resp; - ::rdc::GetTemperatureRequest in_args; - ::grpc::ClientContext context; - - in_args.set_metric(rsmi_temp2rdc_temp(metric)); - in_args.set_dv_ind(dv_ind); - in_args.set_sensor_type(sensor_type); - - ::grpc::Status status = ch->rsmi_stub()->GetTemperature(&context, in_args, &resp); - - if (!status.ok()) { - return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); - } - - *temperature = resp.temperature(); - - return static_cast(resp.ret_val()); - CATCH -} - -rdc_status_t rdc_dev_fan_rpms_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, - int64_t* rpms) { - TRY CHK_PTR_ARG(rpms) UINTPTR_TO_RDC_CHAN(channel) - - ::rdc::GetFanRpmsResponse resp; - ::rdc::GetFanRpmsRequest in_args; - ::grpc::ClientContext context; - - in_args.set_dv_ind(dv_ind); - in_args.set_sensor_ind(sensor_ind); - - ::grpc::Status status = ch->rsmi_stub()->GetFanRpms(&context, in_args, &resp); - - if (!status.ok()) { - return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); - } - - *rpms = resp.rpms(); - - return static_cast(resp.ret_val()); - CATCH -} - -rdc_status_t 
rdc_dev_fan_speed_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, - int64_t* speed) { - TRY CHK_PTR_ARG(speed) UINTPTR_TO_RDC_CHAN(channel) - - ::rdc::GetFanSpeedResponse resp; - ::rdc::GetFanSpeedRequest in_args; - ::grpc::ClientContext context; - - in_args.set_dv_ind(dv_ind); - in_args.set_sensor_ind(sensor_ind); - - ::grpc::Status status = ch->rsmi_stub()->GetFanSpeed(&context, in_args, &resp); - - if (!status.ok()) { - return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); - } - - *speed = resp.speed(); - - return static_cast(resp.ret_val()); - CATCH -} - -rdc_status_t rdc_dev_fan_speed_max_get(rdc_channel_t channel, uint32_t dv_ind, uint32_t sensor_ind, - uint64_t* max_speed) { - TRY CHK_PTR_ARG(max_speed) UINTPTR_TO_RDC_CHAN(channel) - - ::rdc::GetFanSpeedMaxResponse resp; - ::rdc::GetFanSpeedMaxRequest in_args; - ::grpc::ClientContext context; - - in_args.set_dv_ind(dv_ind); - in_args.set_sensor_ind(sensor_ind); - - ::grpc::Status status = ch->rsmi_stub()->GetFanSpeedMax(&context, in_args, &resp); - - if (!status.ok()) { - return ::amd::rdc::GrpcErrorToRdcError(status.error_code()); - } - - *max_speed = resp.max_speed(); - - return static_cast(resp.ret_val()); - CATCH -} - -rdc_status_t rdc_status_string(rdc_status_t status, const char** status_string) { - TRY if (status_string == nullptr) { return RDC_RSMI_STATUS_INVALID_ARGS; } - - const size_t status_u = static_cast(status); - switch (status_u) { - case RDC_STATUS_SUCCESS: - *status_string = - "RDC_STATUS_SUCCESS: The function has been executed" - " successfully."; - break; - - case RDC_RSMI_STATUS_INVALID_ARGS: - *status_string = - "RDC_RSMI_STATUS_INVALID_ARGS: The provided arguments do not" - " meet the preconditions required for calling this function."; - break; - - case RDC_RSMI_STATUS_NOT_SUPPORTED: - *status_string = - "RDC_RSMI_STATUS_NOT_SUPPORTED: This function is not" - " supported in the current environment."; - break; - - case RDC_RSMI_STATUS_FILE_ERROR: - 
*status_string = - "RDC_RSMI_STATUS_FILE_ERROR: There was an error in finding or" - " opening a file or directory. The operation may not be supported by " - "this Linux kernel version."; - break; - - case RDC_RSMI_STATUS_PERMISSION: - *status_string = - "RDC_RSMI_STATUS_PERMISSION: The user ID of the calling" - " process does not have sufficient permission to execute a command." - " Often this is fixed by running as root (sudo)."; - break; - - case RDC_RSMI_STATUS_OUT_OF_RESOURCES: - *status_string = - "RDC_RSMI_STATUS_OUT_OF_RESOURCES: Unable to acquire " - "memory or other resource"; - break; - - case RDC_RSMI_STATUS_INTERNAL_EXCEPTION: - *status_string = - "RDC_RSMI_STATUS_INTERNAL_EXCEPTION: An internal " - "exception was caught"; - break; - - case RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: - *status_string = - "RDC_RSMI_STATUS_INPUT_OUT_OF_BOUNDS: The provided " - "input is out of allowable or safe range"; - break; - - case RDC_RSMI_STATUS_INIT_ERROR: - *status_string = - "RDC_RSMI_STATUS_INIT_ERROR: An error occurred during " - "initialization, during " - "monitor discovery or when when initializing internal data structures"; - break; - - case RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: - *status_string = - "RDC_RSMI_STATUS_NOT_YET_IMPLEMENTED: The called " - "function has not been implemented in this " - "system for this device type"; - break; - - case RDC_RSMI_STATUS_NOT_FOUND: - *status_string = - "RDC_RSMI_STATUS_NOT_FOUND: An item required to " - "complete the call was not found"; - break; - - case RDC_RSMI_STATUS_INSUFFICIENT_SIZE: - *status_string = - "RDC_RSMI_STATUS_INSUFFICIENT_SIZE: Not enough " - "resources were available to fully execute" - " the call"; - break; - - case RDC_RSMI_STATUS_UNKNOWN_ERROR: - *status_string = - "An unknown error prevented the call from completing" - " successfully"; - break; - - case RDC_RSMI_STATUS_INTERRUPT: - *status_string = - "RDC_RSMI_STATUS_INTERRUPT An interrupt occurred while " - "executing the function"; - break; - - case 
RDC_STATUS_GRPC_CANCELLED: - *status_string = - "RDC_STATUS_GRPC_CANCELLED The operation was cancelled (typically by " - "the caller)."; - break; - - case RDC_STATUS_GRPC_UNKNOWN: - *status_string = - "RDC_STATUS_GRPC_UNKNOWN Unknown error. An example of where this error" - " may be returned is if a" - "Status value received from another address space belongs to an error-" - "space that is not known in this address space. Also errors raised by " - "APIs that do not return enough error information may be converted to " - "this error."; - break; - - case RDC_STATUS_GRPC_INVALID_ARG: - *status_string = - "RDC_STATUS_GRPC_INVALID_ARG Client specified an invalid argument. " - "Note that this differs from" - "FAILED_PRECONDITION. INVALID_ARGUMENT indicates arguments that are " - "problematic regardless of the state of the system (e.g., a malformed " - "file name)."; - break; - - case RDC_STATUS_GRPC_DEADLINE_EXCEEDED: - *status_string = - "RDC_STATUS_GRPC_DEADLINE_EXCEEDED Deadline expired before operation " - "could complete. For operations that" - "change the state of the system, this error may be returned even if " - "the operation has completed successfully. For example, a successful " - "response from a server could have been delayed long enough for the " - "deadline to expire."; - break; - - case RDC_STATUS_GRPC_NOT_FOUND: - *status_string = - "RDC_STATUS_GRPC_NOT_FOUND Some requested entity (e.g., file or " - "directory) was not found."; - break; - - case RDC_STATUS_GRPC_ALREADY_EXISTS: - *status_string = - "RDC_STATUS_GRPC_ALREADY_EXISTS Some entity that we " - "attempted to create " - "(e.g., file or directory) already exists."; - break; - - case RDC_STATUS_GRPC_PERM_DENIED: - *status_string = - "RDC_STATUS_GRPC_PERM_DENIED The caller does not have permission to " - "execute the specified operation." - "PERMISSION_DENIED must not be used for rejections caused by " - "exhausting some resource (use RESOURCE_EXHAUSTED instead for those " - "errors). 
PERMISSION_DENIED must not be used if the caller can not " - " be identified (use UNAUTHENTICATED instead for those errors)."; - break; - - case RDC_STATUS_GRPC_UNAUTHENTICATED: - *status_string = - "RDC_STATUS_GRPC_UNAUTHENTICATED The request does not have valid " - "authentication credentials for the operation."; - break; - - case RDC_STATUS_GRPC_RESOURCE_EXHAUSTED: - *status_string = - "RDC_STATUS_GRPC_RESOURCE_EXHAUSTED Some resource has been exhausted, " - "perhaps a per-user quota, or perhaps the " - "entire file system is out of space."; - break; - - case RDC_STATUS_GRPC_FAILED_PRECOND: - *status_string = - "RDC_STATUS_GRPC_FAILED_PRECOND Operation was rejected because the " - "system is not in a state required for " - "the operation's execution. For example, directory to be deleted may " - "be non-empty, an rmdir operation is applied to a non-directory, etc.\n" - "A litmus test that may help a service implementor in deciding " - "between FAILED_PRECONDITION, ABORTED, and UNAVAILABLE:\n" - " (a) Use UNAVAILABLE if the client can retry just the failing call.\n" - " (b) Use ABORTED if the client should retry at a higher-level " - " (e.g., restarting a read-modify-write sequence).\n" - " (c) Use FAILED_PRECONDITION if the client should not retry until" - " the system state has been explicitly fixed. E.g., if an \"rmdir\"" - " fails because the directory is non-empty, FAILED_PRECONDITION" - " should be returned since the client should not retry unless" - " they have first fixed up the directory by deleting files from it.\n" - " (d) Use FAILED_PRECONDITION if the client performs conditional" - " REST Get/Update/Delete on a resource and the resource on the" - " server does not match the condition. 
E.g., conflicting" - " read-modify-write on the same resource."; - break; - - case RDC_STATUS_GRPC_ABORTED: - *status_string = - "RDC_STATUS_GRPC_ABORTED The operation was aborted, " - "typically due to a concurrency issue like " - "sequencer check failures, transaction aborts, etc.\n" - "See litmus test above for deciding between " - "FAILED_PRECONDITION, ABORTED, " - "and UNAVAILABLE."; - break; - - case RDC_STATUS_GRPC_OUT_OF_RANGE: - *status_string = - "RDC_STATUS_GRPC_OUT_OF_RANGE Operation was attempted " - "past the valid range. E.g., seeking or reading " - "past end of file.\n" - "Unlike INVALID_ARGUMENT, this error indicates a " - "problem that may be fixed " - "if the system state changes. For example, a 32-bit file system will " - "generate INVALID_ARGUMENT if asked to read " - "at an offset that is not in the " - "range [0,2^32-1], but it will generate " - "OUT_OF_RANGE if asked to read from " - "an offset past the current file size.\n" - "There is a fair bit of overlap between FAILED_PRECONDITION and " - "OUT_OF_RANGE. We recommend using OUT_OF_RANGE " - "(the more specific error) " - "when it applies so that callers who are " - "iterating through a space can " - "easily look for an OUT_OF_RANGE error to detect when they are done."; - break; - - case RDC_STATUS_GRPC_UNIMPLEMENTED: - *status_string = - "RDC_STATUS_GRPC_UNIMPLEMENTED Operation is not " - "implemented or not supported/enabled in this service."; - break; - - case RDC_STATUS_GRPC_INTERNAL: - *status_string = - "RDC_STATUS_GRPC_INTERNAL Internal errors. This means " - "some invariants expected by underlying System has " - "been broken. If you see one of these errors."; - break; - - case RDC_STATUS_GRPC_UNAVAILABLE: - *status_string = - "RDC_STATUS_GRPC_UNAVAILABLE The service is currently unavailable. 
" - "This is a most likely a transient " - "condition and may be corrected by retrying with a backoff.\n" - "Warning: Although data MIGHT not have been transmitted when this " - "status occurs, there is NOT A GUARANTEE that the server has not seen " - "anything. So in general it is unsafe to retry on this status code " - "if the call is non-idempotent. " - "See litmus test above for deciding between " - "FAILED_PRECONDITION, ABORTED," - "and UNAVAILABLE."; - break; - - case RDC_STATUS_GRPC_DATA_LOSS: - *status_string = "RDC_STATUS_GRPC_DATA_LOSS Unrecoverable data loss or corruption."; - break; - - case RDC_STATUS_UNKNOWN_ERROR: - *status_string = "RDC_STATUS_UNKNOWN_ERROR An unknown RDC error occurred."; - break; - - case RDC_STATUS_CLIENT_ERR_SSL: - *status_string = "An error occurred when executing SSL authentication operations."; - break; - - default: - *status_string = - "RDC_RSMI_STATUS_UNKNOWN_ERROR An " - "unknown error occurred"; - return RDC_RSMI_STATUS_UNKNOWN_ERROR; - } - return RDC_STATUS_SUCCESS; - CATCH -} diff --git a/projects/rdc/client/src/rdc_client_main.cc b/projects/rdc/client/src/rdc_client_main.cc deleted file mode 100644 index 978bacaa2c..0000000000 --- a/projects/rdc/client/src/rdc_client_main.cc +++ /dev/null @@ -1,177 +0,0 @@ - -/* -Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "rdc/rdc_client_main.h" - -#include -#include - -#include - -#include "common/rdc_utils.h" -#include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_client.h" - -namespace amd { -namespace rdc { - -#ifdef USE_PINNED_CERTS -// Pinned certificates -static const char* kDefaultRDCServerCertPinPath = "/etc/rdc/server/rdc_server.crt"; -static const char* kDefaultRDCClientKeyPinPath = "/etc/rdc/client/private/rdc_client.key"; -static const char* kDefaultRDCClientCertPinPath = "/etc/rdc/client/rdc_client.crt"; -#endif // USE_PINNED_CERTS - -// PKI certificates -static const char* kDefaultRDCClientCertKeyPkiPath = "/etc/rdc/client/private/rdc_client_cert.key"; -static const char* kDefaultRDCClientCertPemPkiPath = "/etc/rdc/client/certs/rdc_client_cert.pem"; -static const char* kDefaultRDCClientCACertPemPkiPath = "/etc/rdc/client/certs/rdc_cacert.pem"; - -RDCChannel::RDCChannel(std::string server_ip, std::string server_port, bool secure) - : server_ip_(server_ip), server_port_(server_port), secure_channel_(secure) {} - -RDCChannel::~RDCChannel() {} - -#ifdef USE_PINNED_CERTS -static int ConstructSSLOptsPin(grpc::SslCredentialsOptions* ssl_opts) { - assert(ssl_opts != nullptr); - if (ssl_opts == nullptr) { - return -EINVAL; - } - - // Ensure the required paths exists before going forward - // TODO(cfreehil): override these defaults with values read from config - // file - if (!amd::rdc::FileExists(kDefaultRDCClientKeyPinPath) || - !amd::rdc::FileExists(kDefaultRDCServerCertPinPath) || - 
!amd::rdc::FileExists(kDefaultRDCClientCertPinPath)) { - return -ENOENT; - } - - std::string cli_key; - std::string ser_crt; - std::string cli_crt; - int ret; - ret = amd::rdc::ReadFile(kDefaultRDCClientKeyPinPath, &cli_key); - if (ret) { - return ret; - } - ret = amd::rdc::ReadFile(kDefaultRDCServerCertPinPath, &ser_crt); - if (ret) { - return ret; - } - ret = amd::rdc::ReadFile(kDefaultRDCClientCertPinPath, &cli_crt); - if (ret) { - return ret; - } - - ssl_opts->pem_root_certs = ser_crt; - ssl_opts->pem_private_key = cli_key; - ssl_opts->pem_cert_chain = cli_crt; - - return 0; -} -#endif // USE_PINNED_CERTS - -static int ConstructSSLOptsPKI(grpc::SslCredentialsOptions* ssl_opts) { - assert(ssl_opts != nullptr); - if (ssl_opts == nullptr) { - return -EINVAL; - } - - // Ensure the required paths exists before going forward - // TODO(cfreehil): override these defaults with values read from config - // file - if (!amd::rdc::FileExists(kDefaultRDCClientCertKeyPkiPath) || - !amd::rdc::FileExists(kDefaultRDCClientCertPemPkiPath) || - !amd::rdc::FileExists(kDefaultRDCClientCACertPemPkiPath)) { - return -ENOENT; - } - - std::string pem_root_certs; - std::string pem_private_key; - std::string pem_cert_chain; - int ret; - ret = amd::rdc::ReadFile(kDefaultRDCClientCACertPemPkiPath, &pem_root_certs); - if (ret) { - return ret; - } - ret = amd::rdc::ReadFile(kDefaultRDCClientCertKeyPkiPath, &pem_private_key); - if (ret) { - return ret; - } - ret = amd::rdc::ReadFile(kDefaultRDCClientCertPemPkiPath, &pem_cert_chain); - if (ret) { - return ret; - } - - ssl_opts->pem_root_certs = pem_root_certs; - ssl_opts->pem_private_key = pem_private_key; - ssl_opts->pem_cert_chain = pem_cert_chain; - - return 0; -} - -rdc_status_t RDCChannel::Initialize(void) { - assert(!server_port_.empty()); - assert(!server_ip_.empty()); - - int ret; - std::string addr_str = server_ip() + ":"; - addr_str += server_port(); - - if (secure_channel_) { - grpc::SslCredentialsOptions ssl_opts; - -#ifdef 
USE_PINNED_CERTS - ret = ConstructSSLOptsPin(&ssl_opts); -#else - ret = ConstructSSLOptsPKI(&ssl_opts); -#endif - if (ret) { - std::cerr << "Failed to process OpenSSL keys and certificates." << std::endl; - return RDC_STATUS_CLIENT_ERR_SSL; - } - - channel_creds_ = grpc::SslCredentials(ssl_opts); - channel_ = grpc::CreateChannel(addr_str, channel_creds_); - } else { - channel_ = ::grpc::CreateChannel(addr_str, grpc::InsecureChannelCredentials()); - } - - rsmi_stub_ = ::rdc::Rsmi::NewStub(channel_); - if (rsmi_stub_ == nullptr) { - return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED; - } - - rdc_admin_stub_ = ::rdc::RdcAdmin::NewStub(channel_); - if (rdc_admin_stub_ == nullptr) { - return RDC_STATUS_GRPC_RESOURCE_EXHAUSTED; - } - - // Test to see if we can connect to server; if not, return err. - return RDC_STATUS_SUCCESS; -} - -} // namespace rdc -} // namespace amd diff --git a/projects/rdc/client/src/rdc_client_utils.cc b/projects/rdc/client/src/rdc_client_utils.cc deleted file mode 100644 index 151ab1aa1c..0000000000 --- a/projects/rdc/client/src/rdc_client_utils.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "rdc/rdc_client_utils.h" - -#include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_client.h" - -namespace amd { -namespace rdc { - -rdc_status_t GrpcErrorToRdcError(grpc::StatusCode grpc_err) { - uint32_t grpc_err_int = static_cast(grpc_err); - uint32_t rdc_grpc_base_int = static_cast(RDC_STATUS_GRPC_ERR_FIRST); - uint32_t rdc_err_int = grpc_err_int + rdc_grpc_base_int; - - return static_cast(rdc_err_int); -} - -} // namespace rdc -} // namespace amd diff --git a/projects/rdc/docs/user_guide/api.md b/projects/rdc/docs/user_guide/api.md index 6d269fe7a0..958bf1cd69 100644 --- a/projects/rdc/docs/user_guide/api.md +++ b/projects/rdc/docs/user_guide/api.md @@ -12,9 +12,9 @@ The RDC includes the following libraries: • librdc_client.so: Exposes RDC functionality using gRPC client. -• librdc.so: RDC API. This depends on librocm_smi.so. +• librdc.so: RDC API. This depends on libamd_smi.so. -• librocm_smi.so: Stateless low overhead access to GPU data. +• libamd_smi.so: Stateless low overhead access to GPU data. ![Libraries](../data/api_libs.png) diff --git a/projects/rdc/docs/user_guide/handbook.md b/projects/rdc/docs/user_guide/handbook.md index 48c60fc70d..f3c79cfac2 100644 --- a/projects/rdc/docs/user_guide/handbook.md +++ b/projects/rdc/docs/user_guide/handbook.md @@ -10,7 +10,7 @@ NOTE: The RDC tool is tested on the following software versions. 
Earlier version • g++ (5.4.0) -• AMD ROCm, which includes AMD ROCm SMI Library +• AMD ROCm, which includes AMD AMDSMI Library • gRPC and protoc diff --git a/projects/rdc/docs/user_guide/install.md b/projects/rdc/docs/user_guide/install.md index 47f0ca0f38..351736fdd3 100644 --- a/projects/rdc/docs/user_guide/install.md +++ b/projects/rdc/docs/user_guide/install.md @@ -68,7 +68,7 @@ RDC Command Line Tool (rdci) A command-line tool to invoke all the features of the RDC tool. This CLI can be run locally or remotely. -ROCm-SMI Library +AMDSMI Library A stateless system management library that provides low-level interfaces to access GPU information diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index 9aa3dc0d58..e25cf901ef 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -257,7 +257,7 @@ typedef enum { //!< represents 32 bytes // "Composite" events. These events have additional processing beyond - // the value provided by the rocm_smi library. + // the value provided by the amd_smi library. 
RDC_EVNT_XGMI_0_THRPUT = 1500, //!< Transmit throughput to XGMI //!< neighbor 0 in byes/sec RDC_EVNT_XGMI_1_THRPUT, //!< Transmit throughput to XGMI diff --git a/projects/rdc/include/rdc_lib/RdcMetricFetcher.h b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h index 6305e529fb..b0e7398449 100644 --- a/projects/rdc/include/rdc_lib/RdcMetricFetcher.h +++ b/projects/rdc/include/rdc_lib/RdcMetricFetcher.h @@ -34,8 +34,8 @@ namespace rdc { class RdcMetricFetcher { public: - virtual rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) = 0; - virtual rdc_status_t delete_rsmi_handle(RdcFieldKey fk) = 0; + virtual rdc_status_t acquire_smi_handle(RdcFieldKey fk) = 0; + virtual rdc_status_t delete_smi_handle(RdcFieldKey fk) = 0; virtual rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) = 0; diff --git a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h index a0ddbe44c8..935ac788f7 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -29,9 +29,9 @@ THE SOFTWARE. #include // NOLINT(build/c++11) #include +#include "amd_smi/amdsmi.h" #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/rdc_common.h" -#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { @@ -44,20 +44,20 @@ struct MetricValue { rdc_field_value value; }; -// This union represents any RSMI handles require initialization and/or +// This union represents any SMI handles require initialization and/or // shut down. There should only be one instance of this for each raw event // used. For example, if a field group includes a pseudo-event and the -// underlying raw event, then only one FieldRSMIData should be created, +// underlying raw event, then only one FieldSMIData should be created, // and it should be used by both events. 
-struct FieldRSMIData { +struct FieldSMIData { union { - rsmi_event_handle_t evt_handle; + amdsmi_event_handle_t evt_handle; }; union { - rsmi_counter_value_t counter_val; + amdsmi_counter_value_t counter_val; }; - ~FieldRSMIData() {} - FieldRSMIData() : evt_handle(0), counter_val{0, 0, 0} {} + ~FieldSMIData() {} + FieldSMIData() : evt_handle(0), counter_val{0, 0, 0} {} }; //!< The data structure to store the async fetch task @@ -77,11 +77,11 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher { RdcMetricFetcherImpl(); ~RdcMetricFetcherImpl(); - rdc_status_t acquire_rsmi_handle(RdcFieldKey fk) override; - rdc_status_t delete_rsmi_handle(RdcFieldKey fk) override; + rdc_status_t acquire_smi_handle(RdcFieldKey fk) override; + rdc_status_t delete_smi_handle(RdcFieldKey fk) override; private: - std::shared_ptr get_rsmi_data(RdcFieldKey key); + std::shared_ptr get_smi_data(RdcFieldKey key); uint64_t now(); void get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value); @@ -92,7 +92,7 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher { //!< Async metric retreive std::map async_metrics_; - std::map> rsmi_data_; + std::map> smi_data_; std::queue updated_tasks_; std::mutex task_mutex_; std::future updater_; // keep the future of updater @@ -100,8 +100,6 @@ class RdcMetricFetcherImpl : public RdcMetricFetcher { std::atomic task_started_; }; -rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi); - } // namespace rdc } // namespace amd diff --git a/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h b/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h index cece59d0ff..f56d1eb294 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcSmiDiagnosticImpl.h @@ -24,9 +24,8 @@ THE SOFTWARE. 
#include #include +#include "amd_smi/amdsmi.h" #include "rdc/rdc.h" -#include "rdc_lib/rdc_common.h" -#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { @@ -35,23 +34,23 @@ class RdcSmiDiagnosticImpl { public: RdcSmiDiagnosticImpl(); - rdc_status_t check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, - rdc_diag_test_result_t* result); - rdc_status_t check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_status_t check_smi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result); + rdc_status_t check_smi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, + rdc_diag_test_result_t* result); + rdc_status_t check_smi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, rdc_diag_test_result_t* result); - rdc_status_t check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], uint32_t gpu_count, - rdc_diag_test_result_t* result); private: - rdc_diag_result_t check_temperature_level(uint32_t gpu_index, rsmi_temperature_type_t type, + rdc_diag_result_t check_temperature_level(uint32_t gpu_index, amdsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH], char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); - std::string get_temperature_string(rsmi_temperature_type_t type) const; + std::string get_temperature_string(amdsmi_temperature_type_t type) const; - rdc_diag_result_t check_voltage_level(uint32_t gpu_index, rsmi_voltage_type_t type, + rdc_diag_result_t check_voltage_level(uint32_t gpu_index, amdsmi_voltage_type_t type, char msg[MAX_DIAG_MSG_LENGTH], char per_gpu_msg[MAX_DIAG_MSG_LENGTH]); - std::string get_voltage_string(rsmi_voltage_type_t type) const; + std::string get_voltage_string(amdsmi_voltage_type_t type) const; }; typedef std::shared_ptr RdcSmiDiagnosticPtr; diff --git a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h index 
6635f683a3..8c8863f6bb 100644 --- a/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h +++ b/projects/rdc/include/rdc_lib/impl/RdcWatchTableImpl.h @@ -30,13 +30,13 @@ THE SOFTWARE. #include #include +#include "amd_smi/amdsmi.h" #include "rdc_lib/RdcCacheManager.h" #include "rdc_lib/RdcGroupSettings.h" #include "rdc_lib/RdcMetricFetcher.h" #include "rdc_lib/RdcModuleMgr.h" #include "rdc_lib/RdcNotification.h" #include "rdc_lib/RdcWatchTable.h" -#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { diff --git a/projects/rdc/include/rdc_lib/impl/RsmiUtils.h b/projects/rdc/include/rdc_lib/impl/SmiUtils.h similarity index 80% rename from projects/rdc/include/rdc_lib/impl/RsmiUtils.h rename to projects/rdc/include/rdc_lib/impl/SmiUtils.h index 92044a107d..0ea34d44e9 100644 --- a/projects/rdc/include/rdc_lib/impl/RsmiUtils.h +++ b/projects/rdc/include/rdc_lib/impl/SmiUtils.h @@ -23,12 +23,16 @@ THE SOFTWARE. #ifndef INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ #define INCLUDE_RDC_LIB_IMPL_RSMIUTILS_H_ -#include "rocm_smi/rocm_smi.h" +#include "amd_smi/amdsmi.h" +#include "rdc/rdc.h" namespace amd { namespace rdc { -rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi); +rdc_status_t Smi2RdcError(amdsmi_status_t rsmi); +amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, + amdsmi_processor_handle* processor_handle); +amdsmi_status_t get_processor_count(uint32_t& all_processor_count); } // namespace rdc } // namespace amd diff --git a/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h b/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h index 96dc669a24..df05f8d6bd 100644 --- a/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h +++ b/projects/rdc/include/rdc_modules/rdc_rocr/TestBase.h @@ -63,7 +63,7 @@ class TestBase : public RdcRocrBase { const std::string& get_per_gpu_info() const { return per_gpu_info_; } hsa_status_t FindGPUIndex(hsa_agent_t agent, void* data); - // Return the agent by GPU index in rocm_smi + // Return the agent by GPU index in amd_smi 
hsa_status_t get_agent_by_gpu_index(uint32_t gpu_index, hsa_agent_t* agent); protected: diff --git a/projects/rdc/protos/rdc.proto b/projects/rdc/protos/rdc.proto index 6d2bbc5363..a31460db0e 100755 --- a/projects/rdc/protos/rdc.proto +++ b/projects/rdc/protos/rdc.proto @@ -28,88 +28,6 @@ syntax = "proto3"; package rdc; -/****************************************************************************/ -/********************************** Rsmi Service ****************************/ -/****************************************************************************/ -service Rsmi { - // RSMI ID services - rpc GetNumDevices (GetNumDevicesRequest) returns(GetNumDevicesResponse) {} - - // RSMI Physical Queries - rpc GetTemperature(GetTemperatureRequest) returns(GetTemperatureResponse){} - rpc GetFanRpms(GetFanRpmsRequest) returns(GetFanRpmsResponse){} - rpc GetFanSpeed(GetFanSpeedRequest) returns(GetFanSpeedResponse){} - rpc GetFanSpeedMax(GetFanSpeedMaxRequest) returns(GetFanSpeedMaxResponse){} -} - -/* rsmi_num_monitor_devices() */ -message GetNumDevicesRequest { -} -message GetNumDevicesResponse { - uint64 val = 1; - uint64 ret_val = 2; -} - -/* GetTemperature */ -/* rsmi_dev_temp_metric_get() */ -message GetTemperatureRequest { - uint32 dv_ind = 1; - uint32 sensor_type = 2; - enum TemperatureMetric { - RSMI_TEMP_CURRENT = 0; - RSMI_TEMP_MAX = 1; - RSMI_TEMP_MIN = 2; - RSMI_TEMP_MAX_HYST = 3; - RSMI_TEMP_MIN_HYST = 4; - RSMI_TEMP_CRITICAL = 5; - RSMI_TEMP_CRITICAL_HYST = 6; - RSMI_TEMP_EMERGENCY = 7; - RSMI_TEMP_EMERGENCY_HYST = 8; - RSMI_TEMP_CRIT_MIN = 9; - RSMI_TEMP_CRIT_MIN_HYST = 10; - RSMI_TEMP_OFFSET = 11; - RSMI_TEMP_LOWEST = 12; - RSMI_TEMP_HIGHEST = 13; - } - TemperatureMetric metric = 3; -} -message GetTemperatureResponse { - int64 temperature = 1; - uint64 ret_val = 2; -} - -/* GetFanRpms */ -/* rsmi_dev_fan_rpms_get() */ -message GetFanRpmsRequest { - uint32 dv_ind = 1; - uint32 sensor_ind = 2; -} -message GetFanRpmsResponse { - int64 rpms = 1; - uint64 ret_val = 
2; -} -/* GetFanSpeed */ -/* rsmi_dev_fan_speed_get() */ -message GetFanSpeedRequest { - uint32 dv_ind = 1; - uint32 sensor_ind = 2; -} -message GetFanSpeedResponse { - int64 speed = 1; - uint64 ret_val = 2; -} - -/* GetFanSpeedMax */ -/* rsmi_dev_fan_speed_max_get() */ -message GetFanSpeedMaxRequest { - uint32 dv_ind = 1; - uint32 sensor_ind = 2; -} -message GetFanSpeedMaxResponse { - uint64 max_speed = 1; - uint64 ret_val = 2; -} - /****************************************************************************/ /********************************** RdcAdmin Service ************************/ /****************************************************************************/ diff --git a/projects/rdc/python_binding/rdc_prometheus.py b/projects/rdc/python_binding/rdc_prometheus.py index fe486c534b..00dd558602 100644 --- a/projects/rdc/python_binding/rdc_prometheus.py +++ b/projects/rdc/python_binding/rdc_prometheus.py @@ -37,13 +37,13 @@ class PrometheusReader(RdcReader): if enable_pci_id == True: try: import sys, os - # Relaive path of rocm_smi to map gpu index to PCI id - # change smi_lib_path if the rocm_smi is installed in different folder + # Relaive path of amd_smi to map gpu index to PCI id + # change smi_lib_path if the amd_smi is installed in different folder smi_lib_relative_path = "../../bin" smi_lib_path = os.path.join(sys.path[0], smi_lib_relative_path) - if os.path.exists(smi_lib_path+"/rocm_smi.py"): + if os.path.exists(smi_lib_path+"/amd_smi.py"): sys.path.append(smi_lib_path) - from rocm_smi import getBus, initializeRsmi + from amd_smi import getBus, initializeRsmi initializeRsmi() # Map between gpu indexes and PCIe bus addresses self.index_to_bus_addr = {} diff --git a/projects/rdc/rdc_libs/CMakeLists.txt b/projects/rdc/rdc_libs/CMakeLists.txt index eaa4dff634..6580355bdc 100755 --- a/projects/rdc/rdc_libs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/CMakeLists.txt @@ -46,8 +46,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) message("--------Proj 
Bld Dir: " ${PROJECT_BINARY_DIR}) message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR}) -message("--------RSMI Inc Dir: " ${RSMI_INC_DIR}) +message("---------SMI Lib Dir: " ${SMI_LIB_DIR}) +message("---------SMI Inc Dir: " ${SMI_INC_DIR}) message("") @@ -82,8 +82,8 @@ set(CPACK_PACKAGE_FILE_NAME "${RDC_PACKAGE}-${VERSION_STRING}") set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core") set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core") -# link RSMI -link_directories(${RSMI_LIB_DIR}) +# link SMI +link_directories(${SMI_LIB_DIR}) # add librdc_bootstrap.so add_subdirectory(bootstrap) diff --git a/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt b/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt index f73d1e29da..e5031391e3 100644 --- a/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt +++ b/projects/rdc/rdc_libs/bootstrap/CMakeLists.txt @@ -28,7 +28,7 @@ target_include_directories(${BOOTSTRAP_LIB} PRIVATE "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}" - "${RSMI_INC_DIR}" + "${SMI_INC_DIR}" "${ROCM_DIR}/include") target_include_directories(${BOOTSTRAP_LIB} diff --git a/projects/rdc/rdc_libs/rdc/CMakeLists.txt b/projects/rdc/rdc_libs/rdc/CMakeLists.txt index 14ad3d1167..c20ebec7d1 100644 --- a/projects/rdc/rdc_libs/rdc/CMakeLists.txt +++ b/projects/rdc/rdc_libs/rdc/CMakeLists.txt @@ -26,7 +26,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/RdcSmiLib.cc" "${SRC_DIR}/RdcTelemetryModule.cc" "${SRC_DIR}/RdcWatchTableImpl.cc" - "${SRC_DIR}/RsmiUtils.cc") + "${SRC_DIR}/SmiUtils.cc") # TODO: remove all headers? Will just dir be ok after install? 
set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} @@ -59,16 +59,16 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST} "${INC_DIR}/impl/RdcSmiLib.h" "${INC_DIR}/impl/RdcTelemetryModule.h" "${INC_DIR}/impl/RdcWatchTableImpl.h" - "${INC_DIR}/impl/RsmiUtils.h") + "${INC_DIR}/impl/SmiUtils.h") message("RDC_LIB_INC_LIST=${RDC_LIB_INC_LIST}") add_library(${RDC_LIB} SHARED ${RDC_LIB_SRC_LIST} ${RDC_LIB_INC_LIST}) -target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread rocm_smi64 cap) +target_link_libraries(${RDC_LIB} ${BOOTSTRAP_LIB} pthread amd_smi cap) target_include_directories(${RDC_LIB} PRIVATE "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" - "${RSMI_INC_DIR}") + "${SMI_INC_DIR}") # Set the VERSION and SOVERSION values set_property(TARGET ${RDC_LIB} PROPERTY diff --git a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 5a27eba907..5765ea7f04 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -23,6 +23,7 @@ THE SOFTWARE. #include +#include "amd_smi/amdsmi.h" #include "common/rdc_fields_supported.h" #include "rdc_lib/RdcException.h" #include "rdc_lib/RdcLogger.h" @@ -35,30 +36,29 @@ THE SOFTWARE. #include "rdc_lib/impl/RdcNotificationImpl.h" #include "rdc_lib/impl/RdcWatchTableImpl.h" #include "rdc_lib/rdc_common.h" -#include "rocm_smi/rocm_smi.h" namespace { -// call the rsmi_init when load library -// and rsmi_shutdown when unload the library. -class rsmi_initializer { - rsmi_initializer() { - // Make sure rsmi will not be initialized multiple times - rsmi_shut_down(); - rsmi_status_t rsmi_ret = rsmi_init(0); - if (rsmi_ret != RSMI_STATUS_SUCCESS) { - throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "RSMI initialize fail"); +// call the smi_init when load library +// and smi_shutdown when unload the library. 
+class smi_initializer { + smi_initializer() { + // Make sure smi will not be initialized multiple times + amdsmi_shut_down(); + amdsmi_status_t ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS); + if (ret != AMDSMI_STATUS_SUCCESS) { + throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail"); } } - ~rsmi_initializer() { rsmi_shut_down(); } + ~smi_initializer() { amdsmi_shut_down(); } public: - static rsmi_initializer& getInstance() { - static rsmi_initializer instance; + static smi_initializer& getInstance() { + static smi_initializer instance; return instance; } }; -static rsmi_initializer& in = rsmi_initializer::getInstance(); +static smi_initializer& in = smi_initializer::getInstance(); } // namespace amd::rdc::RdcHandler* make_handler(rdc_operation_mode_t op_mode) { diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 6f48376281..74e519a64c 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -25,42 +25,39 @@ THE SOFTWARE. 
#include #include -#include #include //NOLINT #include #include +#include "amd_smi/amdsmi.h" #include "common/rdc_capabilities.h" #include "common/rdc_fields_supported.h" #include "rdc_lib/RdcLogger.h" -#include "rdc_lib/impl/RsmiUtils.h" +#include "rdc_lib/impl/SmiUtils.h" #include "rdc_lib/rdc_common.h" -#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { -static const std::unordered_map rdc_evnt_2_rsmi_field = { - {RDC_EVNT_XGMI_0_NOP_TX, RSMI_EVNT_XGMI_0_NOP_TX}, - {RDC_EVNT_XGMI_0_REQ_TX, RSMI_EVNT_XGMI_0_REQUEST_TX}, - {RDC_EVNT_XGMI_0_RESP_TX, RSMI_EVNT_XGMI_0_RESPONSE_TX}, - {RDC_EVNT_XGMI_0_BEATS_TX, RSMI_EVNT_XGMI_0_BEATS_TX}, - {RDC_EVNT_XGMI_1_NOP_TX, RSMI_EVNT_XGMI_1_NOP_TX}, - {RDC_EVNT_XGMI_1_REQ_TX, RSMI_EVNT_XGMI_1_REQUEST_TX}, - {RDC_EVNT_XGMI_1_RESP_TX, RSMI_EVNT_XGMI_1_RESPONSE_TX}, - {RDC_EVNT_XGMI_1_BEATS_TX, RSMI_EVNT_XGMI_1_BEATS_TX}, +static const std::unordered_map rdc_evnt_2_smi_field = { + {RDC_EVNT_XGMI_0_NOP_TX, AMDSMI_EVNT_XGMI_0_NOP_TX}, + {RDC_EVNT_XGMI_0_REQ_TX, AMDSMI_EVNT_XGMI_0_REQUEST_TX}, + {RDC_EVNT_XGMI_0_RESP_TX, AMDSMI_EVNT_XGMI_0_RESPONSE_TX}, + {RDC_EVNT_XGMI_0_BEATS_TX, AMDSMI_EVNT_XGMI_0_BEATS_TX}, + {RDC_EVNT_XGMI_1_NOP_TX, AMDSMI_EVNT_XGMI_1_NOP_TX}, + {RDC_EVNT_XGMI_1_REQ_TX, AMDSMI_EVNT_XGMI_1_REQUEST_TX}, + {RDC_EVNT_XGMI_1_RESP_TX, AMDSMI_EVNT_XGMI_1_RESPONSE_TX}, + {RDC_EVNT_XGMI_1_BEATS_TX, AMDSMI_EVNT_XGMI_1_BEATS_TX}, - {RDC_EVNT_XGMI_0_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_0}, - {RDC_EVNT_XGMI_1_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_1}, - {RDC_EVNT_XGMI_2_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_2}, - {RDC_EVNT_XGMI_3_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_3}, - {RDC_EVNT_XGMI_4_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_4}, - {RDC_EVNT_XGMI_5_THRPUT, RSMI_EVNT_XGMI_DATA_OUT_5}, + {RDC_EVNT_XGMI_0_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_0}, + {RDC_EVNT_XGMI_1_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_1}, + {RDC_EVNT_XGMI_2_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_2}, + {RDC_EVNT_XGMI_3_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_3}, + {RDC_EVNT_XGMI_4_THRPUT, 
AMDSMI_EVNT_XGMI_DATA_OUT_4}, + {RDC_EVNT_XGMI_5_THRPUT, AMDSMI_EVNT_XGMI_DATA_OUT_5}, }; -RdcMetricFetcherImpl::RdcMetricFetcherImpl() { - task_started_ = true; - +RdcMetricFetcherImpl::RdcMetricFetcherImpl() : task_started_(true) { // kick off another thread for async fetch updater_ = std::async(std::launch::async, [this]() { while (task_started_) { @@ -95,37 +92,41 @@ uint64_t RdcMetricFetcherImpl::now() { void RdcMetricFetcherImpl::get_ecc_error(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) { - rsmi_status_t err = RSMI_STATUS_SUCCESS; - uint64_t correctable_err = 0; - uint64_t uncorrectable_err = 0; - rsmi_ras_err_state_t err_state; + amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; + uint64_t correctable_count = 0; + uint64_t uncorrectable_count = 0; + amdsmi_ras_err_state_t err_state; + + amdsmi_processor_handle processor_handle; + err = get_processor_handle_from_id(gpu_index, &processor_handle); if (!value) { return; } - for (uint32_t b = RSMI_GPU_BLOCK_FIRST; b <= RSMI_GPU_BLOCK_LAST; b = b * 2) { - err = rsmi_dev_ecc_status_get(gpu_index, static_cast(b), &err_state); - if (err != RSMI_STATUS_SUCCESS) { + for (uint32_t b = AMDSMI_GPU_BLOCK_FIRST; b <= AMDSMI_GPU_BLOCK_LAST; b = b * 2) { + err = + amdsmi_get_gpu_ecc_status(processor_handle, static_cast(b), &err_state); + if (err != AMDSMI_STATUS_SUCCESS) { RDC_LOG(RDC_INFO, "Get the ecc Status error " << b << ":" << err); continue; } - rsmi_error_count_t ec; - err = rsmi_dev_ecc_count_get(gpu_index, static_cast(b), &ec); + amdsmi_error_count_t ec; + err = amdsmi_get_gpu_ecc_count(processor_handle, static_cast(b), &ec); - if (err == RSMI_STATUS_SUCCESS) { - correctable_err += ec.correctable_err; - uncorrectable_err += ec.uncorrectable_err; + if (err == AMDSMI_STATUS_SUCCESS) { + correctable_count += ec.correctable_count; + uncorrectable_count += ec.uncorrectable_count; } } - value->status = RSMI_STATUS_SUCCESS; + value->status = AMDSMI_STATUS_SUCCESS; value->type = INTEGER; if (field_id == 
RDC_FI_ECC_CORRECT_TOTAL) { - value->value.l_int = correctable_err; + value->value.l_int = correctable_count; } if (field_id == RDC_FI_ECC_UNCORRECT_TOTAL) { - value->value.l_int = uncorrectable_err; + value->value.l_int = uncorrectable_count; } } @@ -166,7 +167,10 @@ bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, rdc_fie void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { uint32_t gpu_index = key.first; uint64_t sent, received, max_pkt_sz; - rsmi_status_t ret; + amdsmi_status_t ret; + + amdsmi_processor_handle processor_handle; + ret = get_processor_handle_from_id(gpu_index, &processor_handle); // Return if the cache does not expire yet do { @@ -178,7 +182,7 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { } } while (0); - ret = rsmi_dev_pci_throughput_get(gpu_index, &sent, &received, &max_pkt_sz); + ret = amdsmi_get_gpu_pci_throughput(processor_handle, &sent, &received, &max_pkt_sz); uint64_t curTime = now(); MetricValue value; @@ -207,12 +211,12 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { rx_metric->second.value.status = ret; rx_metric->second.value.ts = curTime; - if (ret == RSMI_STATUS_NOT_SUPPORTED) { + if (ret == AMDSMI_STATUS_NOT_SUPPORTED) { RDC_LOG(RDC_ERROR, "PCIe throughput not supported on GPU " << gpu_index); return; } - if (ret == RSMI_STATUS_SUCCESS) { + if (ret == AMDSMI_STATUS_SUCCESS) { rx_metric->second.value.value.l_int = received; tx_metric->second.value.value.l_int = sent; RDC_LOG(RDC_DEBUG, "Async updated " << gpu_index << ":" @@ -226,16 +230,16 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields( std::vector& results) { // NOLINT const std::set rdc_bulk_fields = { RDC_FI_GPU_CLOCK, // current_gfxclk * 1000000 - RDC_FI_MEMORY_TEMP, // temperature_mem * 1000 - RDC_FI_GPU_TEMP, // temperature_edge * 1000 - RDC_FI_POWER_USAGE, // average_socket_power * 1000000 + RDC_FI_MEMORY_TEMP, // temperature_mem + RDC_FI_GPU_TEMP, // 
temperature_edge + RDC_FI_POWER_USAGE, // average_socket_power RDC_FI_GPU_UTIL // average_gfx_activity }; // To prevent always call the bulk API even if it is not supported, // the static is used to cache last try. - static rsmi_status_t rs = RSMI_STATUS_SUCCESS; - if (rs != RSMI_STATUS_SUCCESS) { + static amdsmi_status_t rs = AMDSMI_STATUS_SUCCESS; + if (rs != AMDSMI_STATUS_SUCCESS) { results.clear(); return RDC_ST_NOT_SUPPORTED; } @@ -248,13 +252,16 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields( } } - // Call the rocm_smi_lib API to bulk fetch the data + // Call the amd_smi_lib API to bulk fetch the data auto cur_time = now(); auto ite = bulk_fields.begin(); for (; ite != bulk_fields.end(); ite++) { - rsmi_gpu_metrics_t gpu_metrics; - rs = rsmi_dev_gpu_metrics_info_get(ite->first, &gpu_metrics); - if (rs != RSMI_STATUS_SUCCESS) { + amdsmi_gpu_metrics_t gpu_metrics; + amdsmi_processor_handle processor_handle; + rs = get_processor_handle_from_id(ite->first, &processor_handle); + + rs = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics); + if (rs != AMDSMI_STATUS_SUCCESS) { results.clear(); return RDC_ST_NOT_SUPPORTED; } @@ -264,38 +271,46 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields( value.gpu_index = ite->first; value.field_value.field_id = field_id; value.field_value.type = INTEGER; - value.field_value.status = RSMI_STATUS_SUCCESS; + value.field_value.status = AMDSMI_STATUS_SUCCESS; value.field_value.ts = cur_time; switch (field_id) { case RDC_FI_GPU_CLOCK: // current_gfxclk * 1000000 value.field_value.value.l_int = - static_cast(gpu_metrics.current_gfxclk * 1000000); + static_cast(gpu_metrics.current_gfxclk) * 1000000; break; case RDC_FI_MEMORY_TEMP: // temperature_mem * 1000 - value.field_value.value.l_int = static_cast(gpu_metrics.temperature_mem * 1000); + value.field_value.value.l_int = static_cast(gpu_metrics.temperature_mem) * 1000; break; case RDC_FI_GPU_TEMP: // temperature_edge * 1000 - value.field_value.value.l_int 
= static_cast(gpu_metrics.temperature_edge * 1000); + value.field_value.value.l_int = static_cast(gpu_metrics.temperature_edge) * 1000; break; - case RDC_FI_POWER_USAGE: // average_socket_power * 1000000 - value.field_value.value.l_int = - static_cast(gpu_metrics.average_socket_power * 1000000); + case RDC_FI_POWER_USAGE: // average_socket_power + value.field_value.value.l_int = static_cast(gpu_metrics.average_socket_power); + // Use current_socket_power if average_socket_power is not available + if (value.field_value.value.l_int == 65535) { + RDC_LOG(RDC_DEBUG, "Bulk fetch " + << value.gpu_index << ":" + << "RDC_FI_POWER_USAGE fallback to current_socket_power."); + value.field_value.value.l_int = static_cast(gpu_metrics.current_socket_power); + } + // Ignore if the power is 0, which will fallback to non-bulk fetch. if (value.field_value.value.l_int == 0) { RDC_LOG(RDC_DEBUG, "Bulk fetch " << value.gpu_index << ":" << "RDC_FI_POWER_USAGE fallback to regular way."); continue; } + value.field_value.value.l_int *= 1000000; break; case RDC_FI_GPU_UTIL: // average_gfx_activity value.field_value.value.l_int = static_cast(gpu_metrics.average_gfx_activity); break; default: - value.field_value.status = RSMI_STATUS_NOT_SUPPORTED; + value.field_value.status = AMDSMI_STATUS_NOT_SUPPORTED; break; } - if (value.field_value.status == RSMI_STATUS_SUCCESS) { + if (value.field_value.status == AMDSMI_STATUS_SUCCESS) { results.push_back(value); } } @@ -304,20 +319,23 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields( return RDC_ST_OK; } -static const uint64_t kGig = 1000000000; +constexpr double kGig = 1000000000.0; rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) { if (!value) { return RDC_ST_BAD_PARAMETER; } - uint64_t i64 = 0; - rsmi_temperature_type_t sensor_type; - rsmi_clk_type_t clk_type; bool async_fetching = false; - RdcFieldKey f_key(gpu_index, field_id); - std::shared_ptr rsmi_data; - double 
coll_time_sec; + std::shared_ptr smi_data; + + amdsmi_processor_handle processor_handle = {}; + + amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_index << " error: " << ret); + return Smi2RdcError(ret); + } if (!is_field_valid(field_id)) { RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " which is not supported"); @@ -326,101 +344,121 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field value->ts = now(); value->field_id = field_id; - value->status = RSMI_STATUS_NOT_SUPPORTED; + value->status = AMDSMI_STATUS_NOT_SUPPORTED; - auto read_rsmi_counter = [&](void) { - rsmi_data = get_rsmi_data(f_key); - if (rsmi_data == nullptr) { - value->status = RSMI_STATUS_NOT_SUPPORTED; + auto read_smi_counter = [&](void) { + RdcFieldKey f_key(gpu_index, field_id); + smi_data = get_smi_data(f_key); + if (smi_data == nullptr) { + value->status = AMDSMI_STATUS_NOT_SUPPORTED; return; } - value->status = rsmi_counter_read(rsmi_data->evt_handle, &rsmi_data->counter_val); - value->value.l_int = rsmi_data->counter_val.value; + value->status = amdsmi_gpu_read_counter(smi_data->evt_handle, &smi_data->counter_val); + value->value.l_int = smi_data->counter_val.value; value->type = INTEGER; }; switch (field_id) { - case RDC_FI_GPU_MEMORY_USAGE: - value->status = rsmi_dev_memory_usage_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64); + case RDC_FI_GPU_MEMORY_USAGE: { + uint64_t u64 = 0; + value->status = amdsmi_get_gpu_memory_usage(processor_handle, AMDSMI_MEM_TYPE_VRAM, &u64); value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(i64); - } - break; - case RDC_FI_GPU_MEMORY_TOTAL: - value->status = rsmi_dev_memory_total_get(gpu_index, RSMI_MEM_TYPE_VRAM, &i64); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(i64); - } 
- break; - case RDC_FI_GPU_COUNT: - uint32_t num_gpu; - value->status = rsmi_num_monitor_devices(&num_gpu); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(num_gpu); - } - break; - case RDC_FI_POWER_USAGE: - { - RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER; - // below call should handle both socket power and regular power - value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type); - value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(i64); + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(u64); } break; } - case RDC_FI_GPU_CLOCK: - case RDC_FI_MEM_CLOCK: - rsmi_frequencies_t f; - clk_type = RSMI_CLK_TYPE_SYS; - if (field_id == RDC_FI_MEM_CLOCK) { - clk_type = RSMI_CLK_TYPE_MEM; - } - value->status = rsmi_dev_gpu_clk_freq_get(gpu_index, clk_type, &f); + case RDC_FI_GPU_MEMORY_TOTAL: { + uint64_t u64 = 0; + value->status = amdsmi_get_gpu_memory_total(processor_handle, AMDSMI_MEM_TYPE_VRAM, &u64); value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(u64); + } + break; + } + case RDC_FI_GPU_COUNT: { + uint32_t processor_count = 0; + // amdsmi is initialized in AMDSMI_INIT_AMD_GPUS mode -> returned sockets are GPUs + value->status = get_processor_count(processor_count); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(processor_count); + } + } break; + case RDC_FI_POWER_USAGE: { + amdsmi_power_info_t power_info = {}; + value->status = amdsmi_get_power_info(processor_handle, &power_info); + value->type = INTEGER; + if (value->status != AMDSMI_STATUS_SUCCESS) { + break; + } + + // Use current_socket_power if average_socket_power is not available + if (power_info.average_socket_power != 65535) { + value->value.l_int = static_cast(power_info.average_socket_power) * 
1000 * 1000; + } else { + value->value.l_int = static_cast(power_info.current_socket_power) * 1000 * 1000; + } + + break; + } + case RDC_FI_GPU_CLOCK: + case RDC_FI_MEM_CLOCK: { + amdsmi_clk_type_t clk_type = CLK_TYPE_SYS; + if (field_id == RDC_FI_MEM_CLOCK) { + clk_type = CLK_TYPE_MEM; + } + amdsmi_frequencies_t f = {}; + value->status = amdsmi_get_clk_freq(processor_handle, clk_type, &f); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { value->value.l_int = f.frequency[f.current]; } break; - case RDC_FI_GPU_UTIL: - uint32_t busy_percent; - value->status = rsmi_dev_busy_percent_get(gpu_index, &busy_percent); + } + case RDC_FI_GPU_UTIL: { + amdsmi_engine_usage_t engine_usage; + value->status = amdsmi_get_gpu_activity(processor_handle, &engine_usage); value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(busy_percent); + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(engine_usage.gfx_activity); } break; - case RDC_FI_DEV_NAME: - value->status = rsmi_dev_name_get(gpu_index, value->value.str, RDC_MAX_STR_LENGTH); + } + case RDC_FI_DEV_NAME: { + amdsmi_asic_info_t asic_info; + value->status = amdsmi_get_gpu_asic_info(processor_handle, &asic_info); value->type = STRING; - break; - case RDC_FI_GPU_TEMP: - case RDC_FI_MEMORY_TEMP: - int64_t val_i64; - sensor_type = RSMI_TEMP_TYPE_EDGE; - if (field_id == RDC_FI_MEMORY_TEMP) { - sensor_type = RSMI_TEMP_TYPE_MEMORY; + if (value->status == AMDSMI_STATUS_SUCCESS) { + memcpy(value->value.str, asic_info.market_name, sizeof(asic_info.market_name)); } - value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64); + break; + } + case RDC_FI_GPU_TEMP: + case RDC_FI_MEMORY_TEMP: { + int64_t i64 = 0; + amdsmi_temperature_type_t sensor_type = TEMPERATURE_TYPE_EDGE; + if (field_id == RDC_FI_MEMORY_TEMP) { + sensor_type = TEMPERATURE_TYPE_VRAM; + } + value->status = + 
amdsmi_get_temp_metric(processor_handle, sensor_type, AMDSMI_TEMP_CURRENT, &i64); // fallback to hotspot temperature as some card may not have edge temperature. - if (sensor_type == RSMI_TEMP_TYPE_EDGE - && value->status == RSMI_STATUS_NOT_SUPPORTED) { - sensor_type = RSMI_TEMP_TYPE_JUNCTION; - value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, - RSMI_TEMP_CURRENT, &val_i64); + if (sensor_type == TEMPERATURE_TYPE_EDGE && value->status == AMDSMI_STATUS_NOT_SUPPORTED) { + sensor_type = TEMPERATURE_TYPE_JUNCTION; + value->status = + amdsmi_get_temp_metric(processor_handle, sensor_type, AMDSMI_TEMP_CURRENT, &i64); } value->type = INTEGER; - if (value->status == RSMI_STATUS_SUCCESS) { - value->value.l_int = val_i64; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = i64 * 1000; } break; + } case RDC_FI_ECC_CORRECT_TOTAL: case RDC_FI_ECC_UNCORRECT_TOTAL: get_ecc_error(gpu_index, field_id, value); @@ -437,31 +475,33 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field case RDC_EVNT_XGMI_1_REQ_TX: case RDC_EVNT_XGMI_1_RESP_TX: case RDC_EVNT_XGMI_1_BEATS_TX: - read_rsmi_counter(); + read_smi_counter(); break; case RDC_EVNT_XGMI_0_THRPUT: case RDC_EVNT_XGMI_1_THRPUT: case RDC_EVNT_XGMI_2_THRPUT: case RDC_EVNT_XGMI_3_THRPUT: case RDC_EVNT_XGMI_4_THRPUT: - case RDC_EVNT_XGMI_5_THRPUT: - read_rsmi_counter(); + case RDC_EVNT_XGMI_5_THRPUT: { + double coll_time_sec = 0; + read_smi_counter(); if (value->status == RDC_ST_OK) { - if (rsmi_data->counter_val.time_running > 0) { - coll_time_sec = static_cast(rsmi_data->counter_val.time_running) / kGig; + if (smi_data->counter_val.time_running > 0) { + coll_time_sec = static_cast(smi_data->counter_val.time_running) / kGig; value->value.l_int = (value->value.l_int * 32) / coll_time_sec; } else { value->value.l_int = 0; } } break; + } default: break; } int64_t latency = now() - value->ts; - if (value->status != RSMI_STATUS_SUCCESS) { + if (value->status != 
AMDSMI_STATUS_SUCCESS) { if (async_fetching) { //!< Async fetching is not an error RDC_LOG(RDC_DEBUG, "Async fetch " << field_id_string(field_id)); } else { @@ -480,42 +520,45 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field << value->value.str << ", latency " << latency); } - return value->status == RSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR; + return value->status == AMDSMI_STATUS_SUCCESS ? RDC_ST_OK : RDC_ST_MSI_ERROR; } -std::shared_ptr RdcMetricFetcherImpl::get_rsmi_data(RdcFieldKey key) { - std::map>::iterator r_info = rsmi_data_.find(key); +std::shared_ptr RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) { + std::map>::iterator r_info = smi_data_.find(key); - if (r_info != rsmi_data_.end()) { + if (r_info != smi_data_.end()) { return r_info->second; } return nullptr; } -static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp, - rsmi_event_handle_t* handle) { - rsmi_status_t ret; +static rdc_status_t init_smi_counter(RdcFieldKey fk, amdsmi_event_group_t grp, + amdsmi_event_handle_t* handle) { + amdsmi_status_t ret; uint32_t counters_available; uint32_t dv_ind = fk.first; rdc_field_t f = fk.second; assert(handle != nullptr); - ret = rsmi_dev_counter_group_supported(dv_ind, grp); + amdsmi_processor_handle processor_handle; + ret = get_processor_handle_from_id(dv_ind, &processor_handle); - if (ret != RSMI_STATUS_SUCCESS) { - return Rsmi2RdcError(ret); + ret = amdsmi_gpu_counter_group_supported(processor_handle, grp); + + if (ret != AMDSMI_STATUS_SUCCESS) { + return Smi2RdcError(ret); } - ret = rsmi_counter_available_counters_get(dv_ind, grp, &counters_available); - if (ret != RSMI_STATUS_SUCCESS) { - return Rsmi2RdcError(ret); + ret = amdsmi_get_gpu_available_counters(processor_handle, grp, &counters_available); + if (ret != AMDSMI_STATUS_SUCCESS) { + return Smi2RdcError(ret); } if (counters_available == 0) { return RDC_ST_INSUFF_RESOURCES; } - rsmi_event_type_t evt = rdc_evnt_2_rsmi_field.at(f); 
+ amdsmi_event_type_t evt = rdc_evnt_2_smi_field.at(f); // Temporarily get DAC capability ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); @@ -525,12 +568,12 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp, return RDC_ST_PERM_ERROR; } - ret = rsmi_dev_counter_create(dv_ind, evt, handle); - if (ret != RSMI_STATUS_SUCCESS) { - return Rsmi2RdcError(ret); + ret = amdsmi_gpu_create_counter(processor_handle, evt, handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + return Smi2RdcError(ret); } - ret = rsmi_counter_control(*handle, RSMI_CNTR_CMD_START, nullptr); + ret = amdsmi_gpu_control_counter(*handle, AMDSMI_CNTR_CMD_START, nullptr); // Release DAC capability sc.Relinquish(); @@ -540,11 +583,11 @@ static rdc_status_t init_rsmi_counter(RdcFieldKey fk, rsmi_event_group_t grp, return RDC_ST_PERM_ERROR; } - return Rsmi2RdcError(ret); + return Smi2RdcError(ret); } -rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) { - rsmi_status_t ret; +rdc_status_t RdcMetricFetcherImpl::delete_smi_handle(RdcFieldKey fk) { + amdsmi_status_t ret; switch (fk.second) { case RDC_EVNT_XGMI_0_NOP_TX: @@ -561,52 +604,53 @@ rdc_status_t RdcMetricFetcherImpl::delete_rsmi_handle(RdcFieldKey fk) { case RDC_EVNT_XGMI_3_THRPUT: case RDC_EVNT_XGMI_4_THRPUT: case RDC_EVNT_XGMI_5_THRPUT: { - rsmi_event_handle_t h; - if (rsmi_data_.find(fk) == rsmi_data_.end()) { + amdsmi_event_handle_t h; + if (smi_data_.find(fk) == smi_data_.end()) { return RDC_ST_NOT_SUPPORTED; } - h = rsmi_data_[fk]->evt_handle; + h = smi_data_[fk]->evt_handle; // Stop counting. 
- ret = rsmi_counter_control(h, RSMI_CNTR_CMD_STOP, nullptr); - if (ret != RSMI_STATUS_SUCCESS) { - rsmi_data_.erase(fk); + ret = amdsmi_gpu_control_counter(h, AMDSMI_CNTR_CMD_STOP, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + smi_data_.erase(fk); - RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Rsmi2RdcError(ret)); - return Rsmi2RdcError(ret); + RDC_LOG(RDC_ERROR, "Error in stopping event counter: " << Smi2RdcError(ret)); + return Smi2RdcError(ret); } // Release all resources (e.g., counter and memory resources) associated // with evnt_handle. - ret = rsmi_dev_counter_destroy(h); + ret = amdsmi_gpu_destroy_counter(h); - rsmi_data_.erase(fk); - return Rsmi2RdcError(ret); + smi_data_.erase(fk); + return Smi2RdcError(ret); } default: return RDC_ST_NOT_SUPPORTED; } + return RDC_ST_OK; } -rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { +rdc_status_t RdcMetricFetcherImpl::acquire_smi_handle(RdcFieldKey fk) { rdc_status_t ret = RDC_ST_OK; - auto get_evnt_handle = [&](rsmi_event_group_t grp) { - rsmi_event_handle_t handle; + auto get_evnt_handle = [&](amdsmi_event_group_t grp) { + amdsmi_event_handle_t handle; rdc_status_t result; - if (get_rsmi_data(fk) != nullptr) { + if (get_smi_data(fk) != nullptr) { // This event has already been initialized. return RDC_ST_ALREADY_EXIST; } - result = init_rsmi_counter(fk, grp, &handle); + result = init_smi_counter(fk, grp, &handle); if (result != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Failed to init RSMI counter. Return:" << result); + RDC_LOG(RDC_ERROR, "Failed to init SMI counter. 
Return:" << result); return result; } - auto fsh = std::shared_ptr(new FieldRSMIData); + auto fsh = std::shared_ptr(new FieldSMIData); if (fsh == nullptr) { return RDC_ST_INSUFF_RESOURCES; @@ -614,7 +658,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { fsh->evt_handle = handle; - rsmi_data_[fk] = fsh; + smi_data_[fk] = fsh; return RDC_ST_OK; }; @@ -628,7 +672,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { case RDC_EVNT_XGMI_1_REQ_TX: case RDC_EVNT_XGMI_1_RESP_TX: case RDC_EVNT_XGMI_1_BEATS_TX: - ret = get_evnt_handle(RSMI_EVNT_GRP_XGMI); + ret = get_evnt_handle(AMDSMI_EVNT_GRP_XGMI); break; case RDC_EVNT_XGMI_0_THRPUT: @@ -637,7 +681,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_rsmi_handle(RdcFieldKey fk) { case RDC_EVNT_XGMI_3_THRPUT: case RDC_EVNT_XGMI_4_THRPUT: case RDC_EVNT_XGMI_5_THRPUT: - ret = get_evnt_handle(RSMI_EVNT_GRP_XGMI_DATA_OUT); + ret = get_evnt_handle(AMDSMI_EVNT_GRP_XGMI_DATA_OUT); break; default: diff --git a/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc index 90051e71a0..dac2048e16 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcModuleMgrImpl.cc @@ -21,6 +21,7 @@ THE SOFTWARE. */ #include "rdc_lib/impl/RdcModuleMgrImpl.h" +#include #include #include diff --git a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc index 72e9d40e30..47434523b0 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcNotificationImpl.cc @@ -24,35 +24,34 @@ THE SOFTWARE. 
#include #include +#include #include #include // NOLINT #include #include +#include "amd_smi/amdsmi.h" #include "common/rdc_capabilities.h" #include "rdc/rdc.h" #include "rdc_lib/RdcLogger.h" -#include "rdc_lib/impl/RdcSmiLib.h" -#include "rdc_lib/impl/RdcTelemetryModule.h" -#include "rdc_lib/impl/RsmiUtils.h" -#include "rocm_smi/rocm_smi.h" +#include "rdc_lib/impl/SmiUtils.h" namespace amd { namespace rdc { -static std::unordered_map rdc_2_rsmi_event_notif_map = { - {RDC_EVNT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_VMFAULT}, - {RDC_EVNT_NOTIF_FIRST, RSMI_EVT_NOTIF_FIRST}, - {RDC_EVNT_NOTIF_THERMAL_THROTTLE, RSMI_EVT_NOTIF_THERMAL_THROTTLE}, - {RDC_EVNT_NOTIF_PRE_RESET, RSMI_EVT_NOTIF_GPU_PRE_RESET}, - {RDC_EVNT_NOTIF_POST_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET}, +static std::unordered_map rdc_2_smi_event_notif_map = { + {RDC_EVNT_NOTIF_VMFAULT, AMDSMI_EVT_NOTIF_VMFAULT}, + {RDC_EVNT_NOTIF_FIRST, AMDSMI_EVT_NOTIF_FIRST}, + {RDC_EVNT_NOTIF_THERMAL_THROTTLE, AMDSMI_EVT_NOTIF_THERMAL_THROTTLE}, + {RDC_EVNT_NOTIF_PRE_RESET, AMDSMI_EVT_NOTIF_GPU_PRE_RESET}, + {RDC_EVNT_NOTIF_POST_RESET, AMDSMI_EVT_NOTIF_GPU_POST_RESET}, }; -static std::unordered_map rsmi_event_notif_2_rdc_map = { - {RSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT}, - {RSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST}, - {RSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE}, - {RSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET}, - {RSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET}, +static std::unordered_map smi_event_notif_2_rdc_map = { + {AMDSMI_EVT_NOTIF_VMFAULT, RDC_EVNT_NOTIF_VMFAULT}, + {AMDSMI_EVT_NOTIF_FIRST, RDC_EVNT_NOTIF_FIRST}, + {AMDSMI_EVT_NOTIF_THERMAL_THROTTLE, RDC_EVNT_NOTIF_THERMAL_THROTTLE}, + {AMDSMI_EVT_NOTIF_GPU_PRE_RESET, RDC_EVNT_NOTIF_PRE_RESET}, + {AMDSMI_EVT_NOTIF_GPU_POST_RESET, RDC_EVNT_NOTIF_POST_RESET}, }; // This const determines space allocated on stack for notification events. 
@@ -63,22 +62,22 @@ RdcNotificationImpl::RdcNotificationImpl() {} RdcNotificationImpl::~RdcNotificationImpl() {} bool RdcNotificationImpl::is_notification_event(rdc_field_t field) const { - if (rdc_2_rsmi_event_notif_map.find(field) == rdc_2_rsmi_event_notif_map.end()) { + if (rdc_2_smi_event_notif_map.find(field) == rdc_2_smi_event_notif_map.end()) { return false; } return true; } rdc_status_t RdcNotificationImpl::set_listen_events(const std::vector fk_arr) { - rsmi_status_t ret; + amdsmi_status_t ret; std::map new_masks; for (uint32_t i = 0; i < fk_arr.size(); ++i) { - if (rdc_2_rsmi_event_notif_map.find(fk_arr[i].second) == rdc_2_rsmi_event_notif_map.end()) { + if (rdc_2_smi_event_notif_map.find(fk_arr[i].second) == rdc_2_smi_event_notif_map.end()) { continue; } new_masks[fk_arr[i].first] |= - RSMI_EVENT_MASK_FROM_INDEX(rdc_2_rsmi_event_notif_map[fk_arr[i].second]); + AMDSMI_EVENT_MASK_FROM_INDEX(rdc_2_smi_event_notif_map[fk_arr[i].second]); } std::map::iterator it = new_masks.begin(); @@ -90,6 +89,15 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vectorfirst, &processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "Failed to get processor handle for GPU " << it->first << " error: " << ret); + return Smi2RdcError(ret); + } + // Temporarily get DAC capability ScopedCapability sc(CAP_DAC_OVERRIDE, CAP_EFFECTIVE); @@ -98,15 +106,15 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vectorfirst); - if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "rsmi_event_notification_init() returned " + ret = amdsmi_init_gpu_event_notification(processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "amdsmi_init_gpu_event_notification() returned " << ret << " for device " << it->first << ". 
" << std::endl << " Will not listen for events on this device"); continue; } - ret = rsmi_event_notification_mask_set(it->first, it->second); + ret = amdsmi_set_gpu_event_notification_mask(processor_handle, it->second); // Release DAC capability sc.Relinquish(); @@ -115,14 +123,14 @@ rdc_status_t RdcNotificationImpl::set_listen_events(const std::vectorfirst] = it->second; RDC_LOG(RDC_INFO, "Event notification mask for gpu " << it->first << "is set to 0x" << std::hex << it->second); } else { - RDC_LOG(RDC_INFO, - "rsmi_event_notification_mask_set() returned " << ret << " for device " << it->first); - return Rsmi2RdcError(ret); + RDC_LOG(RDC_INFO, "amdsmi_set_gpu_event_notification_mask() returned " + << ret << " for device " << it->first); + return Smi2RdcError(ret); } } return RDC_ST_OK; @@ -136,12 +144,12 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32 } uint32_t f_cnt = std::min(*num_events, kMaxRSMIEvents); - rsmi_evt_notification_data_t rsmi_events[kMaxRSMIEvents]; + amdsmi_evt_notification_data_t smi_events[kMaxRSMIEvents]; - rsmi_status_t ret = rsmi_event_notification_get(timeout_ms, &f_cnt, rsmi_events); + amdsmi_status_t ret = amdsmi_get_gpu_event_notification(timeout_ms, &f_cnt, smi_events); - if (ret != RSMI_STATUS_SUCCESS) { - return Rsmi2RdcError(ret); + if (ret != AMDSMI_STATUS_SUCCESS) { + return Smi2RdcError(ret); } struct timeval tv; gettimeofday(&tv, NULL); @@ -149,35 +157,44 @@ rdc_status_t RdcNotificationImpl::listen(rdc_evnt_notification_t* events, uint32 *num_events = f_cnt; for (uint32_t i = 0; i < f_cnt; ++i) { - assert(rsmi_event_notif_2_rdc_map.find(rsmi_events[i].event) != - rsmi_event_notif_2_rdc_map.end()); - events[i].gpu_id = rsmi_events[i].dv_ind; - events[i].field.field_id = rsmi_event_notif_2_rdc_map[rsmi_events[i].event]; + assert(smi_event_notif_2_rdc_map.find(smi_events[i].event) != smi_event_notif_2_rdc_map.end()); + uint64_t bdfid; + amdsmi_get_gpu_bdf_id(smi_events[i].processor_handle, 
&bdfid); + events[i].gpu_id = bdfid; + events[i].field.field_id = smi_event_notif_2_rdc_map[smi_events[i].event]; events[i].field.status = RDC_ST_OK; events[i].field.ts = now; events[i].field.type = STRING; - strncpy_with_null(events[i].field.value.str, rsmi_events[i].message, RDC_MAX_STR_LENGTH); + strncpy_with_null(events[i].field.value.str, smi_events[i].message, RDC_MAX_STR_LENGTH); } return RDC_ST_OK; } rdc_status_t RdcNotificationImpl::stop_listening(uint32_t gpu_id) { - rsmi_status_t ret; + amdsmi_status_t ret; - ret = rsmi_event_notification_mask_set(gpu_id, 0); - if (ret != RSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, - "rsmi_event_notification_mask_set() returned " << ret << " for device " << gpu_id); + // Get processor handle from GPU id + amdsmi_processor_handle processor_handle; + ret = get_processor_handle_from_id(gpu_id, &processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << gpu_id << " error: " << ret); + return Smi2RdcError(ret); } - ret = rsmi_event_notification_stop(gpu_id); - if (ret == RSMI_STATUS_SUCCESS) { + ret = amdsmi_set_gpu_event_notification_mask(processor_handle, 0); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "amdsmi_set_gpu_event_notification_mask() returned " << ret << " for device " + << gpu_id); + } + + ret = amdsmi_stop_gpu_event_notification(processor_handle); + if (ret == AMDSMI_STATUS_SUCCESS) { std::lock_guard guard(notif_mutex_); gpu_evnt_notif_masks_[gpu_id] = 0; } else { RDC_LOG(RDC_ERROR, - "rsmi_event_notification_stop() returned " << ret << " for device " << gpu_id); + "amdsmi_stop_gpu_event_notification() returned " << ret << " for device " << gpu_id); } return RDC_ST_OK; } diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc index 69472db791..06a200630a 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc +++ 
b/projects/rdc/rdc_libs/rdc/src/RdcSmiDiagnosticImpl.cc @@ -21,21 +21,24 @@ THE SOFTWARE. */ #include "rdc_lib/impl/RdcSmiDiagnosticImpl.h" +#include + #include #include #include +#include "rdc/rdc.h" #include "rdc_lib/RdcLogger.h" -#include "rdc_lib/impl/RsmiUtils.h" +#include "rdc_lib/impl/SmiUtils.h" #include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { RdcSmiDiagnosticImpl::RdcSmiDiagnosticImpl() {} -rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { +rdc_status_t RdcSmiDiagnosticImpl::check_smi_process_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { if (result == nullptr) { return RDC_ST_BAD_PARAMETER; } @@ -43,14 +46,14 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD result->test_case = RDC_DIAG_COMPUTE_PROCESS; result->status = RDC_DIAG_RESULT_SKIP; result->per_gpu_result_count = 0; - rsmi_status_t err = RSMI_STATUS_SUCCESS; + amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; uint32_t num_items = 0; - err = rsmi_compute_process_info_get(nullptr, &num_items); - if (err != RSMI_STATUS_SUCCESS) { + err = amdsmi_get_gpu_compute_process_info(nullptr, &num_items); + if (err != AMDSMI_STATUS_SUCCESS) { RDC_LOG(RDC_ERROR, "Fail to get process information: " << err); - strncpy_with_null(result->info, "Fail to retreive process information from rocm_smi_lib", + strncpy_with_null(result->info, "Fail to retreive process information from amd_smi_lib", MAX_DIAG_MSG_LENGTH); - return Rsmi2RdcError(err); + return Smi2RdcError(err); } // No process found @@ -63,13 +66,13 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD std::string info; // Find details of the process running on each GPU - std::vector procs(num_items); - err = - rsmi_compute_process_info_get(reinterpret_cast(&procs[0]), &num_items); - if (err != RSMI_STATUS_SUCCESS) { + 
std::vector procs(num_items); + err = amdsmi_get_gpu_compute_process_info(reinterpret_cast(&procs[0]), + &num_items); + if (err != AMDSMI_STATUS_SUCCESS) { RDC_LOG(RDC_INFO, "Fail to get process detail information: " << err); strncpy_with_null(result->info, info.c_str(), MAX_DIAG_MSG_LENGTH); - return Rsmi2RdcError(err); + return Smi2RdcError(err); } std::map> pids_per_gpu; @@ -85,17 +88,18 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD // Get the num_devices the process is running uint32_t num_devices = 0; - err = rsmi_compute_process_gpus_get(procs[i].process_id, nullptr, &num_devices); - if (err != RSMI_STATUS_SUCCESS || num_devices == 0) { + amdsmi_status_t err; + err = amdsmi_get_gpu_compute_process_gpus(procs[i].process_id, nullptr, &num_devices); + if (err != AMDSMI_STATUS_SUCCESS || num_devices == 0) { RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err); continue; } // Get the details of devices std::vector device_details(num_devices); - err = rsmi_compute_process_gpus_get( + err = amdsmi_get_gpu_compute_process_gpus( procs[i].process_id, reinterpret_cast(&device_details[0]), &num_devices); - if (err != RSMI_STATUS_SUCCESS) { + if (err != AMDSMI_STATUS_SUCCESS) { RDC_LOG(RDC_INFO, "Fail to get process GPUs detail information: " << err); continue; } @@ -147,22 +151,22 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_process_info(uint32_t gpu_index[RD return RDC_ST_OK; } -std::string RdcSmiDiagnosticImpl::get_temperature_string(rsmi_temperature_type_t type) const { +std::string RdcSmiDiagnosticImpl::get_temperature_string(amdsmi_temperature_type_t type) const { switch (type) { - case RSMI_TEMP_TYPE_EDGE: + case TEMPERATURE_TYPE_EDGE: return "Edge"; - case RSMI_TEMP_TYPE_JUNCTION: + case TEMPERATURE_TYPE_JUNCTION: return "Junction"; - case RSMI_TEMP_TYPE_MEMORY: + case TEMPERATURE_TYPE_VRAM: return "Memory"; default: return "Unknown"; } } -std::string 
RdcSmiDiagnosticImpl::get_voltage_string(rsmi_voltage_type_t type) const { +std::string RdcSmiDiagnosticImpl::get_voltage_string(amdsmi_voltage_type_t type) const { switch (type) { - case RSMI_VOLT_TYPE_VDDGFX: + case AMDSMI_VOLT_TYPE_VDDGFX: return "Vddgfx voltage"; default: return "Unknown"; @@ -170,46 +174,49 @@ std::string RdcSmiDiagnosticImpl::get_voltage_string(rsmi_voltage_type_t type) c } // Show topology type -rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { +rdc_status_t RdcSmiDiagnosticImpl::check_smi_topo_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { if (result == nullptr) { return RDC_ST_BAD_PARAMETER; } *result = {}; result->test_case = RDC_DIAG_NODE_TOPOLOGY; - const std::map link_to_string = { - {RSMI_IOLINK_TYPE_UNDEFINED, "Undefined"}, - {RSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"}, - {RSMI_IOLINK_TYPE_XGMI, "XGMI"}, - {RSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"}}; + const std::map link_to_string = { + {AMDSMI_IOLINK_TYPE_UNDEFINED, "Undefined"}, + {AMDSMI_IOLINK_TYPE_PCIEXPRESS, "PCI Express"}, + {AMDSMI_IOLINK_TYPE_XGMI, "XGMI"}, + {AMDSMI_IOLINK_TYPE_NUMIOLINKTYPES, "IO Link"}}; result->status = RDC_DIAG_RESULT_SKIP; result->per_gpu_result_count = 0; - rsmi_status_t err = RSMI_STATUS_SUCCESS; + amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; std::string info = ""; for (uint32_t i = 0; i < gpu_count; i++) { for (uint32_t j = 0; j < gpu_count; j++) { if (gpu_index[i] == gpu_index[j]) continue; + std::pair ph; + err = get_processor_handle_from_id(gpu_index[i], &ph.first); + err = get_processor_handle_from_id(gpu_index[i], &ph.second); uint64_t weight; - err = rsmi_topo_get_link_weight(gpu_index[i], gpu_index[j], &weight); - if (err != RSMI_STATUS_SUCCESS) { + err = amdsmi_topo_get_link_weight(ph.first, ph.second, &weight); + if (err != AMDSMI_STATUS_SUCCESS) { result->status = 
RDC_DIAG_RESULT_FAIL; result->details.code = err; std::string err_info = "rsmi_topo_get_link_weight("; - err_info += std::to_string(gpu_index[i]) + ","; - err_info += std::to_string(gpu_index[j]) + ", &weight)"; + err_info += std::to_string(i) + ","; + err_info += std::to_string(j) + ", &weight)"; err_info += " fail"; strncpy_with_null(result->details.msg, err_info.c_str(), MAX_DIAG_MSG_LENGTH); strncpy_with_null(result->info, err_info.c_str(), MAX_DIAG_MSG_LENGTH); return RDC_ST_MSI_ERROR; } - info += std::to_string(gpu_index[i]) + "=>"; - info += std::to_string(gpu_index[j]) + " weight:"; + info += std::to_string(i) + "=>"; + info += std::to_string(j) + " weight:"; info += std::to_string(weight) + " "; } } @@ -223,9 +230,9 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_topo_info(uint32_t gpu_index[RDC_M return RDC_ST_OK; } -rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], - uint32_t gpu_count, - rdc_diag_test_result_t* result) { +rdc_status_t RdcSmiDiagnosticImpl::check_smi_param_info(uint32_t gpu_index[RDC_MAX_NUM_DEVICES], + uint32_t gpu_count, + rdc_diag_test_result_t* result) { if (result == nullptr) { return RDC_ST_BAD_PARAMETER; } @@ -237,27 +244,27 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_ for (uint32_t i = 0; i < gpu_count; i++) { // temperature - for (rsmi_temperature_type_t sensor_type = RSMI_TEMP_TYPE_FIRST; - sensor_type != RSMI_TEMP_TYPE_LAST;) { + for (amdsmi_temperature_type_t sensor_type = TEMPERATURE_TYPE_FIRST; + sensor_type < TEMPERATURE_TYPE__MAX;) { auto status = check_temperature_level(gpu_index[i], sensor_type, result->info, result->gpu_results[i].gpu_result.msg); // Set to higher error level if (status > result->status) { result->status = status; } - sensor_type = static_cast(sensor_type + 1); + sensor_type = static_cast(sensor_type + 1); } // Voltage - for (rsmi_voltage_type_t sensor_type = RSMI_VOLT_TYPE_FIRST; - sensor_type != 
RSMI_VOLT_TYPE_LAST;) { + for (amdsmi_voltage_type_t sensor_type = AMDSMI_VOLT_TYPE_FIRST; + sensor_type < AMDSMI_VOLT_TYPE_LAST;) { auto status = check_voltage_level(gpu_index[i], sensor_type, result->info, result->gpu_results[i].gpu_result.msg); // Set to higher error level if (status > result->status) { result->status = status; } - sensor_type = static_cast(sensor_type + 1); + sensor_type = static_cast(sensor_type + 1); } result->gpu_results->gpu_index = gpu_index[i]; result->per_gpu_result_count++; @@ -266,24 +273,25 @@ rdc_status_t RdcSmiDiagnosticImpl::check_rsmi_param_info(uint32_t gpu_index[RDC_ } rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level( - uint32_t gpu_index, rsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH], + uint32_t gpu_index, amdsmi_temperature_type_t type, char msg[MAX_DIAG_MSG_LENGTH], char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) { rdc_diag_result_t result = RDC_DIAG_RESULT_PASS; - rsmi_temperature_metric_t met = RSMI_TEMP_CURRENT; - rsmi_status_t err = RSMI_STATUS_SUCCESS; + amdsmi_temperature_metric_t met = AMDSMI_TEMP_CURRENT; + amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; int64_t current_temp = 0; std::string info = msg; std::string per_gpu_info = per_gpu_msg; + amdsmi_processor_handle processor_handle; + get_processor_handle_from_id(gpu_index, &processor_handle); - err = rsmi_dev_temp_metric_get(gpu_index, type, met, ¤t_temp); - - if (err != RSMI_STATUS_SUCCESS) return result; + err = amdsmi_get_temp_metric(processor_handle, type, met, ¤t_temp); + if (err != AMDSMI_STATUS_SUCCESS) return result; // Max temperature - met = RSMI_TEMP_MAX; + met = AMDSMI_TEMP_MAX; int64_t max_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, type, met, &max_temp); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_temp_metric(processor_handle, type, met, &max_temp); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_temp >= max_temp) { result = RDC_DIAG_RESULT_WARN; per_gpu_info += "Max "; @@ -305,10 +313,10 @@ rdc_diag_result_t 
RdcSmiDiagnosticImpl::check_temperature_level( } } - met = RSMI_TEMP_MIN; + met = AMDSMI_TEMP_MIN; int64_t min_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, type, met, &min_temp); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_temp_metric(processor_handle, type, met, &min_temp); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_temp <= min_temp) { result = RDC_DIAG_RESULT_WARN; per_gpu_info += "Min "; @@ -329,10 +337,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level( } } - met = RSMI_TEMP_CRITICAL; + met = AMDSMI_TEMP_CRITICAL; int64_t critical_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_temp); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_temp_metric(processor_handle, type, met, &critical_temp); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_temp >= critical_temp) { result = RDC_DIAG_RESULT_FAIL; per_gpu_info += "Critical "; @@ -353,10 +361,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level( } } - met = RSMI_TEMP_EMERGENCY; + met = AMDSMI_TEMP_EMERGENCY; int64_t emergency_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, type, met, &emergency_temp); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_temp_metric(processor_handle, type, met, &emergency_temp); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_temp >= critical_temp) { result = RDC_DIAG_RESULT_FAIL; per_gpu_info += "Emergency "; @@ -377,10 +385,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_temperature_level( } } - met = RSMI_TEMP_CRIT_MIN; + met = AMDSMI_TEMP_CRIT_MIN; int64_t critical_min_temp = 0; - err = rsmi_dev_temp_metric_get(gpu_index, type, met, &critical_min_temp); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_temp_metric(processor_handle, type, met, &critical_min_temp); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_temp <= critical_min_temp) { result = RDC_DIAG_RESULT_FAIL; per_gpu_info += "Critical Min "; @@ -408,24 +416,26 @@ rdc_diag_result_t 
RdcSmiDiagnosticImpl::check_temperature_level( } rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index, - rsmi_voltage_type_t type, + amdsmi_voltage_type_t type, char msg[MAX_DIAG_MSG_LENGTH], char per_gpu_msg[MAX_DIAG_MSG_LENGTH]) { rdc_diag_result_t result = RDC_DIAG_RESULT_PASS; - rsmi_voltage_metric_t met = RSMI_VOLT_CURRENT; - rsmi_status_t err = RSMI_STATUS_SUCCESS; + amdsmi_voltage_metric_t met = AMDSMI_VOLT_CURRENT; + amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; int64_t current_voltage = 0; std::string info = msg; std::string per_gpu_info = per_gpu_msg; + amdsmi_processor_handle processor_handle; + get_processor_handle_from_id(gpu_index, &processor_handle); - err = rsmi_dev_volt_metric_get(gpu_index, type, met, ¤t_voltage); - if (err != RSMI_STATUS_SUCCESS) return result; + err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, ¤t_voltage); + if (err != AMDSMI_STATUS_SUCCESS) return result; // Max voltage - met = RSMI_VOLT_MAX; + met = AMDSMI_VOLT_MAX; int64_t max_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, type, met, &max_volt); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &max_volt); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_voltage >= max_volt) { result = RDC_DIAG_RESULT_WARN; per_gpu_info += "Max "; @@ -448,10 +458,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index, } // Min voltage - met = RSMI_VOLT_MIN; + met = AMDSMI_VOLT_MIN; int64_t min_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, type, met, &min_volt); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_gpu_volt_metric(processor_handle, type, met, &min_volt); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_voltage <= min_volt) { result = RDC_DIAG_RESULT_WARN; per_gpu_info += "Min "; @@ -474,10 +484,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index, } // Max Critical voltage - met = RSMI_VOLT_MAX_CRIT; + met = 
AMDSMI_VOLT_MAX_CRIT; int64_t critical_max_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_max_volt); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_gpu_volt_metric(nullptr, type, met, &critical_max_volt); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_voltage >= critical_max_volt) { result = RDC_DIAG_RESULT_FAIL; per_gpu_info += "Critical Max "; @@ -500,10 +510,10 @@ rdc_diag_result_t RdcSmiDiagnosticImpl::check_voltage_level(uint32_t gpu_index, } // Min Critical voltage - met = RSMI_VOLT_MIN_CRIT; + met = AMDSMI_VOLT_MIN_CRIT; int64_t critical_min_volt = 0; - err = rsmi_dev_volt_metric_get(gpu_index, type, met, &critical_min_volt); - if (err == RSMI_STATUS_SUCCESS) { + err = amdsmi_get_gpu_volt_metric(nullptr, type, met, &critical_min_volt); + if (err == AMDSMI_STATUS_SUCCESS) { if (current_voltage <= critical_min_volt) { result = RDC_DIAG_RESULT_FAIL; per_gpu_info += "Critical Min "; diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc index 1e69b67702..d6c9861440 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc @@ -42,8 +42,8 @@ RdcSmiLib::RdcSmiLib(const RdcMetricFetcherPtr& mf) } } -// Bulk fetch wrapper for the rocm_smi_lib. This will be replaced after -// rocm_smi_lib can support bulk fetch. +// Bulk fetch wrapper for the amd_smi_lib. This will be replaced after +// amd_smi_lib can support bulk fetch. 
rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, uint32_t fields_count, rdc_field_value_f callback, @@ -52,7 +52,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, return RDC_ST_BAD_PARAMETER; } - RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocm_smi_lib."); + RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from amd_smi_lib."); // Bulk fetch fields std::vector bulk_results; @@ -60,7 +60,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_value_get(rdc_gpu_field_t* fields, rdc_status_t status = metric_fetcher_->bulk_fetch_smi_fields(fields, fields_count, bulk_results); RDC_LOG(RDC_DEBUG, "Bulk fetched " << bulk_results.size() - << " fields from rocm_smi_lib which return " << status); + << " fields from amd_smi_lib which return " << status); if (bulk_results.size() > 0) { rdc_status_t status = callback(&bulk_results[0], bulk_results.size(), user_data); if (status != RDC_ST_OK) { @@ -116,12 +116,12 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_watch(rdc_gpu_field_t* fields, uint } for (uint32_t i = 0; i < fields_count; i++) { - ret = metric_fetcher_->acquire_rsmi_handle({fields[i].gpu_index, fields[i].field_id}); + ret = metric_fetcher_->acquire_smi_handle({fields[i].gpu_index, fields[i].field_id}); if (ret != RDC_ST_OK) { - RDC_LOG(RDC_ERROR, "Failed to acquire rocm_smi handle for field."); + RDC_LOG(RDC_ERROR, "Failed to acquire amd_smi handle for field."); } } - RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from rocm_smi_lib"); + RDC_LOG(RDC_DEBUG, "acquire " << fields_count << " field handles from amd_smi_lib"); return RDC_ST_OK; } @@ -133,9 +133,9 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_unwatch(rdc_gpu_field_t* fields, } for (uint32_t i = 0; i < fields_count; i++) { - metric_fetcher_->delete_rsmi_handle({fields[i].gpu_index, fields[i].field_id}); + metric_fetcher_->delete_smi_handle({fields[i].gpu_index, fields[i].field_id}); } - RDC_LOG(RDC_DEBUG, 
"delete " << fields_count << " field handles from rocm_smi_lib"); + RDC_LOG(RDC_DEBUG, "delete " << fields_count << " field handles from amd_smi_lib"); return RDC_ST_OK; } @@ -146,7 +146,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI return RDC_ST_BAD_PARAMETER; } - // List of fields supported by rocm_smi_lib + // List of fields supported by amd_smi_lib const std::vector fields{ RDC_FI_GPU_COUNT, RDC_FI_DEV_NAME, RDC_FI_GPU_CLOCK, RDC_FI_MEM_CLOCK, @@ -192,11 +192,11 @@ rdc_status_t RdcSmiLib::rdc_test_case_run(rdc_diag_test_cases_t test_case, } switch (test_case) { case RDC_DIAG_COMPUTE_PROCESS: - return smi_diag_->check_rsmi_process_info(gpu_index, gpu_count, result); + return smi_diag_->check_smi_process_info(gpu_index, gpu_count, result); case RDC_DIAG_NODE_TOPOLOGY: - return smi_diag_->check_rsmi_topo_info(gpu_index, gpu_count, result); + return smi_diag_->check_smi_topo_info(gpu_index, gpu_count, result); case RDC_DIAG_GPU_PARAMETERS: - return smi_diag_->check_rsmi_param_info(gpu_index, gpu_count, result); + return smi_diag_->check_smi_param_info(gpu_index, gpu_count, result); default: return RDC_ST_NOT_SUPPORTED; } diff --git a/projects/rdc/rdc_libs/rdc/src/RsmiUtils.cc b/projects/rdc/rdc_libs/rdc/src/RsmiUtils.cc deleted file mode 100644 index 63bb284faa..0000000000 --- a/projects/rdc/rdc_libs/rdc/src/RsmiUtils.cc +++ /dev/null @@ -1,72 +0,0 @@ -/* -Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "rdc/rdc.h" -#include "rocm_smi/rocm_smi.h" - -namespace amd { -namespace rdc { - -rdc_status_t Rsmi2RdcError(rsmi_status_t rsmi) { - switch (rsmi) { - case RSMI_STATUS_SUCCESS: - return RDC_ST_OK; - - case RSMI_STATUS_INVALID_ARGS: - return RDC_ST_BAD_PARAMETER; - - case RSMI_STATUS_NOT_SUPPORTED: - return RDC_ST_NOT_SUPPORTED; - - case RSMI_STATUS_NOT_FOUND: - return RDC_ST_NOT_FOUND; - - case RSMI_STATUS_OUT_OF_RESOURCES: - return RDC_ST_INSUFF_RESOURCES; - - case RSMI_STATUS_FILE_ERROR: - return RDC_ST_FILE_ERROR; - - case RSMI_STATUS_NO_DATA: - return RDC_ST_NO_DATA; - - case RSMI_STATUS_PERMISSION: - return RDC_ST_PERM_ERROR; - - case RSMI_STATUS_BUSY: - case RSMI_STATUS_UNKNOWN_ERROR: - case RSMI_STATUS_INTERNAL_EXCEPTION: - case RSMI_STATUS_INPUT_OUT_OF_BOUNDS: - case RSMI_STATUS_INIT_ERROR: - case RSMI_STATUS_NOT_YET_IMPLEMENTED: - case RSMI_STATUS_INSUFFICIENT_SIZE: - case RSMI_STATUS_INTERRUPT: - case RSMI_STATUS_UNEXPECTED_SIZE: - case RSMI_STATUS_UNEXPECTED_DATA: - case RSMI_STATUS_REFCOUNT_OVERFLOW: - default: - return RDC_ST_UNKNOWN_ERROR; - } -} - -} // namespace rdc -} // namespace amd diff --git a/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc new file mode 100644 index 0000000000..eec1af625f --- /dev/null +++ b/projects/rdc/rdc_libs/rdc/src/SmiUtils.cc @@ -0,0 +1,142 @@ +/* +Copyright (c) 2020 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "rdc_lib/impl/SmiUtils.h" + +#include +#include + +#include "amd_smi/amdsmi.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcLogger.h" + +namespace amd { +namespace rdc { + +rdc_status_t Smi2RdcError(amdsmi_status_t rsmi) { + switch (rsmi) { + case AMDSMI_STATUS_SUCCESS: + return RDC_ST_OK; + + case AMDSMI_STATUS_INVAL: + return RDC_ST_BAD_PARAMETER; + + case AMDSMI_STATUS_NOT_SUPPORTED: + return RDC_ST_NOT_SUPPORTED; + + case AMDSMI_STATUS_NOT_FOUND: + return RDC_ST_NOT_FOUND; + + case AMDSMI_STATUS_OUT_OF_RESOURCES: + return RDC_ST_INSUFF_RESOURCES; + + case AMDSMI_STATUS_FILE_ERROR: + return RDC_ST_FILE_ERROR; + + case AMDSMI_STATUS_NO_DATA: + return RDC_ST_NO_DATA; + + case AMDSMI_STATUS_NO_PERM: + return RDC_ST_PERM_ERROR; + + case AMDSMI_STATUS_BUSY: + case AMDSMI_STATUS_UNKNOWN_ERROR: + case AMDSMI_STATUS_INTERNAL_EXCEPTION: + case AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS: + case AMDSMI_STATUS_INIT_ERROR: + case AMDSMI_STATUS_NOT_YET_IMPLEMENTED: + case AMDSMI_STATUS_INSUFFICIENT_SIZE: + case AMDSMI_STATUS_INTERRUPT: + case AMDSMI_STATUS_UNEXPECTED_SIZE: + case AMDSMI_STATUS_UNEXPECTED_DATA: + case AMDSMI_STATUS_REFCOUNT_OVERFLOW: + default: + return RDC_ST_UNKNOWN_ERROR; + } +} + +amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, + amdsmi_processor_handle* processor_handle) { + uint32_t socket_count; + uint32_t processor_count; + auto ret = amdsmi_get_socket_handles(&socket_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + std::vector sockets(socket_count); + std::vector all_processors{}; + ret = amdsmi_get_socket_handles(&socket_count, sockets.data()); + for (auto& socket : sockets) { + ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + std::vector processors(processor_count); + ret = amdsmi_get_processor_handles(socket, &processor_count, processors.data()); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + + for (auto& 
processor : processors) { + processor_type_t processor_type = {}; + ret = amdsmi_get_processor_type(processor, &processor_type); + if (processor_type != AMD_GPU) { + RDC_LOG(RDC_ERROR, "Expect AMD_GPU device type!"); + return AMDSMI_STATUS_NOT_SUPPORTED; + } + all_processors.push_back(processor); + } + } + + if (gpu_id >= all_processors.size()) { + return AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS; + } + + // Get processor handle from GPU id + *processor_handle = all_processors[gpu_id]; + + return AMDSMI_STATUS_SUCCESS; +} + +amdsmi_status_t get_processor_count(uint32_t& all_processor_count) { + uint32_t total_processor_count = 0; + uint32_t socket_count; + auto ret = amdsmi_get_socket_handles(&socket_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + std::vector sockets(socket_count); + ret = amdsmi_get_socket_handles(&socket_count, sockets.data()); + for (auto& socket : sockets) { + uint32_t processor_count; + ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr); + if (ret != AMDSMI_STATUS_SUCCESS) { + return ret; + } + total_processor_count += processor_count; + } + all_processor_count = total_processor_count; + return AMDSMI_STATUS_SUCCESS; +} + +} // namespace rdc +} // namespace amd diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt index 39b09211bf..d2d03debd1 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt @@ -34,7 +34,7 @@ if(BUILD_ROCPTEST) "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}" - "${RSMI_INC_DIR}" + "${SMI_INC_DIR}" "${ROCM_DIR}/include" "${ROCM_DIR}/include/hsa") diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt index c406fe6fb5..d61c87133f 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt +++ 
b/projects/rdc/rdc_libs/rdc_modules/rdc_rocr/CMakeLists.txt @@ -45,7 +45,7 @@ if(BUILD_ROCRTEST) "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}" - "${RSMI_INC_DIR}" + "${SMI_INC_DIR}" "${ROCM_DIR}/include") # Set the VERSION and SOVERSION values diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt index 07fc5f5063..52530cf800 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rvs/CMakeLists.txt @@ -32,20 +32,18 @@ if(BUILD_RVS) find_package(hsa-runtime64 REQUIRED) find_package(rvs REQUIRED HINTS ${ROCM_DIR}/lib/cmake) - find_library(rvslib REQUIRED - NAMES rvslib) ## additional libraries set(COMBINED_LIBS rocblas hsakmt hsa-runtime64 hip::amdhip64 yaml-cpp) set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_RVS_LIB} PARENT_SCOPE) add_library(${RDC_RVS_LIB} SHARED ${RDC_RVS_LIB_SRC_LIST} ${RDC_RVS_LIB_INC_LIST}) - target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvslib} pthread dl ${COMBINED_LIBS}) + target_link_libraries(${RDC_RVS_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} ${rvs} pthread dl ${COMBINED_LIBS}) target_include_directories(${RDC_RVS_LIB} PRIVATE "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" "${COMMON_DIR}" - "${RSMI_INC_DIR}" + "${SMI_INC_DIR}" "${ROCM_DIR}/include" "${ROCM_DIR}/include/hsa" "${ROCM_VALIDATION_SUITE_INCLUDE_DIR}") diff --git a/projects/rdc/rdci/CMakeLists.txt b/projects/rdc/rdci/CMakeLists.txt index bba31d962f..264d92bcfc 100644 --- a/projects/rdc/rdci/CMakeLists.txt +++ b/projects/rdc/rdci/CMakeLists.txt @@ -34,8 +34,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR}) -message("--------RSMI Inc Dir: " 
${RSMI_INC_DIR}) +message("--------SMI Lib Dir: " ${SMI_LIB_DIR}) +message("--------SMI Inc Dir: " ${SMI_INC_DIR}) message("-------GRPC ROOT Dir: " ${GRPC_ROOT}) message("") diff --git a/projects/rdc/server/CMakeLists.txt b/projects/rdc/server/CMakeLists.txt index 6e525bb7b6..f08b0d912f 100755 --- a/projects/rdc/server/CMakeLists.txt +++ b/projects/rdc/server/CMakeLists.txt @@ -31,8 +31,8 @@ message("----------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) message("----------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) message("----------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("----------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -message("----------RSMI Lib Dir: " ${RSMI_LIB_DIR}) -message("----------RSMI Inc Dir: " ${RSMI_INC_DIR}) +message("----------SMI Lib Dir: " ${SMI_LIB_DIR}) +message("----------SMI Inc Dir: " ${SMI_INC_DIR}) message("---------GRPC Root Dir: " ${GRPC_ROOT}) message("") @@ -59,7 +59,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include "${PROJECT_SOURCE_DIR}/include" "${GRPC_ROOT}/include" "${PROTOB_OUT_DIR}" - "${RSMI_INC_DIR}" + "${SMI_INC_DIR}" "${PROJECT_SOURCE_DIR}") set(SERVER_SRC_LIST @@ -68,7 +68,6 @@ set(SERVER_SRC_LIST "${PROTOBUF_GENERATED_SRCS}" "${SRC_DIR}/rdc_admin_service.cc" "${SRC_DIR}/rdc_api_service.cc" - "${SRC_DIR}/rdc_rsmi_service.cc" "${SRC_DIR}/rdc_server_main.cc") message("SERVER_SRC_LIST=${SERVER_SRC_LIST}") @@ -76,7 +75,7 @@ set(SERVER_DAEMON_EXE "rdcd") configure_file("rdc.service.in" "${PROJECT_BINARY_DIR}/rdc.service" @ONLY) set(SERVICE_FILE_NAME "rdc.service") -link_directories(${RSMI_LIB_DIR}) +link_directories(${SMI_LIB_DIR}) add_executable(${SERVER_DAEMON_EXE} "${SERVER_SRC_LIST}") @@ -85,7 +84,7 @@ set_target_properties(${SERVER_DAEMON_EXE} PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib") target_link_libraries(${SERVER_DAEMON_EXE} pthread rt gRPC::grpc++ - cap dl rocm_smi64 rdc_bootstrap) + cap dl amd_smi rdc_bootstrap) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${SERVER_DAEMON_EXE} PERMISSIONS OWNER_EXECUTE 
OWNER_READ OWNER_WRITE GROUP_READ diff --git a/projects/rdc/server/include/rdc/rdc_admin_service.h b/projects/rdc/server/include/rdc/rdc_admin_service.h index 71730ba96d..396a0838de 100644 --- a/projects/rdc/server/include/rdc/rdc_admin_service.h +++ b/projects/rdc/server/include/rdc/rdc_admin_service.h @@ -22,9 +22,9 @@ THE SOFTWARE. #ifndef SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_ #define SERVER_INCLUDE_RDC_RDC_ADMIN_SERVICE_H_ +#include "amd_smi/amdsmi.h" #include "rdc.grpc.pb.h" // NOLINT #include "rdc/rdc_admin_service.h" -#include "rocm_smi/rocm_smi.h" namespace amd { namespace rdc { diff --git a/projects/rdc/server/include/rdc/rdc_rsmi_service.h b/projects/rdc/server/include/rdc/rdc_rsmi_service.h deleted file mode 100644 index ea68c1a0f3..0000000000 --- a/projects/rdc/server/include/rdc/rdc_rsmi_service.h +++ /dev/null @@ -1,65 +0,0 @@ -/* -Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ -#ifndef SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_ -#define SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_ - -#include "rdc.grpc.pb.h" // NOLINT -#include "rdc/rdc_rsmi_service.h" -#include "rocm_smi/rocm_smi.h" - -namespace amd { -namespace rdc { - -class RsmiServiceImpl final : public ::rdc::Rsmi::Service { - public: - RsmiServiceImpl(); - ~RsmiServiceImpl(); - - rsmi_status_t Initialize(uint64_t rsmi_init_flags = 0); - - ::grpc::Status GetNumDevices(::grpc::ServerContext* context, - const ::rdc::GetNumDevicesRequest* request, - ::rdc::GetNumDevicesResponse* reply) override; - - ::grpc::Status GetTemperature(::grpc::ServerContext* context, - const ::rdc::GetTemperatureRequest* request, - ::rdc::GetTemperatureResponse* response) override; - - ::grpc::Status GetFanRpms(::grpc::ServerContext* context, const ::rdc::GetFanRpmsRequest* request, - ::rdc::GetFanRpmsResponse* response) override; - - ::grpc::Status GetFanSpeed(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedRequest* request, - ::rdc::GetFanSpeedResponse* response) override; - - ::grpc::Status GetFanSpeedMax(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedMaxRequest* request, - ::rdc::GetFanSpeedMaxResponse* response) override; - - private: - bool rsmi_initialized_; -}; - -} // namespace rdc -} // namespace amd - -#endif // SERVER_INCLUDE_RDC_RDC_RSMI_SERVICE_H_ diff --git a/projects/rdc/server/include/rdc/rdc_server_main.h b/projects/rdc/server/include/rdc/rdc_server_main.h index 73bd4f5050..c8e7ceb16d 100644 --- a/projects/rdc/server/include/rdc/rdc_server_main.h +++ b/projects/rdc/server/include/rdc/rdc_server_main.h @@ -29,7 +29,6 @@ THE SOFTWARE. 
#include "rdc/rdc_admin_service.h" #include "rdc/rdc_api_service.h" -#include "rdc/rdc_rsmi_service.h" typedef struct { std::string listen_address; @@ -49,9 +48,6 @@ class RDCServer { void Run(void); void ShutDown(void); - bool start_rsmi_service(void) const { return start_rsmi_service_; } - void set_start_rsmi_service(bool s) { start_rsmi_service_ = s; } - bool start_rdc_admin_service(void) const { return start_rdc_admin_service_; } void set_start_rdc_admin_service(bool s) { start_rdc_admin_service_ = s; } @@ -68,8 +64,6 @@ class RDCServer { bool secure_creds_; bool use_pinned_certs_; bool log_debug_; - bool start_rsmi_service_; - amd::rdc::RsmiServiceImpl* rsmi_service_; RdcdCmdLineOpts* cmd_line_; bool start_rdc_admin_service_; diff --git a/projects/rdc/server/src/rdc_rsmi_service.cc b/projects/rdc/server/src/rdc_rsmi_service.cc deleted file mode 100644 index 431586f09a..0000000000 --- a/projects/rdc/server/src/rdc_rsmi_service.cc +++ /dev/null @@ -1,175 +0,0 @@ - -/* -Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -#include "rdc/rdc_rsmi_service.h" - -#include -#include - -#include -#include -#include -#include - -#include "rdc.grpc.pb.h" // NOLINT -#include "rocm_smi/rocm_smi.h" - -namespace amd { -namespace rdc { - -RsmiServiceImpl::RsmiServiceImpl() : rsmi_initialized_(false) {} - -RsmiServiceImpl::~RsmiServiceImpl() { - if (rsmi_initialized_) { - rsmi_status_t rsmi_ret = rsmi_shut_down(); - rsmi_initialized_ = false; - assert(rsmi_ret == RSMI_STATUS_SUCCESS); - } -} - -// rsmi and rdc currently happen to have a 1-to-1 mapping, but -// have this function in case that changes -static rsmi_temperature_metric_t rdc_temp2rsmi_temp( - ::rdc::GetTemperatureRequest_TemperatureMetric rdc_temp) { - return static_cast(rdc_temp); -} - -rsmi_status_t RsmiServiceImpl::Initialize(uint64_t rsmi_init_flags) { - rsmi_status_t rsmi_ret = rsmi_init(rsmi_init_flags); - if (rsmi_ret != RSMI_STATUS_SUCCESS) { - std::cout << "rsmi_init() returned error" << std::endl; - } else { - rsmi_initialized_ = true; - } - return rsmi_ret; -} - -::grpc::Status RsmiServiceImpl::GetNumDevices(::grpc::ServerContext* context, - const ::rdc::GetNumDevicesRequest* request, - ::rdc::GetNumDevicesResponse* reply) { - assert(reply != nullptr); - uint32_t num_devices; - - (void)context; // Quiet warning for now; - (void)request; - assert(reply != nullptr); - - rsmi_status_t ret = rsmi_num_monitor_devices(&num_devices); - - // TODO(cfreehil) replace below with macro - if (ret != RSMI_STATUS_SUCCESS) { - std::cout << "rsmi_num_monitor_devices() returned error" << std::endl; - } - reply->set_val(num_devices); - reply->set_ret_val(ret); - - return ::grpc::Status::OK; -} - -::grpc::Status RsmiServiceImpl::GetTemperature(::grpc::ServerContext* context, - const 
::rdc::GetTemperatureRequest* request, - ::rdc::GetTemperatureResponse* response) { - (void)context; // Quiet warning for now; - assert(response != nullptr); - - int64_t temperature; - rsmi_status_t ret = rsmi_dev_temp_metric_get(request->dv_ind(), request->sensor_type(), - rdc_temp2rsmi_temp(request->metric()), &temperature); - - response->set_temperature(temperature); - response->set_ret_val(ret); - return ::grpc::Status::OK; -} - -::grpc::Status RsmiServiceImpl::GetFanRpms(::grpc::ServerContext* context, - const ::rdc::GetFanRpmsRequest* request, - ::rdc::GetFanRpmsResponse* response) { - (void)context; // Quiet warning for now; - assert(response != nullptr); - - int64_t rpms; - rsmi_status_t ret = rsmi_dev_fan_rpms_get(request->dv_ind(), request->sensor_ind(), &rpms); - - response->set_rpms(rpms); - response->set_ret_val(ret); - return ::grpc::Status::OK; -} - -::grpc::Status RsmiServiceImpl::GetFanSpeed(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedRequest* request, - ::rdc::GetFanSpeedResponse* response) { - (void)context; // Quiet warning for now; - assert(response != nullptr); - - int64_t speed; - rsmi_status_t ret = rsmi_dev_fan_speed_get(request->dv_ind(), request->sensor_ind(), &speed); - - response->set_speed(speed); - response->set_ret_val(ret); - return ::grpc::Status::OK; -} - -::grpc::Status RsmiServiceImpl::GetFanSpeedMax(::grpc::ServerContext* context, - const ::rdc::GetFanSpeedMaxRequest* request, - ::rdc::GetFanSpeedMaxResponse* response) { - (void)context; // Quiet warning for now; - assert(response != nullptr); - - uint64_t max_speed; - rsmi_status_t ret = - rsmi_dev_fan_speed_max_get(request->dv_ind(), request->sensor_ind(), &max_speed); - - response->set_max_speed(max_speed); - response->set_ret_val(ret); - return ::grpc::Status::OK; -} - -// TODO(cfreehil): read server config from YAML file. Config can include things -// like server address, Secure/Insecure creds, rsmi_init flags, etc. 
-void RunServer() { - std::string server_address("0.0.0.0:50051"); - RsmiServiceImpl service; - - ::grpc::ServerBuilder builder; - // Listen on the given address without any authentication mechanism. - builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); - // Register "service" as the instance through which we'll communicate with - // clients. In this case it corresponds to an *synchronous* service. - builder.RegisterService(&service); - // Finally assemble the server. - std::unique_ptr<::grpc::Server> server(builder.BuildAndStart()); - std::cout << "Server listening on " << server_address << std::endl; - - uint64_t flags = 0; // TODO(cfreehil) Read this from config file - rsmi_status_t rsmi_ret = rsmi_init(flags); - // TODO(cfreehil): check rsmi return code - // Wait for the server to shutdown. Note that some other thread must be - // responsible for shutting down the server for this call to ever return. - if (rsmi_ret != RSMI_STATUS_SUCCESS) { - std::cout << "rsmi_init() returned error. Exiting" << std::endl; - return; - } - server->Wait(); -} - -} // namespace rdc -} // namespace amd diff --git a/projects/rdc/server/src/rdc_server_main.cc b/projects/rdc/server/src/rdc_server_main.cc index ba4add2da3..222f814df1 100644 --- a/projects/rdc/server/src/rdc_server_main.cc +++ b/projects/rdc/server/src/rdc_server_main.cc @@ -38,12 +38,11 @@ THE SOFTWARE. 
#include #include +#include "amd_smi/amdsmi.h" #include "common/rdc_capabilities.h" #include "common/rdc_utils.h" #include "rdc.grpc.pb.h" // NOLINT #include "rdc/rdc_api_service.h" -#include "rdc/rdc_rsmi_service.h" -#include "rocm_smi/rocm_smi.h" // TODO(cfreehil): // The following need to be made configurable (e.g., from YAML): @@ -76,8 +75,7 @@ static const char* kDefaultListenAddress = "0.0.0.0"; static const char* kDefaultListenPort = "50051"; static const uint32_t kRSMIUMask = 027; -RDCServer::RDCServer() - : secure_creds_(false), rsmi_service_(nullptr), rdc_admin_service_(nullptr) {} +RDCServer::RDCServer() : secure_creds_(false), rdc_admin_service_(nullptr) {} RDCServer::~RDCServer() {} @@ -195,18 +193,6 @@ void RDCServer::Run() { builder.RegisterService(rdc_admin_service_); } - if (start_rsmi_service()) { - rsmi_service_ = new amd::rdc::RsmiServiceImpl(); - builder.RegisterService(rsmi_service_); - - rsmi_status_t ret = rsmi_service_->Initialize(0); - - if (ret != RSMI_STATUS_SUCCESS) { - std::cerr << "Failed to start RSMI service. ret = " << ret << std::endl; - return; - } - } - if (start_api_service()) { api_service_ = new amd::rdc::RdcAPIServiceImpl(); builder.RegisterService(api_service_); @@ -287,11 +273,6 @@ static int FileOwner(const char* fn, std::string* owner) { void RDCServer::ShutDown(void) { server_->Shutdown(); - if (rsmi_service_) { - delete rsmi_service_; - rsmi_service_ = nullptr; - } - if (rdc_admin_service_) { delete rdc_admin_service_; rdc_admin_service_ = nullptr; @@ -673,7 +654,6 @@ int main(int argc, char** argv) { } // TODO(cfreehil): Eventually, set these by reading a config file - rdc_server.set_start_rsmi_service(true); rdc_server.set_start_rdc_admin_service(true); rdc_server.set_start_api_service(true); diff --git a/projects/rdc/src/rdc64Config.in b/projects/rdc/src/rdc64Config.in index 187c0e9386..9d0906efe3 100755 --- a/projects/rdc/src/rdc64Config.in +++ b/projects/rdc/src/rdc64Config.in @@ -26,9 +26,9 @@ THE SOFTWARE. 
// This file is generated on build. -#define rocm_smi_VERSION_MAJOR @rocm_smi_VERSION_MAJOR@ -#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@ -#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@ -#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@" +#define amd_smi_VERSION_MAJOR @amd_smi_VERSION_MAJOR@ +#define amd_smi_VERSION_MINOR @amd_smi_VERSION_MINOR@ +#define amd_smi_VERSION_PATCH @amd_smi_VERSION_PATCH@ +#define amd_smi_VERSION_BUILD "@amd_smi_VERSION_BUILD@" #endif // INCLUDE_RDC_RDC64CONFIG_H_ diff --git a/projects/rdc/tests/example/CMakeLists.txt b/projects/rdc/tests/example/CMakeLists.txt index eed3eb7ff4..a07a0924eb 100755 --- a/projects/rdc/tests/example/CMakeLists.txt +++ b/projects/rdc/tests/example/CMakeLists.txt @@ -45,7 +45,7 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -message("--------RSMI Inc Dir: " ${RSMI_INC_DIR}) +message("--------SMI Inc Dir: " ${SMI_INC_DIR}) message("") set(SRC_DIR "${PROJECT_SOURCE_DIR}/tests/example") @@ -69,7 +69,7 @@ add_executable(${TEST_CLIENT_EXE} "${EXAMPLE_SRC_LIST}") target_include_directories(${TEST_CLIENT_EXE} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../client/include" - "${RSMI_INC_DIR}") + "${SMI_INC_DIR}") target_link_libraries(${TEST_CLIENT_EXE} rdc_client) diff --git a/projects/rdc/tests/example/rdc_client_test.cc b/projects/rdc/tests/example/rdc_client_test.cc index 4c380ae31f..30a74e8dcc 100644 --- a/projects/rdc/tests/example/rdc_client_test.cc +++ b/projects/rdc/tests/example/rdc_client_test.cc @@ -28,7 +28,7 @@ THE SOFTWARE. 
#include -#include "rocm_smi/rocm_smi.h" +#include "amd_smi/amdsmi.h" #define CHK_RET_STATUS(RET) \ if ((RET) != RDC_STATUS_SUCCESS) { \ diff --git a/projects/rdc/tests/rdc_tests/CMakeLists.txt b/projects/rdc/tests/rdc_tests/CMakeLists.txt index 7d2c4e0e58..cac77fd7bd 100755 --- a/projects/rdc/tests/rdc_tests/CMakeLists.txt +++ b/projects/rdc/tests/rdc_tests/CMakeLists.txt @@ -41,8 +41,8 @@ message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) -message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR}) -message("--------RSMI Inc Dir: " ${RSMI_INC_DIR}) +message("--------SMI Lib Dir: " ${SMI_LIB_DIR}) +message("--------SMI Inc Dir: " ${SMI_INC_DIR}) message("") set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -63,7 +63,7 @@ aux_source_directory(${SRC_DIR} rdctstSources) # Other source directories aux_source_directory(${SRC_DIR}/functional functionalSources) -link_directories(${ROCM_INSTALL_DIR} ${RSMI_LIB_DIR}) +link_directories(${ROCM_INSTALL_DIR} ${SMI_LIB_DIR}) # Build rules add_executable(${RDCTST} ${rdctstSources} ${functionalSources}) @@ -72,7 +72,7 @@ add_executable(${RDCTST} ${rdctstSources} ${functionalSources}) target_include_directories( ${RDCTST} PUBLIC ${PROJECT_SOURCE_DIR}/include - PUBLIC ${RSMI_INC_DIR} + PUBLIC ${SMI_INC_DIR} PUBLIC ${SRC_DIR}/..) target_link_libraries(${RDCTST} diff --git a/projects/rdc/tests/rdc_tests/main.cc b/projects/rdc/tests/rdc_tests/main.cc index ce415cb70d..82984e7f17 100644 --- a/projects/rdc/tests/rdc_tests/main.cc +++ b/projects/rdc/tests/rdc_tests/main.cc @@ -29,6 +29,7 @@ THE SOFTWARE. #include #include +#include "amd_smi/amdsmi.h" #include "functional/rdci_discovery.h" #include "functional/rdci_dmon.h" #include "functional/rdci_fieldgroup.h" @@ -37,7 +38,6 @@ THE SOFTWARE. 
#include "rdc/rdc.h" #include "rdc_tests/test_base.h" #include "rdc_tests/test_common.h" -#include "rocm_smi/rocm_smi.h" static RDCTstGlobals* sRDCGlvalues = nullptr; diff --git a/projects/rdc/tests/rdc_tests/test_base.cc b/projects/rdc/tests/rdc_tests/test_base.cc index de52e13e50..cfa34f56b2 100644 --- a/projects/rdc/tests/rdc_tests/test_base.cc +++ b/projects/rdc/tests/rdc_tests/test_base.cc @@ -24,8 +24,8 @@ THE SOFTWARE. #include #include +#include "amd_smi/amdsmi.h" #include "rdc_tests/test_common.h" -#include "rocm_smi/rocm_smi.h" static const int kOutputLineLength = 80; static const char kLabelDelimiter[] = "####"; diff --git a/projects/rdc/tests/rdc_tests/test_common.cc b/projects/rdc/tests/rdc_tests/test_common.cc index 58b5fd2dda..8d65e1c1e9 100644 --- a/projects/rdc/tests/rdc_tests/test_common.cc +++ b/projects/rdc/tests/rdc_tests/test_common.cc @@ -30,8 +30,8 @@ THE SOFTWARE. #include #include +#include "amd_smi/amdsmi.h" #include "rdc_tests/test_base.h" -#include "rocm_smi/rocm_smi.h" /*static const std::map kGRPCChanState = { @@ -47,40 +47,40 @@ THE SOFTWARE. 
}, }; */ -static const std::map kBlockNameMap = { - {RSMI_GPU_BLOCK_UMC, "UMC"}, {RSMI_GPU_BLOCK_SDMA, "SDMA"}, - {RSMI_GPU_BLOCK_GFX, "GFX"}, {RSMI_GPU_BLOCK_MMHUB, "MMHUB"}, - {RSMI_GPU_BLOCK_ATHUB, "ATHUB"}, {RSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"}, - {RSMI_GPU_BLOCK_HDP, "HDP"}, {RSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"}, - {RSMI_GPU_BLOCK_DF, "DF"}, {RSMI_GPU_BLOCK_SMN, "SMN"}, - {RSMI_GPU_BLOCK_SEM, "SEM"}, {RSMI_GPU_BLOCK_MP0, "MP0"}, - {RSMI_GPU_BLOCK_MP1, "MP1"}, {RSMI_GPU_BLOCK_FUSE, "FUSE"}, +static const std::map kBlockNameMap = { + {AMDSMI_GPU_BLOCK_UMC, "UMC"}, {AMDSMI_GPU_BLOCK_SDMA, "SDMA"}, + {AMDSMI_GPU_BLOCK_GFX, "GFX"}, {AMDSMI_GPU_BLOCK_MMHUB, "MMHUB"}, + {AMDSMI_GPU_BLOCK_ATHUB, "ATHUB"}, {AMDSMI_GPU_BLOCK_PCIE_BIF, "PCIE_BIF"}, + {AMDSMI_GPU_BLOCK_HDP, "HDP"}, {AMDSMI_GPU_BLOCK_XGMI_WAFL, "XGMI_WAFL"}, + {AMDSMI_GPU_BLOCK_DF, "DF"}, {AMDSMI_GPU_BLOCK_SMN, "SMN"}, + {AMDSMI_GPU_BLOCK_SEM, "SEM"}, {AMDSMI_GPU_BLOCK_MP0, "MP0"}, + {AMDSMI_GPU_BLOCK_MP1, "MP1"}, {AMDSMI_GPU_BLOCK_FUSE, "FUSE"}, }; -static_assert(RSMI_GPU_BLOCK_LAST == RSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated"); +static_assert(AMDSMI_GPU_BLOCK_LAST == AMDSMI_GPU_BLOCK_FUSE, "kBlockNameMap needs to be updated"); static const char* kRasErrStateStrings[] = { - "None", // RSMI_RAS_ERR_STATE_NONE - "Disabled", // RSMI_RAS_ERR_STATE_DISABLED - "Error Unknown", // RSMI_RAS_ERR_STATE_PARITY - "Single, Correctable", // RSMI_RAS_ERR_STATE_SING_C - "Multiple, Uncorrectable", // RSMI_RAS_ERR_STATE_MULT_UC - "Poison" // RSMI_RAS_ERR_STATE_POISON - "Off", // RSMI_RAS_ERR_STATE_DISABLED - "On", // RSMI_RAS_ERR_STATE_ENABLED + "None", // AMDSMI_RAS_ERR_STATE_NONE + "Disabled", // AMDSMI_RAS_ERR_STATE_DISABLED + "Error Unknown", // AMDSMI_RAS_ERR_STATE_PARITY + "Single, Correctable", // AMDSMI_RAS_ERR_STATE_SING_C + "Multiple, Uncorrectable", // AMDSMI_RAS_ERR_STATE_MULT_UC + "Poison" // AMDSMI_RAS_ERR_STATE_POISON + "Off", // AMDSMI_RAS_ERR_STATE_DISABLED + "On", // 
AMDSMI_RAS_ERR_STATE_ENABLED }; -static_assert(sizeof(kRasErrStateStrings) / sizeof(char*) == (RSMI_RAS_ERR_STATE_LAST + 1), +static_assert(sizeof(kRasErrStateStrings) / sizeof(char*) == (AMDSMI_RAS_ERR_STATE_LAST + 1), "kErrStateNameMap needs to be updated"); -static const std::map kErrStateNameMap = { - {RSMI_RAS_ERR_STATE_NONE, kRasErrStateStrings[RSMI_RAS_ERR_STATE_NONE]}, - {RSMI_RAS_ERR_STATE_DISABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_DISABLED]}, - {RSMI_RAS_ERR_STATE_PARITY, kRasErrStateStrings[RSMI_RAS_ERR_STATE_PARITY]}, - {RSMI_RAS_ERR_STATE_SING_C, kRasErrStateStrings[RSMI_RAS_ERR_STATE_SING_C]}, - {RSMI_RAS_ERR_STATE_MULT_UC, kRasErrStateStrings[RSMI_RAS_ERR_STATE_MULT_UC]}, - {RSMI_RAS_ERR_STATE_POISON, kRasErrStateStrings[RSMI_RAS_ERR_STATE_POISON]}, - {RSMI_RAS_ERR_STATE_ENABLED, kRasErrStateStrings[RSMI_RAS_ERR_STATE_ENABLED]}, +static const std::map kErrStateNameMap = { + {AMDSMI_RAS_ERR_STATE_NONE, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_NONE]}, + {AMDSMI_RAS_ERR_STATE_DISABLED, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_DISABLED]}, + {AMDSMI_RAS_ERR_STATE_PARITY, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_PARITY]}, + {AMDSMI_RAS_ERR_STATE_SING_C, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_SING_C]}, + {AMDSMI_RAS_ERR_STATE_MULT_UC, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_MULT_UC]}, + {AMDSMI_RAS_ERR_STATE_POISON, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_POISON]}, + {AMDSMI_RAS_ERR_STATE_ENABLED, kRasErrStateStrings[AMDSMI_RAS_ERR_STATE_ENABLED]}, }; -static_assert(RSMI_RAS_ERR_STATE_LAST == RSMI_RAS_ERR_STATE_ENABLED, +static_assert(AMDSMI_RAS_ERR_STATE_LAST == AMDSMI_RAS_ERR_STATE_ENABLED, "kErrStateNameMap needs to be updated"); static const struct option long_options[] = { @@ -207,25 +207,35 @@ uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list) { return 1; } -const char* GetBlockNameStr(rsmi_gpu_block_t id) { return kBlockNameMap.at(id); } -const char* GetErrStateNameStr(rsmi_ras_err_state_t st) { return 
kErrStateNameMap.at(st); } +const char* GetBlockNameStr(amdsmi_gpu_block_t id) { return kBlockNameMap.at(id); } +const char* GetErrStateNameStr(amdsmi_ras_err_state_t st) { return kErrStateNameMap.at(st); } /*const char *GetGRPCChanStateStr(grpc_connectivity_state st) { return kGRPCChanState.at(st); }*/ -const char* FreqEnumToStr(rsmi_clk_type rsmi_clk) { - static_assert(RSMI_CLK_TYPE_LAST == RSMI_CLK_TYPE_MEM, "FreqEnumToStr() needs to be updated"); +const char* FreqEnumToStr(amdsmi_clk_type_t rsmi_clk) { + static_assert(CLK_TYPE__MAX == CLK_TYPE_DCLK1, "FreqEnumToStr() needs to be updated"); switch (rsmi_clk) { - case RSMI_CLK_TYPE_SYS: + case CLK_TYPE_SYS: return "System clock"; - case RSMI_CLK_TYPE_DF: + case CLK_TYPE_DF: return "Data Fabric clock"; - case RSMI_CLK_TYPE_DCEF: + case CLK_TYPE_DCEF: return "Display Controller Engine clock"; - case RSMI_CLK_TYPE_SOC: + case CLK_TYPE_SOC: return "SOC clock"; - case RSMI_CLK_TYPE_MEM: + case CLK_TYPE_MEM: return "Memory clock"; + case CLK_TYPE_PCIE: + return "PCIe clock"; + case CLK_TYPE_VCLK0: + return "VCLK0 clock"; + case CLK_TYPE_VCLK1: + return "VCLK1 clock"; + case CLK_TYPE_DCLK0: + return "DCLK0 clock"; + case CLK_TYPE_DCLK1: + return "DCLK1 clock"; default: return "Invalid Clock ID"; } diff --git a/projects/rdc/tests/rdc_tests/test_common.h b/projects/rdc/tests/rdc_tests/test_common.h index 065fdfc9ec..f902c76d8c 100644 --- a/projects/rdc/tests/rdc_tests/test_common.h +++ b/projects/rdc/tests/rdc_tests/test_common.h @@ -27,7 +27,7 @@ THE SOFTWARE. 
#include #include -#include "rocm_smi/rocm_smi.h" +#include "amd_smi/amdsmi.h" struct RDCTstGlobals { uint32_t verbosity; @@ -45,10 +45,10 @@ struct RDCTstGlobals { uint32_t ProcessCmdline(RDCTstGlobals* test, int arg_cnt, char** arg_list); void PrintTestHeader(uint32_t dv_ind); -const char* GetBlockNameStr(rsmi_gpu_block_t id); -const char* GetErrStateNameStr(rsmi_ras_err_state_t st); +const char* GetBlockNameStr(amdsmi_gpu_block_t id); +const char* GetErrStateNameStr(amdsmi_ras_err_state_t st); // const char *GetGRPCChanStateStr(grpc_connectivity_state st); -const char* FreqEnumToStr(rsmi_clk_type rsmi_clk); +const char* FreqEnumToStr(amdsmi_clk_type_t rsmi_clk); #if ENABLE_SMI void DumpMonitorInfo(const TestBase* test); diff --git a/projects/rdc/tests/rdc_tests/test_utils.cc b/projects/rdc/tests/rdc_tests/test_utils.cc index 0bf6ddd5cd..e0eee599b8 100644 --- a/projects/rdc/tests/rdc_tests/test_utils.cc +++ b/projects/rdc/tests/rdc_tests/test_utils.cc @@ -47,30 +47,88 @@ #include -#include "rocm_smi/rocm_smi.h" +#include "amd_smi/amdsmi.h" -static const std::map kDevFWNameMap = { - {RSMI_FW_BLOCK_ASD, "asd"}, - {RSMI_FW_BLOCK_CE, "ce"}, - {RSMI_FW_BLOCK_DMCU, "dmcu"}, - {RSMI_FW_BLOCK_MC, "mc"}, - {RSMI_FW_BLOCK_ME, "me"}, - {RSMI_FW_BLOCK_MEC, "mec"}, - {RSMI_FW_BLOCK_MEC2, "mec2"}, - {RSMI_FW_BLOCK_PFP, "pfp"}, - {RSMI_FW_BLOCK_RLC, "rlc"}, - {RSMI_FW_BLOCK_RLC_SRLC, "rlc_srlc"}, - {RSMI_FW_BLOCK_RLC_SRLG, "rlc_srlg"}, - {RSMI_FW_BLOCK_RLC_SRLS, "rlc_srls"}, - {RSMI_FW_BLOCK_SDMA, "sdma"}, - {RSMI_FW_BLOCK_SDMA2, "sdma2"}, - {RSMI_FW_BLOCK_SMC, "smc"}, - {RSMI_FW_BLOCK_SOS, "sos"}, - {RSMI_FW_BLOCK_TA_RAS, "ta_ras"}, - {RSMI_FW_BLOCK_TA_XGMI, "ta_xgmi"}, - {RSMI_FW_BLOCK_UVD, "uvd"}, - {RSMI_FW_BLOCK_VCE, "vce"}, - {RSMI_FW_BLOCK_VCN, "vcn"}, +static const std::map kDevFWNameMap = { + {FW_ID_SMU, "SMU"}, + {FW_ID_FIRST, "FIRST"}, + {FW_ID_CP_CE, "CP_CE"}, + {FW_ID_CP_PFP, "CP_PFP"}, + {FW_ID_CP_ME, "CP_ME"}, + {FW_ID_CP_MEC_JT1, "CP_MEC_JT1"}, + 
{FW_ID_CP_MEC_JT2, "CP_MEC_JT2"}, + {FW_ID_CP_MEC1, "CP_MEC1"}, + {FW_ID_CP_MEC2, "CP_MEC2"}, + {FW_ID_RLC, "RLC"}, + {FW_ID_SDMA0, "SDMA0"}, + {FW_ID_SDMA1, "SDMA1"}, + {FW_ID_SDMA2, "SDMA2"}, + {FW_ID_SDMA3, "SDMA3"}, + {FW_ID_SDMA4, "SDMA4"}, + {FW_ID_SDMA5, "SDMA5"}, + {FW_ID_SDMA6, "SDMA6"}, + {FW_ID_SDMA7, "SDMA7"}, + {FW_ID_VCN, "VCN"}, + {FW_ID_UVD, "UVD"}, + {FW_ID_VCE, "VCE"}, + {FW_ID_ISP, "ISP"}, + {FW_ID_DMCU_ERAM, "DMCU_ERAM"}, + {FW_ID_DMCU_ISR, "DMCU_ISR"}, + {FW_ID_RLC_RESTORE_LIST_GPM_MEM, "RLC_RESTORE_LIST_GPM_MEM"}, + {FW_ID_RLC_RESTORE_LIST_SRM_MEM, "RLC_RESTORE_LIST_SRM_MEM"}, + {FW_ID_RLC_RESTORE_LIST_CNTL, "RLC_RESTORE_LIST_CNTL"}, + {FW_ID_RLC_V, "RLC_V"}, + {FW_ID_MMSCH, "MMSCH"}, + {FW_ID_PSP_SYSDRV, "PSP_SYSDRV"}, + {FW_ID_PSP_SOSDRV, "PSP_SOSDRV"}, + {FW_ID_PSP_TOC, "PSP_TOC"}, + {FW_ID_PSP_KEYDB, "PSP_KEYDB"}, + {FW_ID_DFC, "DFC"}, + {FW_ID_PSP_SPL, "PSP_SPL"}, + {FW_ID_DRV_CAP, "DRV_CAP"}, + {FW_ID_MC, "MC"}, + {FW_ID_PSP_BL, "PSP_BL"}, + {FW_ID_CP_PM4, "CP_PM4"}, + {FW_ID_RLC_P, "RLC_P"}, + {FW_ID_SEC_POLICY_STAGE2, "SEC_POLICY_STAGE2"}, + {FW_ID_REG_ACCESS_WHITELIST, "REG_ACCESS_WHITELIST"}, + {FW_ID_IMU_DRAM, "IMU_DRAM"}, + {FW_ID_IMU_IRAM, "IMU_IRAM"}, + {FW_ID_SDMA_TH0, "SDMA_TH0"}, + {FW_ID_SDMA_TH1, "SDMA_TH1"}, + {FW_ID_CP_MES, "CP_MES"}, + {FW_ID_MES_KIQ, "MES_KIQ"}, + {FW_ID_MES_STACK, "MES_STACK"}, + {FW_ID_MES_THREAD1, "MES_THREAD1"}, + {FW_ID_MES_THREAD1_STACK, "MES_THREAD1_STACK"}, + {FW_ID_RLX6, "RLX6"}, + {FW_ID_RLX6_DRAM_BOOT, "RLX6_DRAM_BOOT"}, + {FW_ID_RS64_ME, "RS64_ME"}, + {FW_ID_RS64_ME_P0_DATA, "RS64_ME_P0_DATA"}, + {FW_ID_RS64_ME_P1_DATA, "RS64_ME_P1_DATA"}, + {FW_ID_RS64_PFP, "RS64_PFP"}, + {FW_ID_RS64_PFP_P0_DATA, "RS64_PFP_P0_DATA"}, + {FW_ID_RS64_PFP_P1_DATA, "RS64_PFP_P1_DATA"}, + {FW_ID_RS64_MEC, "RS64_MEC"}, + {FW_ID_RS64_MEC_P0_DATA, "RS64_MEC_P0_DATA"}, + {FW_ID_RS64_MEC_P1_DATA, "RS64_MEC_P1_DATA"}, + {FW_ID_RS64_MEC_P2_DATA, "RS64_MEC_P2_DATA"}, + {FW_ID_RS64_MEC_P3_DATA, "RS64_MEC_P3_DATA"}, + 
{FW_ID_PPTABLE, "PPTABLE"}, + {FW_ID_PSP_SOC, "PSP_SOC"}, + {FW_ID_PSP_DBG, "PSP_DBG"}, + {FW_ID_PSP_INTF, "PSP_INTF"}, + {FW_ID_RLX6_CORE1, "RLX6_CORE1"}, + {FW_ID_RLX6_DRAM_BOOT_CORE1, "RLX6_DRAM_BOOT_CORE1"}, + {FW_ID_RLCV_LX7, "RLCV_LX7"}, + {FW_ID_RLC_SAVE_RESTORE_LIST, "RLC_SAVE_RESTORE_LIST"}, + {FW_ID_ASD, "ASD"}, + {FW_ID_TA_RAS, "TA_RAS"}, + {FW_ID_TA_XGMI, "TA_XGMI"}, + {FW_ID_RLC_SRLG, "RLC_SRLG"}, + {FW_ID_RLC_SRLS, "RLC_SRLS"}, + {FW_ID_PM, "PM"}, + {FW_ID_DMCU, "DMCU"}, }; -const char* NameFromFWEnum(rsmi_fw_block_t blk) { return kDevFWNameMap.at(blk); } +const char* NameFromFWEnum(amdsmi_fw_block_t blk) { return kDevFWNameMap.at(blk); } diff --git a/projects/rdc/tests/rdc_tests/test_utils.h b/projects/rdc/tests/rdc_tests/test_utils.h index 6fc0ef2a0d..6c51cd8984 100644 --- a/projects/rdc/tests/rdc_tests/test_utils.h +++ b/projects/rdc/tests/rdc_tests/test_utils.h @@ -46,8 +46,8 @@ #ifndef TESTS_RDC_TESTS_TEST_UTILS_H_ #define TESTS_RDC_TESTS_TEST_UTILS_H_ -#include "rocm_smi/rocm_smi.h" +#include "amd_smi/amdsmi.h" -const char* NameFromFWEnum(rsmi_fw_block_t blk); +const char* NameFromFWEnum(amdsmi_fw_block_t blk); #endif // TESTS_RDC_TESTS_TEST_UTILS_H_