From 6594f8f58b88aa579643ee2c48c58a4af854c262 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Sat, 20 Jun 2020 17:00:06 -0500 Subject: [PATCH] Refactor rsmi to support oam Change-Id: Idc524e01ba06eb5c8d1682becaf5bf8ced5bffcf --- CMakeLists.txt | 197 ++---- cmake_modules/utils.cmake | 2 +- include/rocm_smi/rocm_smi_common.h | 70 ++ include/rocm_smi/rocm_smi_utils.h | 11 + oam/CMakeLists.txt | 108 +++ oam/example/oam_example.c | 30 + oam/include/oam/amd_oam.h | 43 ++ oam/include/oam/oam_mapi.h | 647 ++++++++++++++++++ oam/src/amd_oam.cc | 161 +++++ oam/src/oamConfig.in | 56 ++ rocm_smi/CMakeLists.txt | 143 ++++ {docs => rocm_smi/docs}/README.md | 0 {docs => rocm_smi/docs}/ROCm_SMI_Manual.pdf | Bin .../docs/amd_smi_doxygen.cfg | 0 .../example}/rocm_smi_example.cc | 0 src/rocm_smi.cc | 210 ++---- src/rocm_smi_utils.cc | 70 ++ {src => third_party}/shared_mutex/LICENSE | 0 .../shared_mutex/shared_mutex.cc | 0 .../shared_mutex/shared_mutex.h | 0 20 files changed, 1438 insertions(+), 310 deletions(-) create mode 100755 oam/CMakeLists.txt create mode 100755 oam/example/oam_example.c create mode 100755 oam/include/oam/amd_oam.h create mode 100755 oam/include/oam/oam_mapi.h create mode 100755 oam/src/amd_oam.cc create mode 100755 oam/src/oamConfig.in create mode 100755 rocm_smi/CMakeLists.txt rename {docs => rocm_smi/docs}/README.md (100%) rename {docs => rocm_smi/docs}/ROCm_SMI_Manual.pdf (100%) rename docs/rsmi_doxygen.cfg => rocm_smi/docs/amd_smi_doxygen.cfg (100%) rename {example => rocm_smi/example}/rocm_smi_example.cc (100%) rename {src => third_party}/shared_mutex/LICENSE (100%) rename {src => third_party}/shared_mutex/shared_mutex.cc (100%) rename {src => third_party}/shared_mutex/shared_mutex.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80177e2412..f1578eee14 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,8 @@ # cmake_minimum_required(VERSION 3.5.0) +set(AMD_SMI_LIBS_TARGET "amd_smi_libraries") + ## Set default module path if not 
already set if(NOT DEFINED CMAKE_MODULE_PATH) set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules/") @@ -15,6 +17,18 @@ set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}") set(ROCM_SMI_TARGET "${ROCM_SMI}64") set(ROCM_SMI_LIB_NAME "lib${ROCM_SMI_TARGET}") +# provide git to utilities +find_program (GIT NAMES git) + +## Setup the package version based on git tags. +set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver") +get_package_version_number("1.0.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +message("Package version: ${PKG_VERSION_STR}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_MAJOR "${VERSION_MAJOR}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_MINOR "${VERSION_MINOR}") +set(${AMD_SMI_LIBS_TARGET}_VERSION_PATCH "0") +set(${AMD_SMI_LIBS_TARGET}_VERSION_BUILD "0") + # The following default version values should be updated as appropriate for # ABI breaks (update MAJOR and MINOR), and ABI/API additions (update MINOR). # Until ABI stabilizes VERSION_MAJOR will be 0. This should be over-ridden @@ -24,54 +38,18 @@ set(PKG_VERSION_MINOR 0) set(PKG_VERSION_PATCH 0) set(PKG_VERSION_NUM_COMMIT 0) -################# Determine the library version ######################### -## Setup the package version based on git tags. 
-set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver") -set(SO_VERSION_GIT_TAG_PREFIX "rsmi_so_ver") - -# provide git to utilities -find_program (GIT NAMES git) - -get_package_version_number("1.0.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) -# VERSION_* variables should be set by get_version_from_tag -message("Package version: ${PKG_VERSION_STR}") - -# Debian package specific variables -# Set a default value for the package version -get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) - -# VERSION_* variables should be set by get_version_from_tag -if ( ${ROCM_PATCH_VERSION} ) - set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) - set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") -else() - set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") -endif () -set(${ROCM_SMI}_VERSION_MAJOR "${VERSION_MAJOR}") -set(${ROCM_SMI}_VERSION_MINOR "${VERSION_MINOR}") -set(${ROCM_SMI}_VERSION_PATCH "0") -set(${ROCM_SMI}_VERSION_BUILD "0") -message("SOVERSION: ${SO_VERSION_STRING}") - ## Define default variable and variables for the optional build target ## rocm_smi_lib-dev -set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR} - CACHE STRING "Location of rocm_smi source code.") if(NOT DEFINED CMAKE_INSTALL_PREFIX) set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default installation directory.") endif () +set(COMMON_SRC_ROOT ${CMAKE_CURRENT_SOURCE_DIR} + CACHE STRING "Location source code common root.") set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE STRING "Default packaging prefix.") set(CPACK_GENERATOR "DEB;RPM" CACHE STRING "Default packaging generators.") -project(${ROCM_SMI_TARGET}) - -# Create a configure file to get version info from within library -configure_file( - "${PROJECT_SOURCE_DIR}/src/${ROCM_SMI_TARGET}Config.in" - "${PROJECT_SOURCE_DIR}/include/rocm_smi/${ROCM_SMI_TARGET}Config.h") - if (NOT DEFINED CPACK_PACKAGE_VENDOR) set(CPACK_PACKAGE_VENDOR "AMD") endif() @@ -82,14 +60,19 @@ endif() if (NOT DEFINED CPACK_PACKAGE_DESCRIPTION_SUMMARY) 
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY - "ROCm System Management Interface library") + "AMD System Management libraries") endif() -if (NOT ROCM_SMI_PACKAGE) - set(ROCM_SMI_PACKAGE rocm_smi_lib64) +if (NOT AMD_SMI_PACKAGE) + set(AMD_SMI_PACKAGE rocm-smi-lib64) endif() -set(CPACK_PACKAGE_FILE_NAME "${ROCM_SMI_PACKAGE}-${PKG_VERSION_STR}") +set(CPACK_PACKAGE_FILE_NAME "${AMD_SMI_PACKAGE}-${PKG_VERSION_STR}") + +project(${AMD_SMI_LIBS_TARGET}) + +set(COMMON_PROJ_ROOT ${PROJECT_SOURCE_DIR}) + ## Verbose output. set(CMAKE_VERBOSE_MAKEFILE on) @@ -128,108 +111,47 @@ else () set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") endif () -set(SRC_DIR "src") -set(INC_DIR "include/rocm_smi") +set(COMMON_SRC_DIR "${PROJECT_SOURCE_DIR}/src") +set(COMMON_INC_DIR "${PROJECT_SOURCE_DIR}/include/rocm_smi") +set(SHR_MUTEX_DIR "${PROJECT_SOURCE_DIR}/third_party/shared_mutex") include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/src/shared_mutex) -set(SMI_SRC_LIST "${SRC_DIR}/rocm_smi_device.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_main.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_monitor.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_power_mon.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_utils.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_counters.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_kfd.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_io_link.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.cc") + ${CMAKE_CURRENT_SOURCE_DIR}/third_party/shared_mutex) -set(SMI_INC_LIST "${INC_DIR}/rocm_smi_device.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_main.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_monitor.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_power_mon.h") -set(SMI_INC_LIST ${SMI_INC_LIST} 
"${INC_DIR}/rocm_smi_utils.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_common.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_exception.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_counters.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_kfd.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_io_link.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.h") +set(CMN_SRC_LIST "${COMMON_SRC_DIR}/rocm_smi_device.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_main.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_monitor.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_power_mon.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_utils.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_counters.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_kfd.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_io_link.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.cc") -set(SMI_EXAMPLE_EXE "rocm_smi_ex") +set(CMN_INC_LIST "${COMMON_INC_DIR}/rocm_smi_device.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_main.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_monitor.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_power_mon.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_utils.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_common.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_exception.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_counters.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_kfd.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_io_link.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi.h") +set(CMN_INC_LIST 
${CMN_INC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.h") -add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc") -target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) -add_library(${ROCM_SMI_TARGET} SHARED ${SMI_SRC_LIST} ${SMI_INC_LIST}) -target_link_libraries(${ROCM_SMI_TARGET} pthread rt) - - -## Set the VERSION and SOVERSION values -set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY - SOVERSION "${VERSION_MAJOR}") -set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY - VERSION "${SO_VERSION_STRING}") - -## If the library is a release, strip the target library -if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) - add_custom_command( - TARGET ${ROCM_SMI_TARGET} - POST_BUILD COMMAND ${CMAKE_STRIP} lib${ROCM_SMI_TARGET}.so) -endif () - -## Add symlinks from top level ROCm lib dir to rocm-smi lib so files -add_custom_target ( so-link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E create_symlink - ../${ROCM_SMI}/lib/${ROCM_SMI_LIB_NAME}.so so-link ) -add_custom_target ( so-major-link ALL WORKING_DIRECTORY - ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} - -E create_symlink - ../${ROCM_SMI}/lib/${ROCM_SMI_LIB_NAME}.so.${VERSION_MAJOR} - so-major-link ) - -install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/so-link DESTINATION lib RENAME - ${ROCM_SMI_LIB_NAME}.so ) -install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/so-major-link DESTINATION lib - RENAME ${ROCM_SMI_LIB_NAME}.so.${VERSION_MAJOR} ) - -## Add the install directives for the runtime library. 
-install(TARGETS ${ROCM_SMI_TARGET} - LIBRARY DESTINATION ${ROCM_SMI}/lib COMPONENT ${ROCM_SMI_COMPONENT}) -install(FILES ${SOURCE_DIR}/include/rocm_smi/rocm_smi.h - DESTINATION rocm_smi/include/rocm_smi) -install(FILES ${SOURCE_DIR}/include/rocm_smi/kfd_ioctl.h - DESTINATION rocm_smi/include/rocm_smi) +add_subdirectory("rocm_smi") +add_subdirectory("oam") # Generate Doxygen documentation find_package(Doxygen) find_package(LATEX COMPONENTS PDFLATEX) -if (DOXYGEN_FOUND AND LATEX_FOUND) - set (RSMI_MANUAL_NAME "ROCm_SMI_Manual") - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/rsmi_doxygen.cfg - ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY) - - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.tex - COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/docs/rsmi_doxygen.cfg - "${INC_DIR}/rocm_smi.h" - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf - COMMAND make > /dev/null - COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf - ${CMAKE_CURRENT_SOURCE_DIR}/docs/${RSMI_MANUAL_NAME}_new.pdf - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.tex - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/latex) - - add_custom_target(docs DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf) - - add_dependencies(${ROCM_SMI_TARGET} docs) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf - DESTINATION ${ROCM_SMI}/docs/${RSMI_MANUAL_NAME}.pdf) - install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/docs/README.md - DESTINATION ${ROCM_SMI}/docs/) -else() - message("Doxygen or Latex is not found. Will not generate documents.") -endif(DOXYGEN_FOUND AND LATEX_FOUND) +# install(TARGETS ${ROCM_SMI_TARGET} +# LIBRARY DESTINATION ${ROCM_SMI}/lib COMPONENT ${ROCM_SMI_COMPONENT}) +# install(FILES ${COMMON_SRC_ROOT}/include/rocm_smi/rocm_smi.h +# DESTINATION rocm_smi/include/rocm_smi) ## Add the packaging directives for the runtime library. 
@@ -237,7 +159,7 @@ endif(DOXYGEN_FOUND AND LATEX_FOUND) set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst; ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm") -set (CPACK_DEBIAN_PACKAGE_NAME ${ROCM_SMI_PACKAGE}) +set (CPACK_DEBIAN_PACKAGE_NAME ${AMD_SMI_PACKAGE}) set (CPACK_DEBIAN_PACKAGE_VERSION ${PKG_VERSION_STR}) # RPM package specific variables @@ -245,9 +167,8 @@ set(CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post") set(CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun") -set (CPACK_RPM_PACKAGE_NAME ${ROCM_SMI_PACKAGE}) +set (CPACK_RPM_PACKAGE_NAME ${AMD_SMI_PACKAGE}) set (CPACK_RPM_PACKAGE_VERSION ${PKG_VERSION_STR}) include (CPack) - diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake index 9c9fea5ea4..3d0496aca9 100755 --- a/cmake_modules/utils.cmake +++ b/cmake_modules/utils.cmake @@ -103,7 +103,7 @@ endfunction() function(num_change_since_prev_pkg VERSION_PREFIX) find_program(get_commits NAMES version_util.sh - PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules) + PATHS ${COMMON_PROJ_ROOT}/cmake_modules) if (get_commits) execute_process( COMMAND ${get_commits} -c ${VERSION_PREFIX} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/include/rocm_smi/rocm_smi_common.h b/include/rocm_smi/rocm_smi_common.h index ee81b91c9c..d95040b74d 100755 --- a/include/rocm_smi/rocm_smi_common.h +++ b/include/rocm_smi/rocm_smi_common.h @@ -50,6 +50,76 @@ #include #include +#define CHECK_DV_IND_RANGE \ + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \ + if (dv_ind >= smi.monitor_devices().size()) { \ + return RSMI_STATUS_INVALID_ARGS; \ + } \ + +#define GET_DEV_FROM_INDX \ + CHECK_DV_IND_RANGE \ + std::shared_ptr dev = smi.monitor_devices()[dv_ind]; \ + assert(dev != nullptr); + + +#define GET_DEV_AND_KFDNODE_FROM_INDX \ + GET_DEV_FROM_INDX \ + std::shared_ptr kfd_node; \ + if (smi.kfd_node_map().find(dev->kfd_gpu_id()) == \ + smi.kfd_node_map().end()) { \ + 
return RSMI_INITIALIZATION_ERROR; \ + } \ + kfd_node = smi.kfd_node_map()[dev->kfd_gpu_id()]; + +#define REQUIRE_ROOT_ACCESS \ + if (amd::smi::RocmSMI::getInstance().euid()) { \ + return RSMI_STATUS_PERMISSION; \ + } + +#define DEVICE_MUTEX \ + amd::smi::pthread_wrap _pw(*amd::smi::GetMutex(dv_ind)); \ + amd::smi::RocmSMI& smi_ = amd::smi::RocmSMI::getInstance(); \ + bool blocking_ = !(smi_.init_options() & RSMI_INIT_FLAG_RESRV_TEST1); \ + amd::smi::ScopedPthread _lock(_pw, blocking_); \ + if (!blocking_ && _lock.mutex_not_acquired()) { \ + return RSMI_STATUS_BUSY; \ + } + +/* This group of macros is used to facilitate checking of support for rsmi_dev* + * "getter" functions. When the return buffer is set to nullptr, the macro will + * check the previously gathered device support data to see if the function, + * with possible variants (e.g., memory types, firmware types,...) and + * subvariants (e.g. monitors/sensors) are supported. + */ +// This macro assumes dev already available +#define CHK_API_SUPPORT_ONLY(RT_PTR, VR, SUB_VR) \ + if ((RT_PTR) == nullptr) { \ + try { \ + if (!dev->DeviceAPISupported(__FUNCTION__, (VR), (SUB_VR))) { \ + return RSMI_STATUS_NOT_SUPPORTED; \ + } \ + return RSMI_STATUS_INVALID_ARGS; \ + } catch (const amd::smi::rsmi_exception& e) { \ + debug_print( \ + "Exception caught when checking if API is supported %s.\n", \ + e.what()); \ + return RSMI_STATUS_INVALID_ARGS; \ + } \ + } + +#define CHK_SUPPORT(RT_PTR, VR, SUB_VR) \ + GET_DEV_FROM_INDX \ + CHK_API_SUPPORT_ONLY((RT_PTR), (VR), (SUB_VR)) + +#define CHK_SUPPORT_NAME_ONLY(RT_PTR) \ + CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) \ + +#define CHK_SUPPORT_VAR(RT_PTR, VR) \ + CHK_SUPPORT((RT_PTR), (VR), RSMI_DEFAULT_VARIANT) \ + +#define CHK_SUPPORT_SUBVAR_ONLY(RT_PTR, SUB_VR) \ + CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, (SUB_VR)) \ + #define DBG_FILE_ERROR(FN, WR_STR) \ if (env_ && env_->debug_output_bitfield & RSMI_DEBUG_SYSFS_FILE_PATHS) { \ std::cout << "*****"
<< __FUNCTION__ << std::endl; \ diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index 4c042fd164..89c3ac4b7d 100755 --- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -47,6 +47,9 @@ #include #include +#include + +#include "rocm_smi/rocm_smi_device.h" #ifdef NDEBUG #define debug_print(fmt, ...) \ @@ -62,6 +65,8 @@ namespace amd { namespace smi { +pthread_mutex_t *GetMutex(uint32_t dv_ind); + int SameFile(const std::string fileA, const std::string fileB); bool FileExists(char const *filename); int isRegularFile(std::string fname, bool *is_reg); @@ -71,6 +76,12 @@ int WriteSysfsStr(std::string path, std::string val); bool IsInteger(const std::string & n_str); +rsmi_status_t handleException(); +rsmi_status_t +GetDevValueVec(amd::smi::DevInfoTypes type, + uint32_t dv_ind, std::vector *val_vec); +rsmi_status_t ErrnoToRsmiStatus(uint32_t err); + struct pthread_wrap { public: explicit pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {} diff --git a/oam/CMakeLists.txt b/oam/CMakeLists.txt new file mode 100755 index 0000000000..96b4e6fb16 --- /dev/null +++ b/oam/CMakeLists.txt @@ -0,0 +1,108 @@ +# +# Minimum version of cmake required +# + +message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") +message(" CMake OAM (Library) ") +message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") + +## Verbose output. 
+set(CMAKE_VERBOSE_MAKEFILE on) + +# Required Defines first: + +message("") +message("Build Configuration:") +# message("-----------BuildType: " ${CMAKE_BUILD_TYPE}) +# message("------------Compiler: " ${CMAKE_CXX_COMPILER}) +# message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION}) +message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR}) +# message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR}) +# message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) +# message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) +# message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR}) +# message("--------RSMI Inc Dir: " ${OAM_INC_DIR}) +# message("") + +set(OAM_ROOT "${PROJECT_SOURCE_DIR}/oam") +set(OAM_NAME "oam") +set(OAM_COMPONENT "lib${OAM_NAME}") +set(OAM_TARGET "${OAM_NAME}") + +################# Determine the library version ######################### +set(SO_VERSION_GIT_TAG_PREFIX "oam_so_ver") + +# VERSION_* variables should be set by get_version_from_tag +message("Package version: ${PKG_VERSION_STR}") + +# Debian package specific variables +# Set a default value for the package version +get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT) + +# VERSION_* variables should be set by get_version_from_tag +if ( ${ROCM_PATCH_VERSION} ) + set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) + set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") +else() + set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") +endif () +set(${OAM_NAME}_VERSION_MAJOR "${VERSION_MAJOR}") +set(${OAM_NAME}_VERSION_MINOR "${VERSION_MINOR}") +set(${OAM_NAME}_VERSION_PATCH "0") +set(${OAM_NAME}_VERSION_BUILD "0") +message("SOVERSION: ${SO_VERSION_STRING}") + + +# Create a configure file to get version info from within library +configure_file( + "${OAM_ROOT}/src/${OAM_TARGET}Config.in" + "${OAM_ROOT}/include/oam/${OAM_TARGET}Config.h") + +set(OAM_SRC_DIR "src") +set(OAM_INC_DIR "include") +set(OAM_DOCS_DIR "docs") + +set(OAM_SRC_LIST ${CMN_SRC_LIST} 
"${OAM_SRC_DIR}/amd_oam.cc") + +set(OAM_INC_LIST ${COMMON_INC_DIR} "${OAM_INC_DIR}") +set(OAM_EXAMPLE_EXE "oam_ex") + +add_executable(${OAM_EXAMPLE_EXE} "example/oam_example.c") +target_include_directories(${OAM_EXAMPLE_EXE} PRIVATE ${OAM_INC_LIST}) +target_link_libraries(${OAM_EXAMPLE_EXE} ${OAM_TARGET}) +add_library(${OAM_TARGET} SHARED ${CMN_SRC_LIST} ${OAM_SRC_LIST} + ${CMN_INC_LIST} ${OAM_INC_LIST}) +target_link_libraries(${OAM_TARGET} pthread rt) +target_include_directories(${OAM_TARGET} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/include ${COMMON_PROJ_ROOT}/common/shared_mutex) + +## Set the VERSION and SOVERSION values +set_property(TARGET ${OAM_TARGET} PROPERTY + SOVERSION "${VERSION_MAJOR}") +set_property(TARGET ${OAM_TARGET} PROPERTY + VERSION "${SO_VERSION_STRING}") + +## If the library is a release, strip the target library +if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) + add_custom_command( + TARGET ${OAM_TARGET} + POST_BUILD COMMAND ${CMAKE_STRIP} lib${OAM_TARGET}.so) +endif () + +## Add the install directives for the runtime library. 
+install(TARGETS ${OAM_TARGET} + LIBRARY DESTINATION ${OAM_NAME}/lib COMPONENT ${OAM_COMPONENT}) +install(FILES ${COMMON_SRC_ROOT}/oam/include/oam/oam_mapi.h + ${COMMON_SRC_ROOT}/oam/include/oam/amd_oam.h + DESTINATION oam/include/oam) + +# Generate Doxygen documentation +if (DOXYGEN_FOUND) + configure_file(${OAM_DOCS_DIR}/docs/rsmi_doxygen.cfg + ${OAM_DOCS_DIR}/Doxyfile @ONLY) + add_custom_target(doc + ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Generating AMD OAM API documentation with Doxygen" VERBATIM) +endif(DOXYGEN_FOUND) + diff --git a/oam/example/oam_example.c b/oam/example/oam_example.c new file mode 100755 index 0000000000..c683819c10 --- /dev/null +++ b/oam/example/oam_example.c @@ -0,0 +1,30 @@ +#include <stdio.h> +#include "oam/oam_mapi.h" +#include "oam/amd_oam.h" + +const oam_ops_t amd_oam_ops = { + .init = amdoam_init, + .free = amdoam_free, +// .get_mapi_version = amdoam_get_mapi_version, + .discover_devices = amdoam_discover_devices, +}; + +int main() +{ + int dev_cnt = 0; /* int: matches amdoam_discover_devices(int *) and %d */ + oam_mapi_version_t version = {0, 0}; /* zero-init: passed by value below */ + + if (amd_oam_ops.init(version)) { + printf("init failed\n"); + return -1; + } + +// amd_oam_ops.get_mapi_version(&version); + if (!amd_oam_ops.discover_devices(&dev_cnt)) + printf("%d AMD devices are discovered\n", dev_cnt); + + amd_oam_ops.free(); + + return 0; +} + diff --git a/oam/include/oam/amd_oam.h b/oam/include/oam/amd_oam.h new file mode 100755 index 0000000000..2d4083cafb --- /dev/null +++ b/oam/include/oam/amd_oam.h @@ -0,0 +1,43 @@ +/* + * MIT License + * + * Copyright (c) 2020 Open Compute Project + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons
to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef OAM_INCLUDE_OAM_AMD_OAM_H_ +#define OAM_INCLUDE_OAM_AMD_OAM_H_ + +#ifdef __cplusplus +extern "C" { +#include +#else +#include +#endif // __cplusplus + +int amdoam_init(oam_mapi_version_t version); +int amdoam_free(void); +// int amdoam_get_mapi_version(oam_mapi_version_t *version); +int amdoam_discover_devices(int *device_count); + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // OAM_INCLUDE_OAM_AMD_OAM_H_ diff --git a/oam/include/oam/oam_mapi.h b/oam/include/oam/oam_mapi.h new file mode 100755 index 0000000000..b8fae4b96f --- /dev/null +++ b/oam/include/oam/oam_mapi.h @@ -0,0 +1,647 @@ +/* + * MIT License + * + * Copyright (c) 2020 Open Compute Project + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef OAM_INCLUDE_OAM_OAM_MAPI_H_ +#define OAM_INCLUDE_OAM_OAM_MAPI_H_ + +/** + * \file oam_mapi.h + * \brief OAM management and monitoring library API definitions + */ + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include +#include + +/** + * \struct oam_mapi_version_t + * \brief OAM library API version + * \details TBD + * All library versions are expected to be backward compatible. + * The major version increment indicates a new API has been added. + * Minor version increment indicates an interface change. + */ +typedef struct oam_mapi_version { + uint32_t major; + uint32_t minor; +} oam_mapi_version_t; + +/** + * \struct oam_dev_id_t + * \brief Local identifier for the device + * \details Immutable device identifier + * This is unique within the chassis. + */ +typedef struct oam_dev_id { + /*!< local identifier for the device */ + int device_id; +} oam_dev_id_t; + +/** + * \struct oam_net_dev_id_t + * \brief Network identifier for the device + * \details Immutable network identifier for the device. + * This is unique across the entire network.
+ */ +typedef struct oam_net_dev_id { + /*!< unique network identifier for the device */ + int network_id; +} oam_net_dev_id_t; + +/* + * various lengths for device properties + */ +#define DEVICE_VENDOR_LEN 128 +#define DEVICE_NAME_LEN 128 +#define DEVICE_SKU_LEN 128 +#define BOARD_NAME_LEN 128 +#define BOARD_REVISION_LEN 128 +#define BOARD_SERIAL_NUM_LEN 128 + +/** + * \struct oam_dev_properties_t + * \brief Device properties + * \details Identification strings for the device and its board + */ +typedef struct oam_dev_properties { + /*!< local identifier for the device */ + oam_dev_id_t device_id; + /*!< vendor name */ + char device_vendor[DEVICE_VENDOR_LEN]; + /*!< Device name */ + char device_name[DEVICE_NAME_LEN]; + /*!< SKU name */ + char sku_name[DEVICE_SKU_LEN]; + /*!< Board name */ + char board_name[BOARD_NAME_LEN]; + /*!< Board revision */ + char board_revision[BOARD_REVISION_LEN]; + /*!< + * Board serial number, UUID, or any other identifier which can be used + * to identify devices uniquely and physically. + */ + char board_serial_number[BOARD_SERIAL_NUM_LEN]; +} oam_dev_properties_t; + +/** + * \struct oam_sensor_count_t + * \brief Sensor counts + * \details Number of sensors of each type + * (temperature, power, voltage, current, fan). + */ +typedef struct oam_sensor_count { + uint32_t num_temperature_sensors; + uint32_t num_power_sensors; + uint32_t num_voltage_sensors; + uint32_t num_current_sensors; + uint32_t num_fans; +} oam_sensor_count_t; + +/** + * \enum oam_sensor_type_t + * \brief Sensor types + * \details This enumerated type defines available sensor types.
+ */ +typedef enum oam_sensor_type { + OAM_SENSOR_TYPE_POWER = 0, + OAM_SENSOR_TYPE_VOLTAGE, + OAM_SENSOR_TYPE_CURRENT, + OAM_SENSOR_TYPE_TEMP, + OAM_SENSOR_TYPE_FAN_SPEED, + OAM_SENSOR_TYPE_UNKNOWN = 0xFF +} oam_sensor_type_t; + +/** + * \enum oam_power_sensor_scale_t + * \brief scale for power measurements + * \details This enumerated type defines available scales for power measurements + */ +typedef enum oam_power_sensor_scale { + OAM_POWER_SCALE_uW = 0, + OAM_POWER_SCALE_mW, + OAM_POWER_SCALE_W, +} oam_power_sensor_scale_t; + +/** + * \enum oam_voltage_sensor_scale_t + * \brief scale for voltage measurements + * \details This enumerated type defines available scales for voltage measurements + */ +typedef enum oam_voltage_sensor_scale { + OAM_VOLTAGE_SCALE_uV = 0, + OAM_VOLTAGE_SCALE_mV, + OAM_VOLTAGE_SCALE_V, +} oam_voltage_sensor_scale_t; + +/** + * \enum oam_current_sensor_scale_t + * \brief scale for current measurements + * \details This enumerated type defines available scales for current measurements + */ +typedef enum oam_current_sensor_scale { + OAM_CURRENT_SCALE_uA = 0, + OAM_CURRENT_SCALE_mA, + OAM_CURRENT_SCALE_A, +} oam_current_sensor_scale_t; + +/** + * \enum oam_temp_sensor_scale_t + * \brief scale for temp measurements + * \details This enumerated type defines available scales for temp measurements + */ +typedef enum oam_temp_sensor_scale { + OAM_TEMP_SCALE_C = 0, + OAM_TEMP_SCALE_F +} oam_temp_sensor_scale_t; + +/** + * \enum oam_fan_sensor_scale_t + * \brief scale for fan speed measurements + * \details This enumerated type defines available scales for fan speed measurements + */ +typedef enum oam_fan_sensor_scale { + OAM_FAN_SPEED_Hz = 0, + OAM_FAN_SPEED_KHz, + OAM_FAN_SPEED_MHz +} oam_fan_sensor_scale_t; + +typedef union oam_sensor_scale { + oam_power_sensor_scale_t power_scale; + oam_voltage_sensor_scale_t volate_scale; + oam_current_sensor_scale_t current_scale; + oam_temp_sensor_scale_t temp_scale; + oam_fan_sensor_scale_t fan_scale; +} 
oam_sensor_scale_t; + +/** + * \struct oam_dev_handle_t + * \brief Device handle + * \details Device handle obtained using open call + * The same handle is used by all the APIs which are used to perform + * specific operation on that device. + */ +typedef struct oam_dev_handle { + void *handle; +} oam_dev_handle_t; + +/** + * \enum oam_dev_mode_t + * \brief Device open modes + * \details This enumerated type defines modes in which the device can be opened. + * For some operations e.g. health check user should open the device + * in exclusive mode, so that if there are many applications using the same + * device there are no side effects. + */ +typedef enum oam_dev_mode { + OAM_DEV_MODE_EXCLUSIVE = 0, + OAM_DEV_MODE_NONEXLUSIVE = 1, + OAM_DEV_MODE_UNKNOWN = 0xFF +} oam_dev_mode_t; + +/** + * \def OAM_SENSOR_NAME_MAX + * \brief length of sensor name + */ +#define OAM_SENSOR_NAME_MAX 256 + +/** + * \struct oam_sensor_info_t + * \brief Sensor information + * \details Describes a single sensor: + * its name, its type, the scale of its measurement, + * and the measured value. + */ +typedef struct oam_sensor_info { + char sensor_name[OAM_SENSOR_NAME_MAX]; + oam_sensor_type_t sensor_type; + oam_sensor_scale_t scale; + int32_t value; +} oam_sensor_info_t; + +/** + * \struct oam_dev_error_count_t + * \brief Device error information + * \details Various types of errors reported by device.
+ */ +typedef struct oam_dev_error_count { + uint32_t total_error_count; + uint32_t fatal_error_count; + uint32_t unknown_error_count; + uint32_t ecc_error_count; +} oam_dev_error_count_t; + +/** + * \struct oam_firmware_version_t + * \brief Firmware version information + * \details Structure to store various firmware versions of OAM module + */ +typedef struct oam_firmware_version { + oam_mapi_version_t device_boot_fw_version; + oam_mapi_version_t device_fw_version; + oam_mapi_version_t board_boot_fw_version; + oam_mapi_version_t board_fw_version; +} oam_firmware_version_t; + +/** + * \struct oam_pci_info_t + * \brief PCI information for the device + * \details Structure to store PCI (Domain, BDF) information of the device + */ +typedef struct oam_pci_info { + uint16_t domain; + uint8_t bus; + uint8_t device; + uint8_t function; +} oam_pci_info_t; + +/** + * \enum oam_net_port_state_t + * \brief Network port state + * \details This enumerated type defines various states of the network port + */ +typedef enum oam_net_port_state { + OAM_NET_PORT_DISABLED = 0, + OAM_NET_PORT_ENABLED = 1 +} oam_net_port_state_t; + +/** + * \enum oam_net_port_status_t + * \brief Network port status + * \details This enumerated type defines the possible statuses of the network port + */ +typedef enum oam_net_port_status { + OAM_NET_PORT_UP = 0, + OAM_NET_PORT_DOWN = 1, +} oam_net_port_status_t; + +/** + * \enum oam_net_port_id_t + * \brief Network port identifiers + * \details This enumerated type defines various identifiers for network ports + */ +typedef enum oam_net_port_id { + OAM_NET_PORT0 = 0, + OAM_NET_PORT1 = 1, + OAM_NET_PORT2 = 2, + OAM_NET_PORT_MAX = 0xFFFF +} oam_net_port_id_t; + +/** + * \enum oam_firmware_modes_t + * \brief Supported mode to update firmware on device + * \details This enumerated type defines various modes which are supported by + * the device to update firmware.
+ */ +typedef enum oam_firmware_modes { + OAM_DOWNLOAD_ONLY = 0, + OAM_DOWNLOAD_ACTIVATE = 1 +} oam_firmware_modes_t; + +/** + * \def OAM_NET_PORT_NAME + * \brief length of network port name + */ +#define OAM_NET_PORT_NAME 256 + +/** + * \struct oam_net_port_desc + * \brief Network port description + * \details Structure to store additional details about the network port + */ +typedef struct oam_net_port_desc { + char name[OAM_NET_PORT_NAME]; +} oam_net_port_desc_t; + +/** + * \def OAM_DEV_HOST_NAME + * \brief length of host name + */ +#define OAM_DEV_HOST_NAME 256 + +/** + * \struct oam_net_dev_info_t + * \brief Information about the device on a network + * \details Structure to store additional details about the network device + * on a particular network. + */ +typedef struct oam_net_dev_info { + oam_net_dev_id_t net_dev_id; + char host_name[OAM_DEV_HOST_NAME]; + oam_pci_info_t pci_info; +} oam_net_dev_info_t; + +/** + * \struct oam_neighbour_info_t + * \brief Information about device neighburs + * \details Structure to store information about device neighbours on the + * network + */ +typedef struct oam_neighbour_info { + oam_net_port_id_t device_port; + oam_net_dev_info_t device_info; +} oam_neighbour_info_t; + +/** + * \enum oam_dev_tpc_id_t + * \brief TPC identifiers + * \details This enumerated type defines various identifiers for TPCs + */ +typedef enum oam_dev_tpc_id { + OAM_DEV_TPC0, + OAM_DEV_TPC1, + OAM_DEV_TPC2, + OAM_DEV_TPC_MAX +} oam_dev_tpc_id_t; + +/** + * \def OAM_TPC_NAME + * \brief length of TPC name + */ +#define OAM_TPC_NAME 256 + +/** + * \struct oam_tpc_desc_t + * \brief TPC description + * \details Structure to store information about TPC e.g. name corresponding + * to the id etc. + */ +typedef struct oam_tpc_desc { + char name[256]; +} oam_tpc_desc_t; + +/** + * \struct oam_dev_tpc_stats_t + * \brief TPC statistical information + * \details Structure to store information about TPC statistical information + * e.g. 
TPC utilization + */ +typedef struct oam_dev_tpc_stats { + double util; +} oam_dev_tpc_stats_t; + +/** + * \enum oam_dev_mem_id_t + * \brief Device memory identifiers + * \details This enumerated type defines various identifiers for device memories + */ +typedef enum oam_dev_mem_id { + OAM_DEV_MEM0, + OAM_DEV_MEM1, + OAM_DEV_MEM2, + OAM_DEV_MEM_MAX +} oam_dev_mem_id_t; + +/** + * \struct oam_mem_desc_t + * \brief Device memory description + * \details Structure to store additional details about device memories port + */ +typedef struct oam_mem_desc { + char name[256]; +} oam_mem_desc_t; + +/** + * \struct oam_dev_mem_stats_t + * \brief Device memory statistical information + * \details Structure to store various statastical information about device + * memory. + */ +typedef struct oam_dev_mem_stats { + uint32_t total_mem; + uint32_t allocated_mem; + uint32_t free_mem; +} oam_dev_mem_stats_t; + +/** + * \struct oam_net_port_pkt_stats_t + * \brief Device network port statistical information + * \details Structure to store various statastical information about the network + * packets on a given port. + */ +typedef struct oam_net_port_pkt_stats { + uint64_t rx_count; + uint64_t tx_count; + uint64_t rx_errors; + uint64_t tx_errors; +} oam_net_port_pkt_stats_t; + +/** + * \struct oam_ops_t + * \brief OAM Device operations + * \details Structure provides list of APIs which needs to be + * supported by the OAM library. 
+ */
+typedef struct oam_ops {
+  /**
+   * To initialise the library instance and perform the version compatibility
+   * check
+   */
+  int (*init)(oam_mapi_version_t version);
+  int (*free)(void);
+
+  /**
+   * To get error description from the error code
+   */
+  int (*get_error_description)(int error_code, const char **error_description);
+
+  /**
+   * To retrieve the OAM Management interface version
+   */
+  int (*get_mapi_version)(oam_mapi_version_t *version);
+
+  /**
+   * To retrieve the number of devices present/discovered by the library
+   */
+  int (*discover_devices)(int *device_count);
+
+  /**
+   * To retrieve device properties for each discovered device
+   */
+  int (*get_dev_properties)(oam_dev_properties_t *devices);
+
+  /**
+   * To retrieve PCI properties of the device
+   */
+  int (*get_pci_properties)(oam_dev_id_t *device_id, oam_pci_info_t *pci_info);
+
+  /**
+   * To query the number of various sensors present
+   */
+  int (*get_sensors_count)(oam_dev_id_t *device_id,
+                           oam_sensor_count_t *sensor_count);
+
+  /**
+   * Open the device and obtain handle
+   */
+  int (*open_device)(oam_dev_id_t *dev_id, oam_dev_mode_t mode,
+                     oam_dev_handle_t *handle);
+  int (*close_device)(oam_dev_handle_t *handle);
+
+
+  /**
+   * To read various sensor values for a given sensor type
+   */
+  int (*get_sensors_info)(oam_dev_handle_t *handle,
+                          oam_sensor_type_t type,
+                          uint32_t num_sensors,
+                          oam_sensor_info_t sensor_info[]);
+  /**
+   * To read current error count of the device
+   */
+  int (*get_device_error_count)(oam_dev_handle_t *handle,
+                                oam_dev_error_count_t *count);
+
+  /**
+   * To update firmware on the device
+   * fw_image contains a null terminated string which specifies complete
+   * path where the firmware image is located
+   */
+  int (*download_firmware)(oam_dev_id_t *device_id, char *fw_image,
+                           oam_firmware_modes_t mode);
+
+  /**
+   * To query firmware versions
+   */
+  int (*get_firmware_version)(oam_dev_id_t *device_id,
+                              oam_firmware_version_t *version);
+
+
+  /**
+   * To get network id from device id
+   */
+  int (*get_net_dev_id)(oam_dev_id_t *device_id, oam_net_dev_id_t *net_device);
+
+  /*
+   * Network management APIs.
+   */
+
+  /**
+   * To discover the network.
+   */
+  int (*discover_network)(int *net_dev_count);
+  int (*get_dev_net_properties)(oam_net_dev_info_t *net_dev_info);
+
+  int (*get_neighbour_count)(oam_dev_id_t *device,
+                             oam_net_port_id_t local_port_id,
+                             uint32_t *neighbor_count);
+
+  int (*get_neighbours_info)(oam_dev_id_t *device,
+                             oam_net_port_id_t local_port_id,
+                             uint32_t *neighbors_count,
+                             oam_neighbour_info_t *neighbours_info);
+
+  int (*configure_network)(oam_net_dev_id_t *net_devices,
+                           uint32_t *net_device_count,
+                           char *network_name);
+
+  int (*destroy_network)(char *network_name);
+
+  int (*query_network)(char *network_name, oam_net_dev_info_t *devices,
+                       uint32_t *device_count);
+
+  int (*get_network_count)(uint32_t *network_count);
+  int (*list_networks)(char *network_names[]);
+
+  /*
+   * Various statistics related to blocks
+   */
+
+  /**
+   * To query number of ports
+   */
+  int (*get_net_port_count)(oam_dev_handle_t *handle, uint32_t *count,
+                            oam_net_port_id_t *port_ids);
+
+  int (*get_net_port_desc)(oam_dev_handle_t *handle, oam_net_port_id_t *port,
+                           oam_net_port_desc_t *desc);
+
+  int (*get_net_port_state)(oam_dev_handle_t *handle, oam_net_port_id_t *port,
+                            oam_net_port_state_t *state);
+
+  int (*check_net_port_status)(oam_dev_handle_t *handle,
+                               oam_net_port_id_t *port,
+                               oam_net_port_status_t *status);
+  int (*get_net_port_pkt_stats)(oam_dev_handle_t *handle,
+                                oam_net_port_id_t *port,
+                                uint32_t duration_sec,
+                                oam_net_port_pkt_stats_t *stats);
+
+  int (*query_net_port_bandwidth)(oam_dev_handle_t *handle,
+                                  oam_net_port_id_t *port,
+                                  uint32_t duration_sec,
+                                  double *bandwidth);
+
+  int (*get_tpc_count)(oam_dev_handle_t *handle, uint32_t *count,
+                       oam_dev_tpc_id_t *tpc_ids);
+
+  int (*get_tpc_desc)(oam_dev_handle_t *handle, oam_dev_tpc_id_t *tpc_id,
+                      oam_tpc_desc_t *desc);
+
+  int
(*get_tpc_stats)(oam_dev_handle_t *handle,
+                       oam_dev_tpc_id_t *tpc_id,
+                       oam_dev_tpc_stats_t *stats,
+                       uint32_t duration_sec);
+
+  int (*get_mem_count)(oam_dev_handle_t *handle, uint32_t *count,
+                       oam_dev_mem_id_t *mem_ids);
+
+  int (*get_mem_desc)(oam_dev_handle_t *handle, oam_dev_mem_id_t *mem_id,
+                      oam_mem_desc_t *desc);
+
+  int (*get_mem_stats)(oam_dev_handle_t *handle, oam_dev_mem_id_t *mem_id,
+                       oam_dev_mem_stats_t *stats);
+
+  /**
+   * To check the health of the individual components, the library
+   * generates a test workload to check if the block is functioning properly
+   * or not. So no other workload should be running while calling these
+   * APIs
+   */
+  int (*check_tpc_health)(oam_dev_id_t *device_id, oam_dev_tpc_id_t *tpc_id);
+  int (*check_net_port_health)(oam_dev_id_t *device_id,
+                               oam_net_port_id_t *port);
+  int (*check_mem_health)(oam_dev_id_t *device_id, oam_dev_mem_id_t *mem_id);
+
+  /*
+   * Following needs more attention, will work on in next
+  int (*get_fan_speed)(oam_dev_t *oam);
+  int (*set_fan_speed)(oam_dev_t *oam, int speed);
+
+  int (*get_power_cap)(oam_dev_t *oam);
+  int (*set_power_cap)(oam_dev_t *oam, int power);
+
+  int (*get_telemetry)(oam_dev_t *oam);
+  */
+} oam_ops_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // OAM_INCLUDE_OAM_OAM_MAPI_H_
diff --git a/oam/src/amd_oam.cc b/oam/src/amd_oam.cc
new file mode 100755
index 0000000000..e3978134ff
--- /dev/null
+++ b/oam/src/amd_oam.cc
@@ -0,0 +1,191 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2020 Open Compute Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <assert.h>
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "rocm_smi/rocm_smi_common.h"
+#include "rocm_smi/rocm_smi_main.h"
+#include "rocm_smi/rocm_smi_device.h"
+#include "rocm_smi/rocm_smi_utils.h"
+#include "rocm_smi/rocm_smi_exception.h"
+#include "rocm_smi/rocm_smi_counters.h"
+#include "rocm_smi/rocm_smi_kfd.h"
+#include "rocm_smi/rocm_smi.h"
+
+#include "oam/oam_mapi.h"
+#include "oam/amd_oam.h"
+
+#define TRY try {
+#define CATCH } catch (...) {return handleRSMIException();}
+
+
+// Used by the CATCH macro from inside its catch (...) handler.
+// Delegates to amd::smi::handleException() and returns the resulting
+// rsmi_status_t unchanged, cast to int.
+static int handleRSMIException() {
+  rsmi_status_t ret;
+  ret = amd::smi::handleException();
+
+  // TODO(x): convert RSMI return to OAM return
+  // For now, just return int equiv.
+  return static_cast<int>(ret);
+}
+
+// Initializes the library by calling rsmi_init(0). |version| is not used yet.
+// Returns the rsmi_status_t reported by rsmi_init() as an int, so that a
+// failed initialization is visible to the caller instead of always yielding 0.
+int amdoam_init(oam_mapi_version_t version) {
+  TRY
+
+  // TODO(x): handle version argument
+  (void)version;
+
+  rsmi_status_t ret = rsmi_init(0);
+
+  // TODO(x) convert rsmi return to oam return val
+  return static_cast<int>(ret);
+  CATCH
+}
+
+// Shuts the library down by calling rsmi_shut_down() and returns its status
+// as an int.
+int amdoam_free(void) {
+  rsmi_status_t ret = rsmi_shut_down();
+
+  // TODO(x) convert rsmi return to oam return val
+  return static_cast<int>(ret);
+}
+
+
+// Stores the device count reported by rsmi_num_monitor_devices() in
+// |device_count|. Returns -1 if |device_count| is null, otherwise the
+// rsmi_status_t of the query as an int.
+int amdoam_discover_devices(int *device_count) {
+  if (device_count == nullptr) {
+    return -1;  // TODO(x): return appropriate OAM code
+  }
+
+  // Zero-initialized so that a failed query reports 0 devices rather than an
+  // indeterminate value.
+  uint32_t dv_cnt = 0;
+  rsmi_status_t ret = rsmi_num_monitor_devices(&dv_cnt);
+
+  *device_count = static_cast<int>(dv_cnt);
+
+  // TODO(x) convert rsmi return to oam return val
+  return static_cast<int>(ret);
+}
+
+// TODO(x): This function doesn't work for OAM. It's just a version
+// of rsmi_dev_ecc_count_get(), which has similar functionality.
+// The purpose here is just to drive refactoring; e.g., making macros
+// available and previously static functions global.
+//
+// For now: reads the GFX error counters of device index 0 and stores the sum
+// of the "ue:" and "ce:" values in count->total_error_count. The other fields
+// of |count| are not written. Returns an rsmi_status_t value as an int.
+int
+get_device_error_count(oam_dev_handle_t *handle,
+                       oam_dev_error_count_t *count) {
+  std::vector<std::string> val_vec;
+  rsmi_status_t ret;
+
+  TRY
+  // TODO(x): replace with final code...
+  // Below, we are just returning errors for RSMI_GPU_BLOCK_GFX as a
+  // placeholder
+  (void)handle;  // Just ignore for now
+
+  rsmi_gpu_block_t block = RSMI_GPU_BLOCK_GFX;
+
+  // The macro CHK_SUPPORT_VAR assumes the existence of a device index variable
+  // "dv_ind". Presumably, the device index will come from the "handle"
+  // pointer. Since I don't know how that will be implemented, for now we
+  // will just make up a device index:
+  uint32_t dv_ind = 0;
+  CHK_SUPPORT_VAR(count, block)
+
+  amd::smi::DevInfoTypes type;
+  switch (block) {
+    case RSMI_GPU_BLOCK_UMC:
+      type = amd::smi::kDevErrCntUMC;
+      break;
+
+    case RSMI_GPU_BLOCK_SDMA:
+      type = amd::smi::kDevErrCntSDMA;
+      break;
+
+    case RSMI_GPU_BLOCK_GFX:
+      type = amd::smi::kDevErrCntGFX;
+      break;
+
+    default:
+      return RSMI_STATUS_NOT_SUPPORTED;
+  }
+
+  DEVICE_MUTEX
+
+  ret = GetDevValueVec(type, dv_ind, &val_vec);
+
+  if (ret == RSMI_STATUS_FILE_ERROR) {
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+  if (ret != RSMI_STATUS_SUCCESS) {
+    return ret;
+  }
+
+  // Two lines are expected: "ue: <count>" and "ce: <count>". Also check at
+  // run time, since assert() is compiled out when NDEBUG is defined.
+  assert(val_vec.size() == 2);
+  if (val_vec.size() != 2) {
+    return RSMI_STATUS_UNEXPECTED_SIZE;
+  }
+
+  std::string junk;
+  uint32_t ue_cnt = 0;
+  uint32_t ce_cnt = 0;
+
+  std::istringstream fs1(val_vec[0]);
+  fs1 >> junk;
+  assert(junk == "ue:");
+  fs1 >> ue_cnt;
+
+  std::istringstream fs2(val_vec[1]);
+  fs2 >> junk;
+  assert(junk == "ce:");
+  fs2 >> ce_cnt;
+
+  // Both values used to be streamed into total_error_count, so the "ce:"
+  // value overwrote the "ue:" value. Report their sum instead.
+  // TODO(x): decide how ue/ce map onto the remaining fields of |count|.
+  count->total_error_count = ue_cnt + ce_cnt;
+
+  return ret;
+  CATCH
+}
+
diff --git a/oam/src/oamConfig.in b/oam/src/oamConfig.in
new file mode 100755
index 0000000000..bde279ced1
--- /dev/null
+++ b/oam/src/oamConfig.in
@@ -0,0 +1,56 @@
+/*
+ * =============================================================================
+ * ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ * AMD Research and AMD ROC Software Development
+ *
+ * Advanced Micro Devices, Inc.
+ * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_ +#define INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_ + +// This file is generated on build. 
+
+#define rocm_smi_VERSION_MAJOR @rocm_smi_VERSION_MAJOR@
+#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@
+#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
+#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
+
+#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
\ No newline at end of file
diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt
new file mode 100755
index 0000000000..3875e9bb06
--- /dev/null
+++ b/rocm_smi/CMakeLists.txt
@@ -0,0 +1,143 @@
+#
+# Build rules for the ROCm SMI shared library and its example program
+#
+
+message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
+message(" CMake ROCm SMI (Library) ")
+message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
+
+## Verbose output.
+set(CMAKE_VERBOSE_MAKEFILE on)
+
+# Required Defines first:
+
+message("")
+message("Build Configuration:")
+# message("-----------BuildType: " ${CMAKE_BUILD_TYPE})
+# message("------------Compiler: " ${CMAKE_CXX_COMPILER})
+# message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
+message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
+# message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
+# message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
+# message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
+# message("--------RSMI Lib Dir: " ${RSMI_LIB_DIR})
+# message("--------RSMI Inc Dir: " ${RSMI_INC_DIR})
+# message("")
+
+set(ROCM_SMI "rocm_smi")
+set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}")
+set(ROCM_SMI_TARGET "${ROCM_SMI}64")
+
+################# Determine the library version #########################
+set(SO_VERSION_GIT_TAG_PREFIX "rsmi_so_ver")
+
+# PKG_VERSION_STR is not set in this file; it is expected from the parent scope
+message("Package version: ${PKG_VERSION_STR}")
+
+# Library (.so) version, looked up from git tags with the rsmi_so_ver prefix;
+# "1.0.0.0" is the default value handed to get_version_from_tag
+get_version_from_tag("1.0.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
+
+# VERSION_* variables should be set by get_version_from_tag
+if ( ${ROCM_PATCH_VERSION} )
+  set ( VERSION_PATCH ${ROCM_PATCH_VERSION})
+  set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
+else()
+  set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
+endif ()
+set(${ROCM_SMI}_VERSION_MAJOR "${VERSION_MAJOR}")
+set(${ROCM_SMI}_VERSION_MINOR "${VERSION_MINOR}")
+set(${ROCM_SMI}_VERSION_PATCH "0")
+set(${ROCM_SMI}_VERSION_BUILD "0")
+message("SOVERSION: ${SO_VERSION_STRING}")
+
+
+# Create a configure file to get version info from within library
+configure_file(
+  "${PROJECT_SOURCE_DIR}/src/${ROCM_SMI_TARGET}Config.in"
+  "${PROJECT_SOURCE_DIR}/include/rocm_smi/${ROCM_SMI_TARGET}Config.h")
+
+set(RSMI_SRC_DIR "src")
+set(RSMI_INC_DIR "include")
+set(RSMI_DOCS_DIR "docs")
+
+# Add any rocm_smi_lib specific source files here
+set(SMI_SRC_LIST ${CMN_SRC_LIST})
+
+# Add any rocm_smi_lib specific headers here
+set(SMI_INC_LIST "")
+
+set(SMI_EXAMPLE_EXE "rocm_smi_ex")
+
+add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc")
+target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET})
+add_library(${ROCM_SMI_TARGET} SHARED ${CMN_SRC_LIST} ${SMI_SRC_LIST}
+            ${CMN_INC_LIST} ${SMI_INC_LIST})
+target_link_libraries(${ROCM_SMI_TARGET} pthread rt)
+target_include_directories(${ROCM_SMI_TARGET} PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR} ${COMMON_PROJ_ROOT}/common/shared_mutex)
+
+## Set the VERSION and SOVERSION values
+set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY
+             SOVERSION "${VERSION_MAJOR}")
+set_property(TARGET ${ROCM_SMI_TARGET} PROPERTY
+             VERSION "${SO_VERSION_STRING}")
+
+## If the library is a release, strip the target library
+if ("${CMAKE_BUILD_TYPE}" STREQUAL Release)
+  add_custom_command(
+    TARGET ${ROCM_SMI_TARGET}
+    POST_BUILD COMMAND ${CMAKE_STRIP} lib${ROCM_SMI_TARGET}.so)
+endif ()
+
+## Add symlinks from top level ROCm lib dir to rocm-smi lib so files
+add_custom_target ( so-link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  COMMAND ${CMAKE_COMMAND} -E create_symlink
+  ../${ROCM_SMI}/lib/${ROCM_SMI_LIB_NAME}.so so-link )
+add_custom_target ( so-major-link ALL WORKING_DIRECTORY
+  ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND}
+  -E create_symlink
+  ../${ROCM_SMI}/lib/${ROCM_SMI_LIB_NAME}.so.${VERSION_MAJOR}
+  so-major-link )
+
+install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/so-link DESTINATION lib RENAME
+  ${ROCM_SMI_LIB_NAME}.so )
+install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/so-major-link DESTINATION lib
+  RENAME ${ROCM_SMI_LIB_NAME}.so.${VERSION_MAJOR} )
+
+## Add the install directives for the runtime library.
+install(TARGETS ${ROCM_SMI_TARGET}
+        LIBRARY DESTINATION ${ROCM_SMI}/lib COMPONENT ${ROCM_SMI_COMPONENT})
+install(FILES ${COMMON_SRC_ROOT}/include/rocm_smi/rocm_smi.h
+        DESTINATION rocm_smi/include/rocm_smi)
+install(FILES ${COMMON_SRC_ROOT}/include/rocm_smi/kfd_ioctl.h
+        DESTINATION rocm_smi/include/rocm_smi)
+# Generate Doxygen documentation (config file is docs/amd_smi_doxygen.cfg)
+if (DOXYGEN_FOUND AND LATEX_FOUND)
+  set (RSMI_MANUAL_NAME "ROCm_SMI_Manual")
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/docs/amd_smi_doxygen.cfg
+                 ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.tex
+    COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/docs/amd_smi_doxygen.cfg
+            "${INC_DIR}/rocm_smi.h"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf
+    COMMAND make > /dev/null
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf
+            ${CMAKE_CURRENT_SOURCE_DIR}/docs/${RSMI_MANUAL_NAME}_new.pdf
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.tex
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/latex)
+
+  add_custom_target(docs DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf)
+
+  add_dependencies(${ROCM_SMI_TARGET} docs)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf
+          DESTINATION ${ROCM_SMI}/docs RENAME ${RSMI_MANUAL_NAME}.pdf)
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/docs/README.md
+          DESTINATION ${ROCM_SMI}/docs/)
+else()
+  message("Doxygen or Latex is not found. Will not generate documents.")
+endif(DOXYGEN_FOUND AND LATEX_FOUND)
+
diff --git a/docs/README.md b/rocm_smi/docs/README.md
similarity index 100%
rename from docs/README.md
rename to rocm_smi/docs/README.md
diff --git a/docs/ROCm_SMI_Manual.pdf b/rocm_smi/docs/ROCm_SMI_Manual.pdf
similarity index 100%
rename from docs/ROCm_SMI_Manual.pdf
rename to rocm_smi/docs/ROCm_SMI_Manual.pdf
diff --git a/docs/rsmi_doxygen.cfg b/rocm_smi/docs/amd_smi_doxygen.cfg
similarity index 100%
rename from docs/rsmi_doxygen.cfg
rename to rocm_smi/docs/amd_smi_doxygen.cfg
diff --git a/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc
similarity index 100%
rename from example/rocm_smi_example.cc
rename to rocm_smi/example/rocm_smi_example.cc
diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc
index 282a4ceaa3..9d81ec5b1a 100755
--- a/src/rocm_smi.cc
+++ b/src/rocm_smi.cc
@@ -76,129 +76,8 @@ static const uint32_t kMaxOverdriveLevel = 20; -static rsmi_status_t errno_to_rsmi_status(uint32_t err) { - switch (err) { - case 0: return RSMI_STATUS_SUCCESS; - case ESRCH: return RSMI_STATUS_NOT_FOUND; - case EACCES: return RSMI_STATUS_PERMISSION; - case EPERM: - case ENOENT: return RSMI_STATUS_NOT_SUPPORTED; - case EBADF: - case EISDIR: return RSMI_STATUS_FILE_ERROR; - case EINTR: return RSMI_STATUS_INTERRUPT; - case EIO: return RSMI_STATUS_UNEXPECTED_SIZE; - case ENXIO: return RSMI_STATUS_UNEXPECTED_DATA; - case EBUSY: return RSMI_STATUS_BUSY; - default: return RSMI_STATUS_UNKNOWN_ERROR; - } -} - -static rsmi_status_t handleException() { - try { - throw; - } catch (const std::bad_alloc& e) { - return RSMI_STATUS_OUT_OF_RESOURCES; - } catch (const amd::smi::rsmi_exception& e) { - debug_print("Exception caught: %s.\n", e.what()); - return e.error_code(); - } catch (const std::exception& e) { - debug_print("Exception caught: %s\n", e.what()); - return
RSMI_STATUS_INTERNAL_EXCEPTION; - } catch (const std::nested_exception& e) { - debug_print("Callback threw.\n"); - return RSMI_STATUS_INTERNAL_EXCEPTION; - } catch (int erno) { - return errno_to_rsmi_status(erno); - } catch (...) { - debug_print("Unknown exception caught.\n"); - return RSMI_STATUS_INTERNAL_EXCEPTION; - } -} - #define TRY try { -#define CATCH } catch (...) {return handleException();} - -#define CHECK_DV_IND_RANGE \ - amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \ - if (dv_ind >= smi.monitor_devices().size()) { \ - return RSMI_STATUS_INVALID_ARGS; \ - } \ - -#define GET_DEV_FROM_INDX \ - CHECK_DV_IND_RANGE \ - std::shared_ptr dev = smi.monitor_devices()[dv_ind]; \ - assert(dev != nullptr); - - -#define GET_DEV_AND_KFDNODE_FROM_INDX \ - GET_DEV_FROM_INDX \ - std::shared_ptr kfd_node; \ - if (smi.kfd_node_map().find(dev->kfd_gpu_id()) == \ - smi.kfd_node_map().end()) { \ - return RSMI_INITIALIZATION_ERROR; \ - } \ - kfd_node = smi.kfd_node_map()[dev->kfd_gpu_id()]; - -#define REQUIRE_ROOT_ACCESS \ - if (amd::smi::RocmSMI::getInstance().euid()) { \ - return RSMI_STATUS_PERMISSION; \ - } - -#define DEVICE_MUTEX \ - amd::smi::pthread_wrap _pw(*get_mutex(dv_ind)); \ - amd::smi::RocmSMI& smi_ = amd::smi::RocmSMI::getInstance(); \ - bool blocking_ = !(smi_.init_options() && RSMI_INIT_FLAG_RESRV_TEST1); \ - amd::smi::ScopedPthread _lock(_pw, blocking_); \ - if (!blocking_ && _lock.mutex_not_acquired()) { \ - return RSMI_STATUS_BUSY; \ - } - -/* This group of macros is used to facilitate checking of support for rsmi_dev* - * "getter" functions. When the return buffer is set to nullptr, the macro will - * check the previously gathered device support data to see if the function, - * with possible variants (e.g., memory types, firware types,...) and - * subvariants (e.g. monitors/sensors) are supported. 
- */ -// This macro assumes dev already available -#define CHK_API_SUPPORT_ONLY(RT_PTR, VR, SUB_VR) \ - if ((RT_PTR) == nullptr) { \ - try { \ - if (!dev->DeviceAPISupported(__FUNCTION__, (VR), (SUB_VR))) { \ - return RSMI_STATUS_NOT_SUPPORTED; \ - } \ - return RSMI_STATUS_INVALID_ARGS; \ - } catch (const amd::smi::rsmi_exception& e) { \ - debug_print( \ - "Exception caught when checking if API is supported %s.\n", \ - e.what()); \ - return RSMI_STATUS_INVALID_ARGS; \ - } \ - } - -#define CHK_SUPPORT(RT_PTR, VR, SUB_VR) \ - GET_DEV_FROM_INDX \ - CHK_API_SUPPORT_ONLY((RT_PTR), (VR), (SUB_VR)) - -#define CHK_SUPPORT_NAME_ONLY(RT_PTR) \ - CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) - -#define CHK_SUPPORT_VAR(RT_PTR, VR) \ - CHK_SUPPORT((RT_PTR), (VR), RSMI_DEFAULT_VARIANT) - -#define CHK_SUPPORT_SUBVAR_ONLY(RT_PTR, SUB_VR) \ - CHK_SUPPORT((RT_PTR), RSMI_DEFAULT_VARIANT, (SUB_VR)) - -static pthread_mutex_t *get_mutex(uint32_t dv_ind) { - amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); - - if (dv_ind >= smi.monitor_devices().size()) { - return nullptr; - } - std::shared_ptr dev = smi.monitor_devices()[dv_ind]; - assert(dev != nullptr); - - return dev->mutex(); -} +#define CATCH } catch (...) 
{return amd::smi::handleException();} static uint64_t get_multiplier_from_str(char units_char) { uint32_t multiplier = 0; @@ -404,7 +283,7 @@ static rsmi_status_t get_dev_value_str(amd::smi::DevInfoTypes type, GET_DEV_FROM_INDX int ret = dev->readDevInfo(type, val_str); - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } static rsmi_status_t get_dev_value_int(amd::smi::DevInfoTypes type, uint32_t dv_ind, uint64_t *val_int) { @@ -415,7 +294,7 @@ static rsmi_status_t get_dev_value_int(amd::smi::DevInfoTypes type, GET_DEV_FROM_INDX int ret = dev->readDevInfo(type, val_int); - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } static rsmi_status_t get_dev_value_line(amd::smi::DevInfoTypes type, @@ -427,7 +306,7 @@ static rsmi_status_t get_dev_value_line(amd::smi::DevInfoTypes type, GET_DEV_FROM_INDX int ret = dev->readDevInfoLine(type, val_str); - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } static rsmi_status_t set_dev_value(amd::smi::DevInfoTypes type, @@ -435,7 +314,7 @@ static rsmi_status_t set_dev_value(amd::smi::DevInfoTypes type, GET_DEV_FROM_INDX int ret = dev->writeDevInfo(type, val); - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, @@ -452,7 +331,7 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, int ret = dev->monitor()->readMonitor(type, sensor_ind, &val_str); if (ret) { - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } if (!amd::smi::IsInteger(val_str)) { @@ -480,7 +359,7 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type, int ret = dev->monitor()->readMonitor(type, sensor_ind, &val_str); if (ret) { - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } if (!amd::smi::IsInteger(val_str)) { @@ -504,7 +383,7 @@ static rsmi_status_t set_dev_mon_value(amd::smi::MonitorTypes 
type, int ret = dev->monitor()->writeMonitor(type, sensor_ind, std::to_string(val)); - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type, @@ -517,7 +396,7 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type, uint32_t ret = smi.DiscoverAMDPowerMonitors(); if (ret != 0) { - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } std::shared_ptr dev = smi.monitor_devices()[dv_ind]; @@ -526,20 +405,9 @@ static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type, ret = dev->power_monitor()->readPowerValue(type, val); - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } -static rsmi_status_t get_dev_value_vec(amd::smi::DevInfoTypes type, - uint32_t dv_ind, std::vector *val_vec) { - assert(val_vec != nullptr); - if (val_vec == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } - GET_DEV_FROM_INDX - - int ret = dev->readDevInfo(type, val_vec); - return errno_to_rsmi_status(ret); -} static bool is_power_of_2(uint64_t n) { return n && !(n & (n - 1)); } @@ -654,7 +522,7 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, *enabled_blks = strtoul(tmp_str.c_str(), nullptr, 16); assert(errno == 0); - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); CATCH } @@ -732,7 +600,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, DEVICE_MUTEX - ret = get_dev_value_vec(type, dv_ind, &val_vec); + ret = GetDevValueVec(type, dv_ind, &val_vec); if (ret == RSMI_STATUS_FILE_ERROR) { return RSMI_STATUS_NOT_SUPPORTED; @@ -828,7 +696,7 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) { val_u64 = strtoul(val_str.c_str(), nullptr, 16); assert(errno == 0); if (errno != 0) { - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); } if (val_u64 > 0xFFFF) { return RSMI_STATUS_UNEXPECTED_SIZE; @@ -951,7 +819,7 @@ static 
rsmi_status_t get_frequencies(amd::smi::DevInfoTypes type, return RSMI_STATUS_INVALID_ARGS; } - ret = get_dev_value_vec(type, dv_ind, &val_vec); + ret = GetDevValueVec(type, dv_ind, &val_vec); if (ret != RSMI_STATUS_SUCCESS) { return ret; } @@ -1001,7 +869,7 @@ static rsmi_status_t get_power_profiles(uint32_t dv_ind, return RSMI_STATUS_INVALID_ARGS; } - ret = get_dev_value_vec(amd::smi::kDevPowerProfileMode, dv_ind, &val_vec); + ret = GetDevValueVec(amd::smi::kDevPowerProfileMode, dv_ind, &val_vec); if (ret != RSMI_STATUS_SUCCESS) { return ret; } @@ -1085,7 +953,7 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, return RSMI_STATUS_INVALID_ARGS; } - ret = get_dev_value_vec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec); + ret = GetDevValueVec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec); if (ret != RSMI_STATUS_SUCCESS) { return ret; } @@ -1186,7 +1054,7 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, THROW_IF_NULLPTR_DEREF(p) THROW_IF_NULLPTR_DEREF(num_regions) - ret = get_dev_value_vec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec); + ret = GetDevValueVec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec); if (ret != RSMI_STATUS_SUCCESS) { return ret; } @@ -1395,7 +1263,7 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, ret = get_dev_value_int(dev_type, dv_ind, fw_version); if (ret != 0) { - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } return RSMI_STATUS_SUCCESS; @@ -1487,7 +1355,7 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, } ret_i = dev->writeDevInfo(dev_type, freq_enable_str); - return errno_to_rsmi_status(ret_i); + return amd::smi::ErrnoToRsmiStatus(ret_i); CATCH } @@ -1743,7 +1611,7 @@ rsmi_dev_brand_get(uint32_t dv_ind, char *brand, uint32_t len) { // Retrieve vbios and store in vbios_value string int ret = dev->readDevInfo(amd::smi::kDevVBiosVer, &vbios_value); if (ret != 0) { - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } if 
(vbios_value.length() == 16) { sku_value = vbios_value.substr(4, 6); @@ -1779,7 +1647,7 @@ rsmi_dev_vram_vendor_get(uint32_t dv_ind, char *brand, uint32_t len) { int ret = dev->readDevInfo(amd::smi::kDevVramVendor, &val_str); if (ret != 0) { - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } uint32_t ln = static_cast(val_str.copy(brand, len)); @@ -1893,7 +1761,7 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) { uint32_t ret_i; ret_i = dev->writeDevInfo(amd::smi::kDevPCIEClk, freq_enable_str); - return errno_to_rsmi_status(ret_i); + return amd::smi::ErrnoToRsmiStatus(ret_i); CATCH } @@ -2565,7 +2433,7 @@ rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) { int ret = dev->readDevInfo(amd::smi::kDevVBiosVer, &val_str); if (ret != 0) { - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); } uint32_t ln = static_cast(val_str.copy(vbios, len)); @@ -2629,7 +2497,7 @@ rsmi_version_str_get(rsmi_sw_component_t component, char *ver_str, err = uname(&buf); if (err != 0) { - return errno_to_rsmi_status(err); + return amd::smi::ErrnoToRsmiStatus(err); } val_str = buf.release; @@ -2744,7 +2612,7 @@ rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle) { ret = evt->stopCounter(); delete evt; - return errno_to_rsmi_status(ret);; + return amd::smi::ErrnoToRsmiStatus(ret);; CATCH } @@ -2755,7 +2623,7 @@ rsmi_counter_control(rsmi_event_handle_t evt_handle, amd::smi::evt::Event *evt = reinterpret_cast(evt_handle); - amd::smi::pthread_wrap _pw(*get_mutex(evt->dev_ind())); + amd::smi::pthread_wrap _pw(*amd::smi::GetMutex(evt->dev_ind())); amd::smi::ScopedPthread _lock(_pw); REQUIRE_ROOT_ACCESS @@ -2779,7 +2647,7 @@ rsmi_counter_control(rsmi_event_handle_t evt_handle, assert(!"Unexpected perf counter command"); return RSMI_STATUS_INVALID_ARGS; } - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); CATCH } @@ -2810,7 +2678,7 @@ rsmi_counter_read(rsmi_event_handle_t 
evt_handle, ret = evt->getValue(value); } - return errno_to_rsmi_status(ret); + return amd::smi::ErrnoToRsmiStatus(ret); CATCH } @@ -2868,7 +2736,7 @@ rsmi_compute_process_info_get(rsmi_process_info_t *procs, int err = amd::smi::GetProcessInfo(procs, *num_items, &procs_found); if (err) { - return errno_to_rsmi_status(err); + return amd::smi::ErrnoToRsmiStatus(err); } if (procs && *num_items < procs_found) { @@ -2896,7 +2764,7 @@ rsmi_compute_process_gpus_get(uint32_t pid, uint32_t *dv_indices, int err = amd::smi::GetProcessGPUs(pid, &gpu_set); if (err) { - return errno_to_rsmi_status(err); + return amd::smi::ErrnoToRsmiStatus(err); } uint32_t i = 0; @@ -2936,7 +2804,7 @@ rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages, std::vector val_vec; - ret = get_dev_value_vec(amd::smi::kDevMemPageBad, dv_ind, &val_vec); + ret = GetDevValueVec(amd::smi::kDevMemPageBad, dv_ind, &val_vec); if (ret == RSMI_STATUS_FILE_ERROR) { return RSMI_STATUS_NOT_SUPPORTED; @@ -3017,7 +2885,7 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid, int err = amd::smi::GetProcessInfoForPID(pid, proc, &gpu_set); if (err) { - return errno_to_rsmi_status(err); + return amd::smi::ErrnoToRsmiStatus(err); } return RSMI_STATUS_SUCCESS; @@ -3534,7 +3402,7 @@ rsmi_event_notification_init(uint32_t dv_ind) { int ret = ioctl(smi.kfd_notif_evt_fh(), AMDKFD_IOC_SMI_EVENTS, &args); if (ret < 0) { - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); } if (args.anon_fd < 1) { return RSMI_STATUS_NO_DATA; @@ -3544,7 +3412,7 @@ rsmi_event_notification_init(uint32_t dv_ind) { FILE *anon_file_ptr = fdopen(args.anon_fd, "r"); if (anon_file_ptr == nullptr) { close(dev->evt_notif_anon_fd()); - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); } dev->set_evt_notif_anon_file_ptr(anon_file_ptr); @@ -3564,7 +3432,7 @@ rsmi_event_notification_mask_set(uint32_t dv_ind, uint64_t mask) { ssize_t ret = write(dev->evt_notif_anon_fd(), &mask, 
sizeof(uint64_t)); if (ret == -1) { - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); } return RSMI_STATUS_SUCCESS; @@ -3645,7 +3513,7 @@ rsmi_event_notification_get(int timeout_ms, fill_data_buffer(false); if (*num_elem < buffer_size && errno != EAGAIN) { - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); } else if (*num_elem >= buffer_size) { return RSMI_STATUS_SUCCESS; } @@ -3655,7 +3523,7 @@ rsmi_event_notification_get(int timeout_ms, if (p_ret > 0) { fill_data_buffer(true); } else if (p_ret < 0) { - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); } if (*num_elem == 0) { return RSMI_STATUS_NO_DATA; @@ -3684,7 +3552,7 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { int ret = close(smi.kfd_notif_evt_fh()); smi.set_kfd_notif_evt_fh(-1); if (ret < 0) { - return errno_to_rsmi_status(errno); + return amd::smi::ErrnoToRsmiStatus(errno); } } @@ -3700,7 +3568,7 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { rsmi_status_t rsmi_test_sleep(uint32_t dv_ind, uint32_t seconds) { // DEVICE_MUTEX - amd::smi::pthread_wrap _pw(*get_mutex(dv_ind)); + amd::smi::pthread_wrap _pw(*amd::smi::GetMutex(dv_ind)); amd::smi::RocmSMI& smi_ = amd::smi::RocmSMI::getInstance(); bool blocking_ = !(smi_.init_options() && RSMI_INIT_FLAG_RESRV_TEST1); amd::smi::ScopedPthread _lock(_pw, blocking_); diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index a4630e092d..91dadb6c33 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -50,6 +50,13 @@ #include #include #include +#include + +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_device.h" namespace amd { namespace smi { @@ -151,5 +158,68 @@ bool IsInteger(const std::string & n_str) { return (*tmp == 0); } + +rsmi_status_t handleException() { + try { + throw; + } catch 
(const std::bad_alloc& e) { + debug_print("RSMI exception: BadAlloc\n"); + return RSMI_STATUS_OUT_OF_RESOURCES; + } catch (const amd::smi::rsmi_exception& e) { + debug_print("Exception caught: %s.\n", e.what()); + return e.error_code(); + } catch (const std::exception& e) { + debug_print("Exception caught: %s\n", e.what()); + return RSMI_STATUS_INTERNAL_EXCEPTION; + } catch (const std::nested_exception& e) { + debug_print("Callback threw.\n"); + return RSMI_STATUS_INTERNAL_EXCEPTION; + } catch (...) { + debug_print("Unknown exception caught.\n"); + return RSMI_STATUS_INTERNAL_EXCEPTION; + } +} + +pthread_mutex_t *GetMutex(uint32_t dv_ind) { + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + + if (dv_ind >= smi.monitor_devices().size()) { + return nullptr; + } + std::shared_ptr dev = smi.monitor_devices()[dv_ind]; + assert(dev != nullptr); + + return dev->mutex(); +} + +rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type, + uint32_t dv_ind, std::vector *val_vec) { + assert(val_vec != nullptr); + if (val_vec == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + GET_DEV_FROM_INDX + + int ret = dev->readDevInfo(type, val_vec); + return ErrnoToRsmiStatus(ret); +} + +rsmi_status_t ErrnoToRsmiStatus(uint32_t err) { + switch (err) { + case 0: return RSMI_STATUS_SUCCESS; + case ESRCH: return RSMI_STATUS_NOT_FOUND; + case EACCES: return RSMI_STATUS_PERMISSION; + case EPERM: + case ENOENT: return RSMI_STATUS_NOT_SUPPORTED; + case EBADF: + case EISDIR: return RSMI_STATUS_FILE_ERROR; + case EINTR: return RSMI_STATUS_INTERRUPT; + case EIO: return RSMI_STATUS_UNEXPECTED_SIZE; + case ENXIO: return RSMI_STATUS_UNEXPECTED_DATA; + case EBUSY: return RSMI_STATUS_BUSY; + default: return RSMI_STATUS_UNKNOWN_ERROR; + } +} + } // namespace smi } // namespace amd diff --git a/src/shared_mutex/LICENSE b/third_party/shared_mutex/LICENSE similarity index 100% rename from src/shared_mutex/LICENSE rename to third_party/shared_mutex/LICENSE diff --git 
a/src/shared_mutex/shared_mutex.cc b/third_party/shared_mutex/shared_mutex.cc similarity index 100% rename from src/shared_mutex/shared_mutex.cc rename to third_party/shared_mutex/shared_mutex.cc diff --git a/src/shared_mutex/shared_mutex.h b/third_party/shared_mutex/shared_mutex.h similarity index 100% rename from src/shared_mutex/shared_mutex.h rename to third_party/shared_mutex/shared_mutex.h