add configs for read the docs
add handbook, user, install, and integration guides
Change-Id: I996f6909f4fdf76910981c0224f5a0266907e27a
remove old documentation steps
Change-Id: Icfad09926e67a2dfa1de0e182fc3cd534f0448f7
formatting fixes
Change-Id: I704bbbbf6ad384178f804e4a3f5e621f9c3d33b9
[ROCm/rdc commit: 1335d19020]
@@ -0,0 +1,12 @@
|
||||
# To get started with Dependabot version updates, you'll need to specify which
|
||||
# package ecosystems to update and where the package manifests are located.
|
||||
# Please see the documentation for all configuration options:
|
||||
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
|
||||
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "pip" # See documentation for possible values
|
||||
directory: "/docs/.sphinx" # Location of package manifests
|
||||
open-pull-requests-limit: 10
|
||||
schedule:
|
||||
interval: "daily"
|
||||
@@ -1,14 +1,8 @@
|
||||
# my install directory used for testing
|
||||
install/
|
||||
|
||||
# build directories generated by cmake
|
||||
# documentation artifacts
|
||||
build/
|
||||
cmake/build/
|
||||
.cache/
|
||||
|
||||
# build artifacts
|
||||
include/rdc/rdc64Config.h
|
||||
DEBIAN/postinst
|
||||
DEBIAN/prerm
|
||||
RPM/
|
||||
docs/*.pdf
|
||||
_build/
|
||||
_images/
|
||||
_static/
|
||||
_templates/
|
||||
_toc.yml
|
||||
docBin/
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# Read the Docs configuration file
|
||||
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
||||
|
||||
version: 2
|
||||
|
||||
sphinx:
|
||||
configuration: docs/conf.py
|
||||
|
||||
formats: [htmlzip]
|
||||
|
||||
python:
|
||||
version: "3.8"
|
||||
install:
|
||||
- requirements: docs/.sphinx/requirements.txt
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
# ROCm<sup>TM</sup> Data Center Tool (RDC)
|
||||
|
||||
The ROCm™ Data Center Tool simplifies the administration and addresses key infrastructure challenges in AMD GPUs in cluster and datacenter environments. The main features are:
|
||||
@@ -32,6 +31,7 @@ RDC can run on AMD ROCm supported platforms, please refer to the **List of Suppo
|
||||
(ii) AMD ROCk Kernel driver (https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver)
|
||||
|
||||
## Building gRPC and protoc
|
||||
|
||||
**NOTE:** gRPC and protoc compiler must be built when building RDC from source as pre-built packages are not available. When installing RDC from a package, gRPC and protoc will be installed from the package.
|
||||
|
||||
**IMPORTANT:** Building gRPC and protocol buffers requires CMake 3.15 or greater. With an older version, the build will quietly succeed with a *message*. However, not all components of gRPC will be installed, and RDC will ***fail*** to run.
|
||||
@@ -50,7 +50,8 @@ mkdir -p build
|
||||
|
||||
By default (without using CMAKE_INSTALL_PREFIX option), gRPC will install to /usr/local lib, include and bin directories.
|
||||
It is highly recommended to install gRPC into a unique directory.
|
||||
Below example installs gRPC into /opt/grpc
|
||||
Below example installs gRPC into /opt/grpc
|
||||
|
||||
```bash
|
||||
export GRPC_ROOT=/opt/grpc
|
||||
cmake -B build -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX="$GRPC_ROOT"
|
||||
|
||||
@@ -121,41 +121,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/authentication
|
||||
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${RDC}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
|
||||
# Generate Doxygen documentation for client api manual
|
||||
find_package(Doxygen)
|
||||
find_package(LATEX COMPONENTS PDFLATEX)
|
||||
if(DOXYGEN_FOUND AND LATEX_FOUND)
|
||||
set(RDC_MANUAL_NAME "RDC_API_Manual")
|
||||
message("Building ${RDC_MANUAL_NAME}.pdf")
|
||||
configure_file(${PROJECT_SOURCE_DIR}/docs/rdc_doxygen.cfg
|
||||
${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
|
||||
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.tex
|
||||
COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
|
||||
DEPENDS ${PROJECT_SOURCE_DIR}/docs/rdc_doxygen.cfg
|
||||
"${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf
|
||||
COMMAND make > /dev/null
|
||||
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf
|
||||
${PROJECT_SOURCE_DIR}/docs/${RDC_MANUAL_NAME}_new.pdf
|
||||
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.tex
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/latex)
|
||||
|
||||
add_custom_target(docs DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf)
|
||||
|
||||
add_dependencies(${CLIENT_LIB} docs)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/latex/refman.pdf
|
||||
DESTINATION ${CMAKE_INSTALL_DOCDIR}
|
||||
COMPONENT ${CLIENT_COMPONENT} RENAME ${RDC_MANUAL_NAME}.pdf)
|
||||
install(FILES ${PROJECT_SOURCE_DIR}/README.md
|
||||
DESTINATION ${CMAKE_INSTALL_DOCDIR}
|
||||
COMPONENT ${CLIENT_COMPONENT})
|
||||
else()
|
||||
message("Doxygen or Latex is not found. Will not generate documents.")
|
||||
endif(DOXYGEN_FOUND AND LATEX_FOUND)
|
||||
|
||||
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
message(" Finished Cmake Client Lib ")
|
||||
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Doxyfile 1.8.11
|
||||
# Doxyfile 1.8.10
|
||||
|
||||
# This file describes the settings to be used by the documentation system
|
||||
# doxygen (www.doxygen.org) for a project.
|
||||
@@ -38,7 +38,7 @@ PROJECT_NAME = "RDC"
|
||||
# could be handy for archiving the generated documentation or if some version
|
||||
# control system is used.
|
||||
|
||||
PROJECT_NUMBER =
|
||||
PROJECT_NUMBER =
|
||||
|
||||
# Using the PROJECT_BRIEF tag one can provide an optional one line description
|
||||
# for a project that appears at the top of each page and should give viewer a
|
||||
@@ -51,14 +51,14 @@ PROJECT_BRIEF = "Radeon Data Center Tools Reference Manual"
|
||||
# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
|
||||
# the logo to the output directory.
|
||||
|
||||
PROJECT_LOGO =
|
||||
PROJECT_LOGO =
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
|
||||
# into which the generated documentation will be written. If a relative path is
|
||||
# entered, it will be relative to the location where doxygen was started. If
|
||||
# left blank the current directory will be used.
|
||||
|
||||
OUTPUT_DIRECTORY =
|
||||
OUTPUT_DIRECTORY = docBin
|
||||
|
||||
# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
|
||||
# directories (in 2 levels) under the output directory of each output format and
|
||||
@@ -118,7 +118,17 @@ REPEAT_BRIEF = YES
|
||||
# the entity):The $name class, The $name widget, The $name file, is, provides,
|
||||
# specifies, contains, represents, a, an and the.
|
||||
|
||||
ABBREVIATE_BRIEF =
|
||||
ABBREVIATE_BRIEF = "The $name class" \
|
||||
"The $name widget" \
|
||||
"The $name file" \
|
||||
is \
|
||||
provides \
|
||||
specifies \
|
||||
contains \
|
||||
represents \
|
||||
a \
|
||||
an \
|
||||
the
|
||||
|
||||
# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
|
||||
# doxygen will generate a detailed section even if there is only a brief
|
||||
@@ -140,7 +150,7 @@ INLINE_INHERITED_MEMB = NO
|
||||
# shortest path that makes the file name unique will be used
|
||||
# The default value is: YES.
|
||||
|
||||
FULL_PATH_NAMES = NO
|
||||
FULL_PATH_NAMES = YES
|
||||
|
||||
# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
|
||||
# Stripping is only done if one of the specified strings matches the left-hand
|
||||
@@ -242,7 +252,7 @@ TCL_SUBST =
|
||||
# members will be omitted, etc.
|
||||
# The default value is: NO.
|
||||
|
||||
OPTIMIZE_OUTPUT_FOR_C = YES
|
||||
OPTIMIZE_OUTPUT_FOR_C = NO
|
||||
|
||||
# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
|
||||
# Python sources only. Doxygen will then generate output that is more tailored
|
||||
@@ -309,7 +319,7 @@ AUTOLINK_SUPPORT = YES
|
||||
# diagrams that involve STL classes more complete and accurate.
|
||||
# The default value is: NO.
|
||||
|
||||
BUILTIN_STL_SUPPORT = NO
|
||||
BUILTIN_STL_SUPPORT = YES
|
||||
|
||||
# If you use Microsoft's C++/CLI language, you should set this option to YES to
|
||||
# enable parsing support.
|
||||
@@ -341,7 +351,14 @@ IDL_PROPERTY_SUPPORT = YES
|
||||
# all members of a group must be documented explicitly.
|
||||
# The default value is: NO.
|
||||
|
||||
DISTRIBUTE_GROUP_DOC = NO
|
||||
DISTRIBUTE_GROUP_DOC = YES
|
||||
|
||||
# If one adds a struct or class to a group and this option is enabled, then also
|
||||
# any nested class or struct is added to the same group. By default this option
|
||||
# is disabled and one has to add nested compounds explicitly via \ingroup.
|
||||
# The default value is: NO.
|
||||
|
||||
GROUP_NESTED_COMPOUNDS = NO
|
||||
|
||||
# Set the SUBGROUPING tag to YES to allow class member groups of the same type
|
||||
# (for instance a group of public functions) to be put as a subgroup of that
|
||||
@@ -382,7 +399,7 @@ INLINE_SIMPLE_STRUCTS = NO
|
||||
# types are typedef'ed and only the typedef is referenced, never the tag name.
|
||||
# The default value is: NO.
|
||||
|
||||
TYPEDEF_HIDES_STRUCT = NO
|
||||
TYPEDEF_HIDES_STRUCT = YES
|
||||
|
||||
# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
|
||||
# cache is used to resolve symbols given their name and scope. Since this can be
|
||||
@@ -409,7 +426,7 @@ LOOKUP_CACHE_SIZE = 0
|
||||
# normally produced when WARNINGS is set to YES.
|
||||
# The default value is: NO.
|
||||
|
||||
EXTRACT_ALL = NO
|
||||
EXTRACT_ALL = YES
|
||||
|
||||
# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
|
||||
# be included in the documentation.
|
||||
@@ -498,7 +515,7 @@ INTERNAL_DOCS = NO
|
||||
# and Mac users are advised to set this option to NO.
|
||||
# The default value is: system dependent.
|
||||
|
||||
CASE_SENSE_NAMES = YES
|
||||
CASE_SENSE_NAMES = NO
|
||||
|
||||
# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
|
||||
# their full class and namespace scopes in the documentation. If set to YES, the
|
||||
@@ -544,7 +561,7 @@ INLINE_INFO = YES
|
||||
# name. If set to NO, the members will appear in declaration order.
|
||||
# The default value is: YES.
|
||||
|
||||
SORT_MEMBER_DOCS = NO
|
||||
SORT_MEMBER_DOCS = YES
|
||||
|
||||
# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
|
||||
# descriptions of file, namespace and class members alphabetically by member
|
||||
@@ -758,7 +775,7 @@ WARN_LOGFILE =
|
||||
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
|
||||
# Note: If this tag is empty the current directory is searched.
|
||||
|
||||
INPUT = @PROJECT_SOURCE_DIR@/include/rdc/rdc.h
|
||||
INPUT = ../../include/rdc/rdc.h
|
||||
|
||||
# This tag can be used to specify the character encoding of the source files
|
||||
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
|
||||
@@ -780,10 +797,48 @@ INPUT_ENCODING = UTF-8
|
||||
# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
|
||||
# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
|
||||
# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
|
||||
# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl,
|
||||
# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js.
|
||||
# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
|
||||
# *.vhdl, *.ucf, *.qsf, *.as and *.js.
|
||||
|
||||
FILE_PATTERNS =
|
||||
FILE_PATTERNS = *.c \
|
||||
*.cc \
|
||||
*.cxx \
|
||||
*.cpp \
|
||||
*.c++ \
|
||||
*.java \
|
||||
*.ii \
|
||||
*.ixx \
|
||||
*.ipp \
|
||||
*.i++ \
|
||||
*.inl \
|
||||
*.idl \
|
||||
*.ddl \
|
||||
*.odl \
|
||||
*.h \
|
||||
*.hh \
|
||||
*.hxx \
|
||||
*.hpp \
|
||||
*.h++ \
|
||||
*.cs \
|
||||
*.d \
|
||||
*.php \
|
||||
*.php4 \
|
||||
*.php5 \
|
||||
*.phtml \
|
||||
*.inc \
|
||||
*.m \
|
||||
*.markdown \
|
||||
*.md \
|
||||
*.mm \
|
||||
*.dox \
|
||||
*.py \
|
||||
*.tcl \
|
||||
*.vhd \
|
||||
*.vhdl \
|
||||
*.ucf \
|
||||
*.qsf \
|
||||
*.as \
|
||||
*.js
|
||||
|
||||
# The RECURSIVE tag can be used to specify whether or not subdirectories should
|
||||
# be searched for input files as well.
|
||||
@@ -838,7 +893,7 @@ EXAMPLE_PATH =
|
||||
# *.h) to filter out the source-files in the directories. If left blank all
|
||||
# files are included.
|
||||
|
||||
EXAMPLE_PATTERNS =
|
||||
EXAMPLE_PATTERNS = *
|
||||
|
||||
# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
|
||||
# searched for input files to be used with the \include or \dontinclude commands
|
||||
@@ -867,10 +922,6 @@ IMAGE_PATH =
|
||||
# Note that the filter must not add or remove lines; it is applied before the
|
||||
# code is scanned, but not when the output code is generated. If lines are added
|
||||
# or removed, the anchors will not be placed correctly.
|
||||
#
|
||||
# Note that for custom extensions or not directly supported extensions you also
|
||||
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
|
||||
# properly processed by doxygen.
|
||||
|
||||
INPUT_FILTER =
|
||||
|
||||
@@ -880,10 +931,6 @@ INPUT_FILTER =
|
||||
# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
|
||||
# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
|
||||
# patterns match the file name, INPUT_FILTER is applied.
|
||||
#
|
||||
# Note that for custom extensions or not directly supported extensions you also
|
||||
# need to set EXTENSION_MAPPING for the extension otherwise the files are not
|
||||
# properly processed by doxygen.
|
||||
|
||||
FILTER_PATTERNS =
|
||||
|
||||
@@ -907,7 +954,7 @@ FILTER_SOURCE_PATTERNS =
|
||||
# (index.html). This can be useful if you have a project on for instance GitHub
|
||||
# and want to reuse the introduction page also for the doxygen output.
|
||||
|
||||
USE_MDFILE_AS_MAINPAGE = @PROJECT_SOURCE_DIR@/docs/README.md
|
||||
USE_MDFILE_AS_MAINPAGE = ../README.md
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to source browsing
|
||||
@@ -1001,7 +1048,7 @@ VERBATIM_HEADERS = YES
|
||||
# rich C++ code for which doxygen's built-in parser lacks the necessary type
|
||||
# information.
|
||||
# Note: The availability of this option depends on whether or not doxygen was
|
||||
# generated with the -Duse-libclang=ON option for CMake.
|
||||
# compiled with the --with-libclang option.
|
||||
# The default value is: NO.
|
||||
|
||||
CLANG_ASSISTED_PARSING = NO
|
||||
@@ -1413,7 +1460,7 @@ GENERATE_TREEVIEW = NO
|
||||
# Minimum value: 0, maximum value: 20, default value: 4.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
ENUM_VALUES_PER_LINE = 4
|
||||
ENUM_VALUES_PER_LINE = 1
|
||||
|
||||
# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
|
||||
# to set the initial width (in pixels) of the frame in which the tree is shown.
|
||||
@@ -1436,7 +1483,7 @@ EXT_LINKS_IN_WINDOW = NO
|
||||
# Minimum value: 8, maximum value: 50, default value: 10.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
FORMULA_FONTSIZE = 16
|
||||
FORMULA_FONTSIZE = 10
|
||||
|
||||
# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
|
||||
# generated for formulas are transparent PNGs. Transparent PNGs are not
|
||||
@@ -1458,7 +1505,7 @@ FORMULA_TRANSPARENT = YES
|
||||
# The default value is: NO.
|
||||
# This tag requires that the tag GENERATE_HTML is set to YES.
|
||||
|
||||
USE_MATHJAX = NO
|
||||
USE_MATHJAX = YES
|
||||
|
||||
# When MathJax is enabled you can set the default output format to be used for
|
||||
# the MathJax output. See the MathJax site (see:
|
||||
@@ -1591,7 +1638,7 @@ EXTRA_SEARCH_MAPPINGS =
|
||||
# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
|
||||
# The default value is: YES.
|
||||
|
||||
GENERATE_LATEX = YES
|
||||
GENERATE_LATEX = NO
|
||||
|
||||
# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
|
||||
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
|
||||
@@ -1745,14 +1792,6 @@ LATEX_SOURCE_CODE = NO
|
||||
|
||||
LATEX_BIB_STYLE = plain
|
||||
|
||||
# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
|
||||
# page will contain the date and time when the page was generated. Setting this
|
||||
# to NO can help when comparing the output of multiple runs.
|
||||
# The default value is: NO.
|
||||
# This tag requires that the tag GENERATE_LATEX is set to YES.
|
||||
|
||||
LATEX_TIMESTAMP = NO
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# Configuration options related to the RTF output
|
||||
#---------------------------------------------------------------------------
|
||||
@@ -1871,7 +1910,7 @@ MAN_LINKS = NO
|
||||
# captures the structure of the code including all documentation.
|
||||
# The default value is: NO.
|
||||
|
||||
GENERATE_XML = NO
|
||||
GENERATE_XML = YES
|
||||
|
||||
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
|
||||
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
|
||||
@@ -1984,7 +2023,7 @@ ENABLE_PREPROCESSING = YES
|
||||
# The default value is: NO.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
MACRO_EXPANSION = NO
|
||||
MACRO_EXPANSION = YES
|
||||
|
||||
# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
|
||||
# the macro expansion is limited to the macros specified with the PREDEFINED and
|
||||
@@ -1992,14 +2031,14 @@ MACRO_EXPANSION = NO
|
||||
# The default value is: NO.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
EXPAND_ONLY_PREDEF = NO
|
||||
EXPAND_ONLY_PREDEF = YES
|
||||
|
||||
# If the SEARCH_INCLUDES tag is set to YES, the include files in the
|
||||
# INCLUDE_PATH will be searched if a #include is found.
|
||||
# The default value is: YES.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
SEARCH_INCLUDES = YES
|
||||
SEARCH_INCLUDES = NO
|
||||
|
||||
# The INCLUDE_PATH tag can be used to specify one or more directories that
|
||||
# contain include files that are not input files but should be processed by the
|
||||
@@ -2024,7 +2063,7 @@ INCLUDE_FILE_PATTERNS =
|
||||
# recursively expanded use the := operator instead of the = operator.
|
||||
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
|
||||
|
||||
PREDEFINED =
|
||||
PREDEFINED =
|
||||
|
||||
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
|
||||
# tag can be used to specify a list of macro names that should be expanded. The
|
||||
@@ -2108,7 +2147,7 @@ PERL_PATH = /usr/bin/perl
|
||||
# powerful graphs.
|
||||
# The default value is: YES.
|
||||
|
||||
CLASS_DIAGRAMS = YES
|
||||
CLASS_DIAGRAMS = NO
|
||||
|
||||
# You can define message sequence charts within doxygen comments using the \msc
|
||||
# command. Doxygen will then run the mscgen tool (see:
|
||||
@@ -2137,7 +2176,7 @@ HIDE_UNDOC_RELATIONS = YES
|
||||
# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
|
||||
# Bell Labs. The other options in this section have no effect if this option is
|
||||
# set to NO
|
||||
# The default value is: YES.
|
||||
# The default value is: NO.
|
||||
|
||||
HAVE_DOT = NO
|
||||
|
||||
@@ -2293,9 +2332,7 @@ DIRECTORY_GRAPH = YES
|
||||
# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
|
||||
# to make the SVG files visible in IE 9+ (other browsers do not have this
|
||||
# requirement).
|
||||
# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
|
||||
# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
|
||||
# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
|
||||
# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
|
||||
# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
|
||||
# png:gdiplus:gdiplus.
|
||||
# The default value is: png.
|
||||
@@ -2396,7 +2433,7 @@ DOT_TRANSPARENT = NO
|
||||
# The default value is: NO.
|
||||
# This tag requires that the tag HAVE_DOT is set to YES.
|
||||
|
||||
DOT_MULTI_TARGETS = YES
|
||||
DOT_MULTI_TARGETS = NO
|
||||
|
||||
# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
|
||||
# explaining the meaning of the various boxes and arrows in the dot generated
|
||||
@@ -0,0 +1,19 @@
|
||||
# Anywhere {branch} is used, the branch name will be substituted.
|
||||
# These comments will also be removed.
|
||||
defaults:
|
||||
numbered: False
|
||||
root: index
|
||||
subtrees:
|
||||
- caption: User Guide
|
||||
entries:
|
||||
- file: user_guide/user_guide
|
||||
subtrees:
|
||||
- entries:
|
||||
- file: user_guide/install
|
||||
- file: user_guide/features
|
||||
- file: user_guide/integration
|
||||
- file: user_guide/handbook
|
||||
- file: user_guide/api
|
||||
- caption: API Reference
|
||||
entries:
|
||||
- file: api_ref
|
||||
@@ -0,0 +1 @@
|
||||
rocm-docs-core==0.2.0
|
||||
@@ -0,0 +1,268 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.10
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile .sphinx/requirements.in
|
||||
#
|
||||
accessible-pygments==0.0.4
|
||||
# via pydata-sphinx-theme
|
||||
alabaster==0.7.13
|
||||
# via sphinx
|
||||
asttokens==2.2.1
|
||||
# via stack-data
|
||||
attrs==22.2.0
|
||||
# via
|
||||
# jsonschema
|
||||
# jupyter-cache
|
||||
babel==2.12.1
|
||||
# via
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
backcall==0.2.0
|
||||
# via ipython
|
||||
beautifulsoup4==4.12.1
|
||||
# via pydata-sphinx-theme
|
||||
breathe==4.34.0
|
||||
# via rocm-docs-core
|
||||
certifi==2022.12.7
|
||||
# via requests
|
||||
cffi==1.15.1
|
||||
# via pynacl
|
||||
charset-normalizer==3.1.0
|
||||
# via requests
|
||||
click==8.1.3
|
||||
# via
|
||||
# jupyter-cache
|
||||
# sphinx-external-toc
|
||||
comm==0.1.3
|
||||
# via ipykernel
|
||||
debugpy==1.6.6
|
||||
# via ipykernel
|
||||
decorator==5.1.1
|
||||
# via ipython
|
||||
deprecated==1.2.13
|
||||
# via pygithub
|
||||
docutils==0.16
|
||||
# via
|
||||
# breathe
|
||||
# myst-parser
|
||||
# pydata-sphinx-theme
|
||||
# rocm-docs-core
|
||||
# sphinx
|
||||
executing==1.2.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.16.3
|
||||
# via nbformat
|
||||
gitdb==4.0.10
|
||||
# via gitpython
|
||||
gitpython==3.1.31
|
||||
# via rocm-docs-core
|
||||
greenlet==2.0.2
|
||||
# via sqlalchemy
|
||||
idna==3.4
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# jupyter-cache
|
||||
# myst-nb
|
||||
ipykernel==6.22.0
|
||||
# via myst-nb
|
||||
ipython==8.12.0
|
||||
# via
|
||||
# ipykernel
|
||||
# myst-nb
|
||||
jedi==0.18.2
|
||||
# via ipython
|
||||
jinja2==3.1.2
|
||||
# via
|
||||
# myst-parser
|
||||
# sphinx
|
||||
jsonschema==4.17.3
|
||||
# via nbformat
|
||||
jupyter-cache==0.5.0
|
||||
# via myst-nb
|
||||
jupyter-client==8.1.0
|
||||
# via
|
||||
# ipykernel
|
||||
# nbclient
|
||||
jupyter-core==5.3.0
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
# nbformat
|
||||
linkify-it-py==1.0.3
|
||||
# via myst-parser
|
||||
markdown-it-py==2.2.0
|
||||
# via
|
||||
# mdit-py-plugins
|
||||
# myst-parser
|
||||
markupsafe==2.1.2
|
||||
# via jinja2
|
||||
matplotlib-inline==0.1.6
|
||||
# via
|
||||
# ipykernel
|
||||
# ipython
|
||||
mdit-py-plugins==0.3.5
|
||||
# via myst-parser
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
myst-nb==0.17.1
|
||||
# via rocm-docs-core
|
||||
myst-parser[linkify]==0.18.1
|
||||
# via
|
||||
# myst-nb
|
||||
# rocm-docs-core
|
||||
nbclient==0.5.13
|
||||
# via
|
||||
# jupyter-cache
|
||||
# myst-nb
|
||||
nbformat==5.8.0
|
||||
# via
|
||||
# jupyter-cache
|
||||
# myst-nb
|
||||
# nbclient
|
||||
nest-asyncio==1.5.6
|
||||
# via
|
||||
# ipykernel
|
||||
# nbclient
|
||||
packaging==23.0
|
||||
# via
|
||||
# ipykernel
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
parso==0.8.3
|
||||
# via jedi
|
||||
pexpect==4.8.0
|
||||
# via ipython
|
||||
pickleshare==0.7.5
|
||||
# via ipython
|
||||
platformdirs==3.2.0
|
||||
# via jupyter-core
|
||||
prompt-toolkit==3.0.38
|
||||
# via ipython
|
||||
psutil==5.9.4
|
||||
# via ipykernel
|
||||
ptyprocess==0.7.0
|
||||
# via pexpect
|
||||
pure-eval==0.2.2
|
||||
# via stack-data
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
pydata-sphinx-theme==0.13.3
|
||||
# via sphinx-book-theme
|
||||
pygithub==1.57
|
||||
# via rocm-docs-core
|
||||
pygments==2.14.0
|
||||
# via
|
||||
# accessible-pygments
|
||||
# ipython
|
||||
# pydata-sphinx-theme
|
||||
# sphinx
|
||||
pyjwt==2.6.0
|
||||
# via pygithub
|
||||
pynacl==1.5.0
|
||||
# via pygithub
|
||||
pyrsistent==0.19.3
|
||||
# via jsonschema
|
||||
python-dateutil==2.8.2
|
||||
# via jupyter-client
|
||||
pyyaml==6.0
|
||||
# via
|
||||
# jupyter-cache
|
||||
# myst-nb
|
||||
# myst-parser
|
||||
# sphinx-external-toc
|
||||
pyzmq==25.0.2
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
requests==2.28.2
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==0.2.0
|
||||
# via -r .sphinx/requirements.in
|
||||
six==1.16.0
|
||||
# via
|
||||
# asttokens
|
||||
# python-dateutil
|
||||
smmap==5.0.0
|
||||
# via gitdb
|
||||
snowballstemmer==2.2.0
|
||||
# via sphinx
|
||||
soupsieve==2.4
|
||||
# via beautifulsoup4
|
||||
sphinx==4.3.1
|
||||
# via
|
||||
# breathe
|
||||
# myst-nb
|
||||
# myst-parser
|
||||
# pydata-sphinx-theme
|
||||
# rocm-docs-core
|
||||
# sphinx-book-theme
|
||||
# sphinx-copybutton
|
||||
# sphinx-design
|
||||
# sphinx-external-toc
|
||||
# sphinx-notfound-page
|
||||
sphinx-book-theme==1.0.0rc2
|
||||
# via rocm-docs-core
|
||||
sphinx-copybutton==0.5.1
|
||||
# via rocm-docs-core
|
||||
sphinx-design==0.3.0
|
||||
# via rocm-docs-core
|
||||
sphinx-external-toc==0.3.1
|
||||
# via rocm-docs-core
|
||||
sphinx-notfound-page==0.8.3
|
||||
# via rocm-docs-core
|
||||
sphinxcontrib-applehelp==1.0.4
|
||||
# via sphinx
|
||||
sphinxcontrib-devhelp==1.0.2
|
||||
# via sphinx
|
||||
sphinxcontrib-htmlhelp==2.0.1
|
||||
# via sphinx
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
# via sphinx
|
||||
sphinxcontrib-qthelp==1.0.3
|
||||
# via sphinx
|
||||
sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
sqlalchemy==1.4.47
|
||||
# via jupyter-cache
|
||||
stack-data==0.6.2
|
||||
# via ipython
|
||||
tabulate==0.9.0
|
||||
# via jupyter-cache
|
||||
tornado==6.2
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
traitlets==5.9.0
|
||||
# via
|
||||
# comm
|
||||
# ipykernel
|
||||
# ipython
|
||||
# jupyter-client
|
||||
# jupyter-core
|
||||
# matplotlib-inline
|
||||
# nbclient
|
||||
# nbformat
|
||||
typing-extensions==4.5.0
|
||||
# via
|
||||
# myst-nb
|
||||
# myst-parser
|
||||
# pydata-sphinx-theme
|
||||
uc-micro-py==1.0.1
|
||||
# via linkify-it-py
|
||||
urllib3==1.26.15
|
||||
# via requests
|
||||
wcwidth==0.2.6
|
||||
# via prompt-toolkit
|
||||
wrapt==1.15.0
|
||||
# via deprecated
|
||||
zipp==3.15.0
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# setuptools
|
||||
@@ -0,0 +1,5 @@
|
||||
===============
|
||||
API Reference
|
||||
===============
|
||||
|
||||
.. doxygenindex::
|
||||
@@ -0,0 +1,14 @@
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
#
|
||||
# This file only contains a selection of the most common options. For a full
|
||||
# list see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
from rocm_docs import ROCmDocs
|
||||
|
||||
docs_core = ROCmDocs("ROCm Data Center Documentation")
|
||||
docs_core.run_doxygen()
|
||||
docs_core.setup()
|
||||
|
||||
for sphinx_var in ROCmDocs.SPHINX_VARS:
|
||||
globals()[sphinx_var] = getattr(docs_core, sphinx_var)
|
||||
|
После Ширина: | Высота: | Размер: 29 KiB |
|
После Ширина: | Высота: | Размер: 48 KiB |
|
После Ширина: | Высота: | Размер: 8.4 KiB |
|
После Ширина: | Высота: | Размер: 13 KiB |
|
После Ширина: | Высота: | Размер: 42 KiB |
|
После Ширина: | Высота: | Размер: 72 KiB |
|
После Ширина: | Высота: | Размер: 35 KiB |
|
После Ширина: | Высота: | Размер: 27 KiB |
|
После Ширина: | Высота: | Размер: 19 KiB |
|
После Ширина: | Высота: | Размер: 21 KiB |
|
После Ширина: | Высота: | Размер: 32 KiB |
|
После Ширина: | Высота: | Размер: 70 KiB |
|
После Ширина: | Высота: | Размер: 34 KiB |
@@ -0,0 +1,8 @@
|
||||
# ROCm Data Center Tool
|
||||
|
||||
The ROCm™ Data Center Tool simplifies the administration and addresses key infrastructure challenges in AMD GPUs in cluster and datacenter environments. The main features are:
|
||||
|
||||
- GPU telemetry
|
||||
- GPU statistics for jobs
|
||||
- Integration with third-party tools
|
||||
- Open source
|
||||
@@ -0,0 +1,86 @@
|
||||
# ROCm Data Center API
|
||||
|
||||
Disclaimer: This is the alpha version of RDC API™ and is subject to change without notice. The primary purpose of this API is to solicit feedback. AMD accepts no responsibility for any software breakage caused by API changes.
|
||||
|
||||
## RDC API
|
||||
|
||||
The RDC tool API is the core library that provides all the RDC features. This section focuses on how RDC API can be used by third-party software.
|
||||
|
||||
The RDC includes the following libraries:
|
||||
|
||||
• librdc_bootstrap.so: Detects the mode at runtime and loads one of two libraries (librdc_client.so or librdc.so) accordingly.
|
||||
|
||||
• librdc_client.so: Exposes RDC functionality using gRPC client.
|
||||
|
||||
• librdc.so: RDC API. This depends on librocm_smi.so.
|
||||
|
||||
• librocm_smi.so: Stateless low overhead access to GPU data.
|
||||
|
||||

|
||||
|
||||
Different libraries and how they are linked.
|
||||
|
||||
Note that librdc_bootstrap.so loads different libraries based on the modes.
|
||||
|
||||
Example:
|
||||
|
||||
• rdci: librdc_bootstrap.so loads librdc_client.so
|
||||
|
||||
• rdcd: librdc_bootstrap.so loads librdc.so
|
||||
|
||||
For more information, see the ROCm Data Center Tool API Guide at https://docs.amd.com.
|
||||
|
||||
## Job Stats Use Case
|
||||
|
||||
The following pseudocode shows how RDC tool API can be directly used to record GPU statistics associated with any job or workload. Refer to the example code provided with RDC on how to build it.
|
||||
|
||||
For more information, see the [Job Stats section in Features](features.md).
|
||||
|
||||
```
|
||||
//Initialize the RDC
|
||||
rdc_handle_t rdc_handle;
|
||||
rdc_status_t result=rdc_init(0);
|
||||
|
||||
//Dynamically choose to run in standalone or embedded mode
|
||||
bool standalone = false;
|
||||
std::cin>> standalone;
|
||||
if (standalone)
|
||||
result = rdc_connect("127.0.0.1:50051", &rdc_handle, nullptr, nullptr, nullptr); //It will connect to the daemon
|
||||
else
|
||||
result = rdc_start_embedded(RDC_OPERATION_MODE_MANUAL, &rdc_handle); //call library directly, here we run embedded in manual mode
|
||||
|
||||
//Now we can use the same API for both standalone and embedded
|
||||
//(1) create group
|
||||
rdc_gpu_group_t groupId;
|
||||
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, "MyGroup1", &groupId);
|
||||
|
||||
//(2) Add the GPUs to the group
|
||||
result = rdc_group_gpu_add(rdc_handle, groupId, 0); //Add GPU 0
|
||||
result = rdc_group_gpu_add(rdc_handle, groupId, 1); //Add GPU 1
|
||||
|
||||
//(3) Start recording the Slurm job 123. Set the sample frequency to once per second
|
||||
result = rdc_job_start_stats(rdc_handle, groupId,
|
||||
"123", 1000000);
|
||||
|
||||
//For standalone mode, the daemon will update and cache the samples
|
||||
//In manual mode, we must call rdc_field_update_all periodically to take samples
|
||||
if (!standalone) { //embedded manual mode
|
||||
for (int i=5; i>0; i--) { //As an example, we will take 5 samples
|
||||
result = rdc_field_update_all(rdc_handle, 0);
|
||||
usleep(1000000);
|
||||
}
|
||||
} else { //standalone mode, do nothing
|
||||
usleep(5000000); //sleep 5 seconds before fetching the stats
|
||||
}
|
||||
|
||||
//(4) stop the Slurm job 123, which will stop the watch
|
||||
// Note: we do not have to stop the job to get stats. The rdc_job_get_stats can be called at any time before stop
|
||||
result = rdc_job_stop_stats(rdc_handle, "123");
|
||||
|
||||
//(5) Get the stats
|
||||
rdc_job_info_t job_info;
|
||||
result = rdc_job_get_stats(rdc_handle, "123", &job_info);
|
||||
std::cout<<"Average Memory Utilization: " <<job_info.summary.memoryUtilization.average <<std::endl;
|
||||
|
||||
//The cleanup and shutdown ....
|
||||
```
|
||||
@@ -0,0 +1,213 @@
|
||||
# Data Center Tool: Feature Overview
|
||||
|
||||
Note that RDC Tool is in active development. This section highlights the current feature set.
|
||||
|
||||

|
||||
|
||||
RDC components and framework for describing features
|
||||
|
||||
## Discovery
|
||||
|
||||
The Discovery feature enables you to locate and display information of GPUs present in the compute node.
|
||||
|
||||
Example:
|
||||
|
||||
```
|
||||
$ rdci discovery <host_name> -l
|
||||
2 GPUs found
|
||||
```
|
||||
|
||||
| GPU Index | Device Information |
|
||||
| --------- | ------------------------------------------- |
|
||||
| 0 | Name: AMD Radeon Instinct™ MI50 Accelerator |
|
||||
| 1 | Name: AMD Radeon Instinct™ MI50 Accelerator |
|
||||
|
||||
```
|
||||
$ rdci -l : list available GPUs
|
||||
$ rdci -u: No SSL authentication
|
||||
```
|
||||
|
||||
## Groups
|
||||
|
||||
This section explains the GPU and field groups features.
|
||||
|
||||
### GPU Groups
|
||||
|
||||
With the GPU groups feature, you can create, delete, and list logical groups of GPUs.
|
||||
|
||||
```
|
||||
$ rdci group -c GPU_GROUP
|
||||
Successfully created a group with a group ID 1
|
||||
|
||||
$ rdci group -g 1 -a 0,1
|
||||
Successfully added the GPU 0,1 to group 1
|
||||
|
||||
$ rdci group -l
|
||||
|
||||
1 group found
|
||||
```
|
||||
|
||||
| Group ID | Group Name | GPU Index |
|
||||
| -------- | ------------ | --------- |
|
||||
| 1 | GPU_GROUP | 0, 1 |
|
||||
|
||||
```
|
||||
$ rdci group -d 1
|
||||
Successfully removed group 1
|
||||
|
||||
-c create; -g group id; -a add GPU index; -l list; -d delete group
|
||||
```
|
||||
|
||||
### Field Groups
|
||||
|
||||
The Field Groups feature provides you the options to create, delete, and list field groups.
|
||||
|
||||
```
|
||||
$ rdci fieldgroup -c <fgroup> -f 150,155
|
||||
Successfully created a field group with a group ID 1
|
||||
|
||||
$ rdci fieldgroup -l
|
||||
|
||||
1 group found
|
||||
```
|
||||
|
||||
| Group ID | Group Name | Field Ids |
|
||||
| -------- | ------------ | --------- |
|
||||
| 1 | Fgroup | 150, 155 |
|
||||
|
||||
```
|
||||
$ rdci fieldgroup -d 1
|
||||
Successfully removed field group 1
|
||||
|
||||
rdci dmon -l
|
||||
Supported fields Ids:
|
||||
100 RDC_FI_GPU_CLOCK: Current GPU clock freq.
|
||||
150 RDC_FI_GPU_TEMP: GPU temp. in milli Celsius.
|
||||
155 RDC_FI_POWER_USAGE: Power usage in microwatts.
|
||||
203 RDC_FI_GPU_UTIL: GPU busy percentage.
|
||||
525 RDC_FI_GPU_MEMORY_USAGE: VRAM Memory usage in bytes
|
||||
|
||||
-c create; -g group id; -a add GPU index; -l list; -d delete group
|
||||
```
|
||||
|
||||
### Monitor Errors
|
||||
|
||||
You can define RDC_FI_ECC_CORRECT_TOTAL or RDC_FI_ECC_UNCORRECT_TOTAL field to get the RAS Error-Correcting Code (ECC) counter:
|
||||
|
||||
• 312 RDC_FI_ECC_CORRECT_TOTAL: Accumulated correctable ECC errors
|
||||
|
||||
• 313 RDC_FI_ECC_UNCORRECT_TOTAL: Accumulated uncorrectable ECC errors
|
||||
|
||||
## Device Monitoring
|
||||
|
||||
The RDC Tool enables you to monitor the GPU fields.
|
||||
|
||||
```
|
||||
$ rdci dmon -f <field_group> -g <gpu_group> -c 5 -d 1000
|
||||
|
||||
|
||||
1 group found
|
||||
```
|
||||
|
||||
| GPU Index | TEMP (m°C) | POWER (µW) |
|
||||
| --------- | ------------ | ---------- |
|
||||
| 0 | 25000 | 520500 |
|
||||
|
||||
```
|
||||
rdci dmon -l
|
||||
Supported fields Ids:
|
||||
100 RDC_FI_GPU_CLOCK: Current GPU clock freq.
|
||||
150 RDC_FI_GPU_TEMP: GPU temp. in milli Celsius.
|
||||
155 RDC_FI_POWER_USAGE: Power usage in microwatts.
|
||||
203 RDC_FI_GPU_UTIL: GPU busy percentage.
|
||||
525 RDC_FI_GPU_MEMORY_USAGE: VRAM Memory usage in bytes
|
||||
|
||||
-e field ids; -i GPU index; -c count; -d delay; -l list; -f fieldgroup id
|
||||
```
|
||||
|
||||
## Job Stats
|
||||
|
||||
You can display GPU statistics for any given workload.
|
||||
|
||||
```
|
||||
$ rdci stats -s 2 -g 1
|
||||
Successfully started recording job 2 with a group ID 1
|
||||
|
||||
$ rdci stats -j 2
|
||||
```
|
||||
|
||||
| Summary | Executive Status |
|
||||
| --------------------------------- | ---------------------------- |
|
||||
| Start time | 1586795401 |
|
||||
| End time | 1586795445 |
|
||||
| Total execution time | 44 |
|
||||
| --------------------------------- | ---------------------------- |
|
||||
| Energy Consumed (Joules) | 21682 |
|
||||
| Power Usage (Watts) | Max: 49 Min: 13 Avg: 34 |
|
||||
| GPU Clock (MHz) | Max: 1000 Min: 300 Avg: 903 |
|
||||
| GPU Utilization (%) | Max: 69 Min: 0 Avg: 2 |
|
||||
| Max GPU Memory Used (bytes) | 524320768 |
|
||||
| Memory Utilization (%) | Max: 12 Min: 11 Avg: 12 |
|
||||
|
||||
```
|
||||
$ rdci stats -x 2
|
||||
Successfully stopped recording job 2
|
||||
|
||||
-s start recording on job id; -g group id; -j display job stats; -x stop recording.
|
||||
```
|
||||
|
||||
## Job Stats Use Case
|
||||
|
||||
A common use case is to record GPU statistics associated with any job or workload. The following example shows how all these features can be put together for this use case:
|
||||
|
||||

|
||||
|
||||
An example showing how job statistics can be recorded
|
||||
|
||||
rdci commands
|
||||
|
||||
```
|
||||
$ rdci group -c group1
|
||||
|
||||
successfully created a group with a group ID 1
|
||||
|
||||
$ rdci group -g 1 -a 0,1
|
||||
|
||||
GPU 0,1 is added to group 1 successfully.
|
||||
|
||||
rdci stats -s 123 -g 1
|
||||
|
||||
job 123 recorded successfully with the group ID
|
||||
|
||||
rdci stats -x 123
|
||||
|
||||
job 123 stops recording successfully
|
||||
|
||||
rdci stats -j 123
|
||||
|
||||
job stats printed
|
||||
```
|
||||
|
||||
## Error-Correcting Code Output
|
||||
|
||||
In the job output, this feature prints out the Error-Correcting Code (ECC) errors while running the job.
|
||||
|
||||
## Diagnostic
|
||||
|
||||
You can run diagnostic on a GPU group as shown below:
|
||||
|
||||
```
|
||||
$ rdci diag -g <gpu_group>
|
||||
|
||||
No compute process: Pass
|
||||
Node topology check: Pass
|
||||
GPU parameters check: Pass
|
||||
Compute Queue ready: Pass
|
||||
System memory check: Pass
|
||||
=============== Diagnostic Details ==================
|
||||
No compute process: No processes running on any devices.
|
||||
Node topology check: No link detected.
|
||||
GPU parameters check: GPU 0 Critical Edge temperature in range.
|
||||
Compute Queue ready: Run binary search task on GPU 0 Pass.
|
||||
System memory check: Max Single Allocation Memory Test for GPU 0 Pass. CPUAccessToGPUMemoryTest for GPU 0 Pass. GPUAccessToCPUMemoryTest for GPU 0 Pass.
|
||||
```
|
||||
@@ -0,0 +1,159 @@
|
||||
# Data Center Tool: Developer Handbook
|
||||
|
||||
The RDC tool is open source and available under the MIT License. This section is helpful for open source developers. Third-party integrators may also find this information useful.
|
||||
|
||||
## Prerequisites for Building RDC
|
||||
|
||||
NOTE: The RDC tool is tested on the following software versions. Earlier versions may not work.
|
||||
|
||||
• CMake 3.15
|
||||
|
||||
• g++ (5.4.0)
|
||||
|
||||
• AMD ROCm, which includes AMD ROCm SMI Library
|
||||
|
||||
• gRPC and protoc
|
||||
|
||||
The following components are required to build the latest documentation:
|
||||
|
||||
• Doxygen (1.8.11)
|
||||
|
||||
• LaTeX (pdfTeX 3.14159265-2.6-1.40.16)
|
||||
|
||||
```
|
||||
$ sudo apt install libcap-dev
|
||||
$ sudo apt install -y doxygen
|
||||
```
|
||||
|
||||
## Build and Install RDC
|
||||
|
||||
To build and install, clone the RDC source code from GitHub and use CMake.
|
||||
|
||||
```
|
||||
$ git clone <GitHub for RDC>
|
||||
$ cd rdc
|
||||
$ mkdir -p build; cd build
|
||||
$ cmake -DROCM_DIR=/opt/rocm -DGRPC_ROOT="$GRPC_PROTOC_ROOT" ..
|
||||
$ make
|
||||
#Install library file and header and the default location is /opt/rocm
|
||||
$ make install
|
||||
```
|
||||
|
||||
## Build Documentation
|
||||
|
||||
You can generate PDF documentation after a successful build. The reference manual, refman.pdf, appears in the latex directory.
|
||||
|
||||
```
|
||||
$ make doc
|
||||
$ cd latex
|
||||
$ make
|
||||
```
|
||||
|
||||
## Build Unit Tests for RDC Tool
|
||||
|
||||
```
|
||||
$ cd rdc/tests/rdc_tests
|
||||
$ mkdir -p build; cd build
|
||||
$ cmake -DROCM_DIR=/opt/rocm -DGRPC_ROOT="$GRPC_PROTOC_ROOT" ..
|
||||
$ make
|
||||
|
||||
# To run the tests
|
||||
|
||||
$ cd build/rdctst_tests
|
||||
$ ./rdctst
|
||||
```
|
||||
|
||||
## Test
|
||||
|
||||
```
|
||||
# Run rdcd daemon
|
||||
$ LD_LIBRARY_PATH=$PWD/rdc_libs/ ./server/rdcd -u
|
||||
|
||||
# In another console run the RDC command-line
|
||||
$ LD_LIBRARY_PATH=$PWD/rdc_libs/ ./rdci/rdci discovery -l -u
|
||||
```
|
||||
|
||||
## Authentication
|
||||
|
||||
The RDC tool supports encrypted communications between clients and servers.
|
||||
|
||||
### Generate Files for Authentication
|
||||
|
||||
The communication between the client and server can be configured to be authenticated or unauthenticated. By default, authentication is enabled.
|
||||
|
||||
To disable authentication, when starting the server, use the "--unauth_comm" flag (or "-u" for short). You must also use "-u" in rdci to access an rdcd instance that is running without authentication. The /lib/systemd/system/rdc.service file can be edited to pass arguments to rdcd on starting. On the client side, when calling rdc_channel_create(), the "secure" argument must be set to False.
|
||||
|
||||
### Scripts
|
||||
|
||||
RDC users manage their own keys and certificates. However, the authentication directory of the RDC source tree contains scripts that generate self-signed certificates for test purposes. The following flowchart depicts how to generate the root certificates using the openssl command in 01gen_root_cert.sh:
|
||||
|
||||
A picture containing sign, drawing Description automatically generated
|
||||
|
||||

|
||||
|
||||
Generation of root certificates using openssl command
|
||||
|
||||
The section where the default responses to openssl questions can be specified is included in openssl.conf. To locate the section, look for the following comment line:
|
||||
|
||||
```
|
||||
# < ** REPLACE VALUES IN THIS SECTION WITH APPROPRIATE VALUES FOR YOUR ORG. **>
|
||||
```
|
||||
|
||||
It is helpful to modify this section with values appropriate for your organization if you expect to call this script many times. Additionally, you must replace the dummy values and update the alt_names section for your environment.
|
||||
|
||||
To generate the keys and certificates using these scripts, make the following calls:
|
||||
|
||||
```
|
||||
$ 01gen_root_cert.sh
|
||||
# provide answers to posed questions
|
||||
$ 02gen_ssl_artifacts.sh
|
||||
# provide answers to posed questions
|
||||
```
|
||||
|
||||
At this point, the keys and certificates are in the newly created "CA/artifacts" directory. You must delete this directory if you need to rerun the scripts.
|
||||
|
||||
To install the keys and certificates, access the artifacts directory and run the install.sh script as root, specifying the install location. By default, RDC expects this to be in /etc/rdc:
|
||||
|
||||
```
|
||||
$ cd CA/artifacts
|
||||
$ sudo install_<client|server>.sh /etc/rdc
|
||||
```
|
||||
|
||||
These files must be copied to and installed on all client and server machines that are expected to communicate with one another.
|
||||
|
||||
### Known Limitation
|
||||
|
||||
The RDC tool has the following authentication limitation:
|
||||
|
||||
The client and server are hardcoded to look for the openssl certificate and key files in /etc/rdc. There is no workaround available currently.
|
||||
### Verify Files for Authentication
|
||||
|
||||
Several SSL keys and certificates must be generated and installed on clients and servers for authentication to work properly. By default, the RDC server will look in the /etc/rdc folder for the following keys and certificates:
|
||||
|
||||
### Client
|
||||
|
||||
```
|
||||
$ sudo tree /etc/rdc
|
||||
/etc/rdc
|
||||
|-- client
|
||||
|-- certs
|
||||
| |-- rdc_cacert.pem
|
||||
| |-- rdc_client_cert.pem
|
||||
|-- private
|
||||
|-- rdc_client_cert.key
|
||||
```
|
||||
|
||||
NOTE: Machines that act as both client and server contain both directory structures.
|
||||
|
||||
### Server
|
||||
|
||||
```
|
||||
$ sudo tree /etc/rdc
|
||||
/etc/rdc
|
||||
|-- server
|
||||
|-- certs
|
||||
| |-- rdc_cacert.pem
|
||||
| |-- rdc_server_cert.pem
|
||||
|-- private
|
||||
|-- rdc_server_cert.key
|
||||
```
|
||||
@@ -0,0 +1,161 @@
|
||||
# Data Center Tool: Installation and Integration
|
||||
|
||||
## Supported Platforms
|
||||
|
||||
The RDC tool is part of the AMD ROCm software and available on the distributions supported by AMD ROCm.
|
||||
|
||||
To see the list of supported operating systems, refer to the ROCm installation guide at https://docs.amd.com.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
For RDC installation from prebuilt packages, follow the instructions in this section.
|
||||
|
||||
The list of dependencies can be found on the [README.md on GitHub](https://github.com/RadeonOpenCompute/rdc#dependencies).
|
||||
|
||||
## Install gRPC
|
||||
|
||||
To see the instructions for building gRPC and protoc, refer to the [README.md on GitHub](https://github.com/RadeonOpenCompute/rdc#building-grpc-and-protoc).
|
||||
|
||||
### Authentication Keys
|
||||
|
||||
The RDC tool can be used with or without authentication. If authentication is required, you must configure proper authentication keys.
|
||||
|
||||
For configuring SSL keys, refer to the section on Authentication below.
|
||||
|
||||
## Prebuilt Packages
|
||||
|
||||
The RDC tool is packaged as part of the ROCm software repository. You must install the AMD ROCm software before installing RDC. For details on ROCm installation, see the AMD ROCm Installation Guide.
|
||||
|
||||
To install RDC after installing the ROCm package, follow the instructions below.
|
||||
|
||||
### Ubuntu
|
||||
|
||||
```
|
||||
$ sudo apt-get install rdc
|
||||
# to install a specific version
|
||||
$ sudo apt-get install rdc<x.y.z>
|
||||
SLES 15 Service Pack 3
|
||||
$ sudo zypper install rdc
|
||||
# to install a specific version
|
||||
$ sudo zypper install rdc<x.y.z>
|
||||
```
|
||||
|
||||
### SLES 15 Service Pack 3
|
||||
|
||||
```
|
||||
$ sudo zypper install rdc
|
||||
# to install a specific version
|
||||
$ sudo zypper install rdc<x.y.z>
|
||||
```
|
||||
|
||||
## Components
|
||||
|
||||
The components of RDC tool are as shown below:
|
||||
|
||||

|
||||
|
||||
High-level diagram of RDC components.
|
||||
|
||||
RDC (API) Library
|
||||
|
||||
This library is the central piece, which interacts with different modules and provides all the features described. This shared library provides C API and Python bindings so that third-party tools can use it directly if required.
|
||||
|
||||
RDC Daemon (rdcd)
|
||||
|
||||
The daemon records telemetry information from GPUs. It also provides an interface to RDC command-line tool (rdci) running locally or remotely. It relies on the above RDC Library for all the core features.
|
||||
|
||||
RDC Command Line Tool (rdci)
|
||||
|
||||
A command-line tool to invoke all the features of the RDC tool. This CLI can be run locally or remotely.
|
||||
|
||||
ROCm-SMI Library
|
||||
|
||||
A stateless system management library that provides low-level interfaces to access GPU information.
|
||||
|
||||
## Start RDC
|
||||
|
||||
The RDC tool can be run in the following two modes. The feature set is similar in both the cases. Users have the flexibility to choose the right option that best fits their environment.
|
||||
|
||||
• Standalone mode
|
||||
|
||||
• Embedded mode
|
||||
|
||||
The capability in each mode depends on the privileges the user has for starting RDC. A normal user has access only to monitor (access to GPU telemetry) capabilities. A privileged user can run the tool with full capability. In the full capability mode, GPU configuration features can be invoked. This may or may not affect all the users and processes sharing the GPU.
|
||||
|
||||
### Standalone Mode
|
||||
|
||||
This is the preferred mode of operation, as it does not have any external dependencies. To start RDC in standalone mode, RDC Server Daemon (rdcd) must run on each compute node. You can start RDC daemon (rdcd) as a systemd service or directly from the command-line.
|
||||
|
||||
#### Start RDC Tool Using systemd
|
||||
|
||||
If multiple RDC versions are installed, copy `/opt/rocm-<x.y.z>/rdc/lib/rdc.service`, which is installed with the desired RDC version, to the systemd folder. The capability of RDC can be configured by modifying the rdc.service system configuration file. Use the systemctl command to start rdcd.
|
||||
|
||||
```
|
||||
$ systemctl start rdc
|
||||
```
|
||||
|
||||
By default, rdcd starts with full capability. To change to monitor only, comment out the following two lines:
|
||||
|
||||
```
|
||||
$ sudo vi /lib/systemd/system/rdc.service
|
||||
# CapabilityBoundingSet=CAP_DAC_OVERRIDE
|
||||
# AmbientCapabilities=CAP_DAC_OVERRIDE
|
||||
```
|
||||
|
||||
NOTE: rdcd can be started by using the systemctl command.
|
||||
|
||||
```
|
||||
$ systemctl start rdc
|
||||
```
|
||||
|
||||
If the GPU reset fails, restart the server. Note that restarting the server also initiates rdcd. Users may then encounter the following two scenarios:
|
||||
|
||||
• rdcd returns the correct GPU information to rdci.
|
||||
|
||||
• rdcd returns the "No GPUs found on the system" error to rdci. To resolve this error, restart rdcd with the following instruction:
|
||||
|
||||
```
|
||||
sudo systemctl restart rdc
|
||||
```
|
||||
|
||||
#### Start RDC Tool from Command-line
|
||||
|
||||
While systemctl is the preferred way to start rdcd, you can also start directly from the command-line. The installation scripts create a default user - “rdc”. Users have the option to edit the profile file (rdc.service installed at /lib/systemd/system) and change these lines accordingly:
|
||||
|
||||
```
|
||||
[Service]
|
||||
User=rdc
|
||||
Group=rdc
|
||||
```
|
||||
|
||||
```
|
||||
#Start as user rdc
|
||||
$ sudo -u rdc rdcd
|
||||
|
||||
# Start as root
|
||||
$ sudo rdcd
|
||||
```
|
||||
|
||||
From the command-line, start rdcd as a user (for example, rdc) or root.
|
||||
|
||||
Note that in this use case, the rdc.service file mentioned in the previous section is not involved. Here, the capability of RDC is determined by the privilege of the user starting rdcd. If rdcd is running under a normal user account, it has the Monitor-only capability. If rdcd is running as root, rdcd has full capability.
|
||||
|
||||
NOTE: If a user other than rdc or root starts the rdcd daemon, the file ownership of the SSL keys mentioned in the Authentication section must be modified to allow read and write access.
|
||||
|
||||
### Troubleshoot rdcd
|
||||
|
||||
When rdcd is started using systemctl, the logs can be viewed using the following command:
|
||||
|
||||
```
|
||||
$ journalctl -u rdc
|
||||
```
|
||||
|
||||
These messages provide useful status and debugging information. The logs can also help debug problems like rdcd failing to start, communication issues with a client, and others.
|
||||
|
||||
### Embedded Mode
|
||||
|
||||
The embedded mode is useful if the end user has a monitoring agent running on the compute node. The monitoring agent can directly use the RDC library and will have a finer-grain control on how and when RDC features are invoked. For example, if the monitoring agent has a facility to synchronize across multiple nodes, it can synchronize GPU telemetry across these nodes.
|
||||
|
||||
The RDC daemon rdcd can be used as a reference code for this purpose. The dependency on gRPC is also eliminated if the RDC library is directly used.
|
||||
|
||||
CAUTION: RDC command-line rdci will not function in this mode. Third-party monitoring software is responsible for providing the user interface and remote access/monitoring.
|
||||
@@ -0,0 +1,528 @@
|
||||
# Data Center Tool: Third-Party Integration
|
||||
|
||||
This section describes the third-party integrations, such as the Prometheus, Grafana, and Reliability, Availability, and Serviceability (RAS) plugins.
|
||||
|
||||
## Python Bindings
|
||||
|
||||
The RDC Tool provides a generic Python class RdcReader to simplify telemetry gathering. RdcReader simplifies usage by providing the following functionalities:
|
||||
|
||||
• The user only needs to specify telemetry fields. RdcReader creates the necessary groups and fieldgroups, watches the fields, and fetches the fields.
|
||||
|
||||
• The RdcReader can support embedded and standalone mode. The standalone mode can be with or without authentication.
|
||||
|
||||
• In standalone mode, the RdcReader can automatically reconnect to rdcd if the connection is lost.
|
||||
|
||||
• When rdcd is restarted, the previously created group and fieldgroup may be lost. The RdcReader can re-create them and watch the fields after reconnecting.
|
||||
|
||||
• If the client is restarted, RdcReader can detect the groups and fieldgroups created before and avoid re-creating them.
|
||||
|
||||
• A custom unit converter can be passed to RdcReader to override the default RDC unit.
|
||||
|
||||
See the sample program to monitor the power and GPU utilization using the RdcReader below:
|
||||
|
||||
```
|
||||
import time
|
||||
from RdcReader import RdcReader
|
||||
from RdcUtil import RdcUtil
|
||||
from rdc_bootstrap import *
|
||||
|
||||
default_field_ids = [
|
||||
rdc_field_t.RDC_FI_POWER_USAGE,
|
||||
rdc_field_t.RDC_FI_GPU_UTIL
|
||||
]
|
||||
|
||||
class SimpleRdcReader(RdcReader):
|
||||
def __init__(self):
|
||||
RdcReader.__init__(self,ip_port=None, field_ids = default_field_ids, update_freq=1000000)
|
||||
def handle_field(self, gpu_index, value):
|
||||
field_name = self.rdc_util.field_id_string(value.field_id).lower()
|
||||
print("%d %d:%s %d" % (value.ts, gpu_index, field_name, value.value.l_int))
|
||||
|
||||
if __name__ == '__main__':
|
||||
reader = SimpleRdcReader()
|
||||
while True:
|
||||
time.sleep(1)
|
||||
reader.process()
|
||||
```
|
||||
|
||||
In the sample program,
|
||||
|
||||
• Class SimpleRdcReader is derived from the RdcReader.
|
||||
|
||||
• The field "ip_port=None" in RdcReader dictates that the RDC tool runs in the embedded mode.
|
||||
|
||||
• SimpleRdcReader::process(), then, fetches fields specified in default_field_ids. RdcReader.py can be found in the python_binding folder located at RDC install path.
|
||||
|
||||
To run the example, use:
|
||||
|
||||
```
|
||||
# Ensure that RDC shared libraries are in the library path and
|
||||
# RdcReader.py is in PYTHONPATH
|
||||
|
||||
$ python SimpleReader.py
|
||||
```
|
||||
|
||||
## Prometheus Plugin
|
||||
|
||||
Prometheus plugin helps to monitor events and send alerts. The Prometheus installation and integration details are given below.
|
||||
|
||||
### Prometheus Plugin Installation
|
||||
|
||||
The RDC tool’s Prometheus plugin rdc_prometheus.py can be found in the python_binding folder.
|
||||
|
||||
NOTE: Ensure the Prometheus client is installed before the Prometheus plugin installation process.
|
||||
|
||||
```
|
||||
$ pip install prometheus_client
|
||||
```
|
||||
|
||||
To view the options provided with the plugin, use --help.
|
||||
|
||||
```
|
||||
% python rdc_prometheus.py --help
|
||||
usage: rdc_prometheus.py [-h] [--listen_port LISTEN_PORT] [--rdc_embedded]
|
||||
[--rdc_ip_port RDC_IP_PORT] [--rdc_unauth]
|
||||
[--rdc_update_freq RDC_UPDATE_FREQ]
|
||||
[--rdc_max_keep_age RDC_MAX_KEEP_AGE]
|
||||
[--rdc_max_keep_samples RDC_MAX_KEEP_SAMPLES]
|
||||
[--rdc_fields RDC_FIELDS [RDC_FIELDS ...]]
|
||||
[--rdc_fields_file RDC_FIELDS_FILE]
|
||||
[--rdc_gpu_indexes RDC_GPU_INDEXES [RDC_GPU_INDEXES ...]]
|
||||
[--enable_plugin_monitoring]
|
||||
|
||||
RDC Prometheus plugin.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--listen_port LISTEN_PORT
|
||||
The listen port of the plugin (default: 5000)
|
||||
--rdc_embedded Run RDC in embedded mode (default: standalone mode)
|
||||
--rdc_ip_port RDC_IP_PORT
|
||||
The rdcd IP and port in standalone mode (default:
|
||||
localhost:50051)
|
||||
--rdc_unauth Set this option if the rdcd is running with unauth in
|
||||
standalone mode (default: false)
|
||||
--rdc_update_freq RDC_UPDATE_FREQ
|
||||
The fields update frequency in seconds (default: 10))
|
||||
--rdc_max_keep_age RDC_MAX_KEEP_AGE
|
||||
The max keep age of the fields in seconds (default:
|
||||
3600)
|
||||
--rdc_max_keep_samples RDC_MAX_KEEP_SAMPLES
|
||||
The max samples to keep for each field in the cache
|
||||
(default: 1000)
|
||||
--rdc_fields RDC_FIELDS [RDC_FIELDS ...]
|
||||
The list of fields name needs to be watched, for
|
||||
example, " --rdc_fields RDC_FI_GPU_TEMP
|
||||
RDC_FI_POWER_USAGE " (default: fields in the
|
||||
plugin)
|
||||
--rdc_fields_file RDC_FIELDS_FILE
|
||||
The list of fields name can also be read from a file
|
||||
with each field name in a separated line (default:
|
||||
None)
|
||||
--rdc_gpu_indexes RDC_GPU_INDEXES [RDC_GPU_INDEXES ...]
|
||||
The list of GPUs to be watched (default: All GPUs)
|
||||
--enable_plugin_monitoring
|
||||
Set this option to collect process metrics of
|
||||
the plugin itself (default: false)
|
||||
```
|
||||
|
||||
By default, the plugin runs in the standalone mode and connects to rdcd at localhost:50051 to fetch fields. The plugin should use the same authentication mode as rdcd, e.g., if rdcd is running with the -u/--unauth_comm flag, the plugin should use the --rdc_unauth flag. You can use the plugin in the embedded mode without rdcd by setting the --rdc_embedded flag.
|
||||
|
||||
To override the default fields that are monitored, you can use the --rdc_fields option to specify the list of fields. If the fields list is long, the --rdc_fields_file option provides a convenient way to fetch the fields list from a file. You can use the --rdc_max_keep_age and --rdc_max_keep_samples options to control how the fields are cached.
|
||||
|
||||
The plugin can provide the metrics of the plugin itself, including the plugin process CPU, memory, file descriptor usage, and native threads count, as well as the process start and up times. You can enable this using --enable_plugin_monitoring.
|
||||
|
||||
You can test the plugin with the default settings.
|
||||
|
||||
```
|
||||
# Ensure that rdcd is running on the same machine
|
||||
$ python rdc_prometheus.py
|
||||
|
||||
# Check the plugin using curl
|
||||
$ curl localhost:5000
|
||||
# HELP gpu_util gpu_util
|
||||
# TYPE gpu_util gauge
|
||||
gpu_util{gpu_index="0"} 0.0
|
||||
# HELP gpu_clock gpu_clock
|
||||
# TYPE gpu_clock gauge
|
||||
gpu_clock{gpu_index="0"} 300.0
|
||||
# HELP gpu_memory_total gpu_memory_total
|
||||
# TYPE gpu_memory_total gauge
|
||||
gpu_memory_total{gpu_index="0"} 4294.0
|
||||
# HELP gpu_temp gpu_temp
|
||||
# TYPE gpu_temp gauge
|
||||
# HELP power_usage power_usage
|
||||
# TYPE power_usage gauge
|
||||
power_usage{gpu_index="0"} 9.0
|
||||
# HELP gpu_memory_usage gpu_memory_usage
|
||||
# TYPE gpu_memory_usage gauge
|
||||
gpu_memory_usage{gpu_index="0"} 134.0
|
||||
```
|
||||
|
||||
## Prometheus Integration
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. [Download and install Prometheus](https://github.com/prometheus/prometheus) in the management machine.
|
||||
|
||||
2. Use the example configuration file rdc_prometheus_example.yml in the python_binding folder. You can use this file in its original state. However, note that this file refers to prometheus_targets.json. Ensure that this is modified to point to the correct compute nodes.
|
||||
|
||||
```
|
||||
// Sample file: prometheus_targets.json
|
||||
// Replace rdc_test*.amd.com to point to the correct compute nodes
|
||||
// Add as many compute nodes as necessary
|
||||
[
|
||||
{
|
||||
"targets": [
|
||||
"rdc_test1.amd.com:5000",
|
||||
"rdc_test2.amd.com:5000"
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
NOTE: In the above example, there are two compute nodes, rdc_test1.amd.com and rdc_test2.amd.com. Ensure that the Prometheus plugin is running on those compute nodes.
|
||||
|
||||
3. Start Prometheus.
|
||||
|
||||
```
|
||||
% prometheus --config.file=<full path of the rdc_prometheus_example.yml>
|
||||
```
|
||||
|
||||
4. From the management node, using a browser, open the URL http://localhost:9090.
|
||||
|
||||
5. Select one of the available metrics.
|
||||
|
||||
Example: gpu_clock
|
||||
|
||||

|
||||
|
||||
The Prometheus image showing the GPU clock for both rdc_test1 and rdc_test2.
|
||||
|
||||
## Grafana Plugin
|
||||
|
||||
Grafana is a common monitoring stack used for storing and visualizing time series data. Prometheus acts as the storage backend, and Grafana is used as the interface for analysis and visualization. Grafana has a plethora of visualization options and can be integrated with Prometheus for the RDC tool’s dashboard.
|
||||
|
||||
### Grafana Plugin Installation
|
||||
|
||||
To install the Grafana plugin, follow these steps:
|
||||
|
||||
1. [Download Grafana](https://grafana.com/grafana/download)
|
||||
|
||||
2. Read the [installation instructions](https://grafana.com/docs/grafana/latest/setup-grafana/installation/debian/) to install Grafana
|
||||
|
||||
3. To start Grafana, follow these instructions:
|
||||
|
||||
```
|
||||
sudo systemctl start grafana-server
|
||||
sudo systemctl status grafana-server
|
||||
```
|
||||
|
||||
4. Browse to http://localhost:3000/.
|
||||
|
||||
5. Log in using the default username and password (admin/admin) as shown in the image below:
|
||||
|
||||

|
||||
|
||||
### Grafana Integration
|
||||
|
||||
As a prerequisite, ensure:
|
||||
|
||||
• The RDC Prometheus plugin is running in each compute node.
|
||||
|
||||
• Prometheus is set up to collect metrics from the plugin.
|
||||
|
||||
For more information about installing and configuring Prometheus, see the section on [Prometheus Plugin](https://docs.amd.com/bundle/ROCm-DataCenter-Tool-User-Guide-v5.3/page/Data_Center_Tool_Third-Party_Integration.html#_Prometheus_Plugin).
|
||||
|
||||
### Grafana Configuration
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Click Configuration.
|
||||
|
||||

|
||||
|
||||
2. Select Data Sources, as shown in the image below:
|
||||
|
||||

|
||||
|
||||
3. Click Add data source.
|
||||
|
||||

|
||||
|
||||
4. Select Prometheus.
|
||||
|
||||

|
||||
|
||||
NOTE: Ensure the name of the data source is "Prometheus". If Prometheus and Grafana are running on the same machine, use the default URL http://localhost:9090. Otherwise, ensure the URL matches the Prometheus URL, then save and test it.
|
||||
|
||||

|
||||
|
||||
5. To import the RDC tool dashboard, click “+” and select Import.
|
||||
|
||||
6. Click Upload .json file.
|
||||
|
||||
7. Choose rdc_grafana_dashboard_example.json, which is in the python_binding folder.
|
||||
|
||||
8. Import the rdc_grafana_dashboard_example.json file, and select the desired compute node on the dashboard, as shown in the image below:
|
||||
|
||||

|
||||
|
||||
## Prometheus (Grafana) Integration with Automatic Node Detection
|
||||
|
||||
The RDC tool enables you to use Consul to discover the rdc_prometheus service automatically. Consul is “a service mesh solution providing a fully featured control plane with service discovery, configuration, and segmentation functionality.” For more information, refer to [Consul](https://developer.hashicorp.com/consul/docs/intro).
|
||||
|
||||
The RDC tool uses Consul for health checks of RDC’s integration with the Prometheus plug-in (rdc_prometheus), and these checks provide information on its efficiency.
|
||||
|
||||
Previously, when a new compute node was added, users had to manually update prometheus_targets.json. Now, with the Consul agent integration, a new compute node can be discovered automatically.
|
||||
|
||||
### Installing the Consul Agent for Compute and Management Nodes
|
||||
|
||||
To install the latest Consul agent for compute and management nodes, follow the instructions below:
|
||||
|
||||
1. Set up the apt repository to download and install the Consul agent.
|
||||
|
||||
```
|
||||
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
|
||||
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
|
||||
sudo apt-get update && sudo apt-get install consul
|
||||
```
|
||||
|
||||
2. Generate a key to encrypt the communication between Consul agents. Note that you only need to generate the key once; both the compute and management nodes use the same key for communication.
|
||||
|
||||
```
|
||||
$ consul keygen
|
||||
```
|
||||
|
||||
For the purposes of this feature documentation, the following key is used in the configuration file:
|
||||
|
||||
```
|
||||
$ consul keygen
|
||||
4lgGQXr3/R2QeTi5vEp7q5Xs1KoYBhCsk9+VgJZZHAo=
|
||||
```
|
||||
|
||||
### Setting up the Consul Server in Management Nodes
|
||||
|
||||
While Consul can function with one server, it is recommended to use three to five servers to avoid failure scenarios, which often lead to data loss.
|
||||
|
||||
NOTE: For example purposes, the configuration settings documented below are for a single server.
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Create a configuration file /etc/consul.d/server.hcl.
|
||||
|
||||
```
|
||||
server = true
|
||||
encrypt = "<CONSUL_ENCRYPTION_KEY>"
|
||||
bootstrap_expect = 1
|
||||
ui = true
|
||||
client_addr = "0.0.0.0"
|
||||
bind_addr = "<The IP address can be reached by client>"
|
||||
```
|
||||
|
||||
2. Run the agent in server mode, and set encrypt to the key generated earlier with consul keygen. The bootstrap_expect variable indicates the number of servers required to form the first Consul cluster.
|
||||
|
||||
3. Set the number of servers to 1 to allow a cluster with a single server.
|
||||
|
||||
• The ui variable is used to enable the Consul Web User Interface (UI).
|
||||
|
||||
• The client_addr variable is used to connect the API and UI.
|
||||
|
||||
• The bind_addr variable is used to connect the client to the server. If you have multiple private IP addresses, use the address that can connect to a client.
|
||||
|
||||
4. Start the agent using the following instruction:
|
||||
|
||||
```
|
||||
sudo consul agent -config-dir=/etc/consul.d/
|
||||
```
|
||||
|
||||
5. Browse to http://localhost:8500/ on the management node. You will see a single instance running.
|
||||
|
||||
### Setting up the Consul Client in Compute Nodes
|
||||
|
||||
Follow these steps:
|
||||
|
||||
1. Create a configuration file /etc/consul.d/client.hcl.
|
||||
|
||||
```
|
||||
server = false
|
||||
encrypt = "<CONSUL_ENCRYPTION_KEY>"
|
||||
retry_join = ["<The consul server address>"]
|
||||
client_addr = "0.0.0.0"
|
||||
bind_addr = "<The IP address can reach server>"
|
||||
```
|
||||
|
||||
NOTE: Use the same CONSUL_ENCRYPTION_KEY as the servers. In the retry_join, use the IP address of the management nodes.
|
||||
|
||||
2. Start the Consul agent.
|
||||
|
||||
```
|
||||
sudo consul agent -config-dir=/etc/consul.d/
|
||||
```
|
||||
|
||||
The client has now joined the Consul cluster.
|
||||
|
||||
```
|
||||
$ consul members
|
||||
Node Address Status Type Build Protocol DC Segment
|
||||
management-node 10.4.22.70:8301 alive server 1.9.3 2 dc1 <all>
|
||||
compute-node 10.4.22.112:8301 alive client 1.9.3 2 dc1 <default>
|
||||
```
|
||||
|
||||
3. Set up the Consul client to monitor the health of the RDC Prometheus plugin.
|
||||
|
||||
4. Start the RDC Prometheus plugin.
|
||||
|
||||
```
|
||||
python rdc_prometheus.py --rdc_embedded
|
||||
```
|
||||
|
||||
5. Add the configuration file /etc/consul.d/rdc_prometheus.hcl.
|
||||
|
||||
```
|
||||
{
|
||||
"service": {
|
||||
"name": "rdc_prometheus",
|
||||
"tags": [
|
||||
"rdc_prometheus",
|
||||
"rdc"
|
||||
],
|
||||
"port": 5000,
|
||||
"check": {
|
||||
"id": "rdc_plugin",
|
||||
"name": "RDC Prometheus plugin on port 5000",
|
||||
"http": "http://localhost:5000",
|
||||
"method": "GET",
|
||||
"interval": "15s",
|
||||
"timeout": "1s"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
NOTE: By default, the Prometheus plugin uses port 5000. If you do not use the default setting, ensure you change the configuration file accordingly.
|
||||
|
||||
After the configuration file is changed, restart the Consul client agent.
|
||||
|
||||
```
|
||||
sudo consul agent -config-dir=/etc/consul.d/
|
||||
```
|
||||
|
||||
6. Enable the Prometheus integration in the Management node. For more information, refer to the Prometheus Integration section above.
|
||||
|
||||
7. In the Management node, inspect the service.
|
||||
|
||||
```
|
||||
$ consul catalog nodes -service=rdc_prometheus
|
||||
|
||||
Node ID Address DC
|
||||
compute-node 76694ab1 10.4.22.112 dc1
|
||||
```
|
||||
|
||||
8. Create a new Prometheus configuration rdc_prometheus_consul.yml file for the Consul integration.
|
||||
|
||||
```
|
||||
global:
|
||||
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
|
||||
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
|
||||
scrape_configs:
|
||||
- job_name: 'consul'
|
||||
consul_sd_configs:
|
||||
- server: 'localhost:8500'
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_consul_tags]
|
||||
regex: .*,rdc,.*
|
||||
action: keep
|
||||
- source_labels: [__meta_consul_service]
|
||||
target_label: job
|
||||
```
|
||||
|
||||
NOTE: If you are not running the Consul server and Prometheus on the same machine, change the server under consul_sd_configs to your Consul server address.
|
||||
|
||||
9. Start Prometheus.
|
||||
|
||||
```
|
||||
$ ./prometheus --config.file="rdc_prometheus_consul.yml"
|
||||
```
|
||||
|
||||
10. Browse the Prometheus UI at http://localhost:9090 on the Management node and query RDC Prometheus metrics. Ensure that the plugin starts before running the query.
|
||||
|
||||
## Reliability, Availability, and Serviceability Plugin
|
||||
|
||||
The RAS plugin helps to gather and count errors. The details of RAS integration with RDC are given below.
|
||||
|
||||
### RAS Plugin Installation
|
||||
|
||||
In this release, the RDC tool extends support to the Reliability, Availability, and Serviceability (RAS) integration. When the RAS feature is enabled in the graphics card, users can use RDC to monitor RAS errors.
|
||||
|
||||
#### Prerequisite
|
||||
|
||||
You must ensure the graphics card supports RAS.
|
||||
|
||||
NOTE: The RAS library is installed as part of the RDC installation, and no additional configuration is required for RDC.
|
||||
|
||||
The RDC tool installation dynamically loads the RAS library librdc_ras.so. The configuration files required by the RAS library are installed in the sp3 and config folders.
|
||||
|
||||
```
|
||||
% ls /opt/rocm-4.2.0/rdc/lib
|
||||
... librdc_ras.so ...
|
||||
... sp3 ... config ...
|
||||
```
|
||||
|
||||
### RAS Integration
|
||||
|
||||
RAS exposes a list of Error-Correcting Code (ECC) correctable and uncorrectable errors for different IP blocks and enables users to successfully troubleshoot issues.
|
||||
|
||||
For example, you can pass the field IDs of the ECC_CORRECT and ECC_UNCORRECT counters to the dmon command.
|
||||
|
||||
```
|
||||
rdci dmon -i 0 -e 600,601
|
||||
```
|
||||
|
||||
The dmon command monitors fields 600 and 601 on GPU index 0, where 600 is the ECC_CORRECT counter and 601 is the ECC_UNCORRECT counter.
|
||||
|
||||
```
|
||||
% rdci dmon -l
|
||||
... ...
|
||||
600 RDC_FI_ECC_CORRECT_TOTAL : Accumulated Single Error Correction.
|
||||
601 RDC_FI_ECC_UNCORRECT_TOTAL : Accumulated Double Error Detection.
|
||||
602 RDC_FI_ECC_SDMA_SEC : SDMA Single Error Correction.
|
||||
603 RDC_FI_ECC_SDMA_DED : SDMA Double Error Detection.
|
||||
604 RDC_FI_ECC_GFX_SEC : GFX Single Error Correction.
|
||||
605 RDC_FI_ECC_GFX_DED : GFX Double Error Detection.
|
||||
606 RDC_FI_ECC_MMHUB_SEC : MMHUB Single Error Correction.
|
||||
607 RDC_FI_ECC_MMHUB_DED : MMHUB Double Error Detection.
|
||||
608 RDC_FI_ECC_ATHUB_SEC : ATHUB Single Error Correction.
|
||||
609 RDC_FI_ECC_ATHUB_DED : ATHUB Double Error Detection.
|
||||
610 RDC_FI_ECC_BIF_SEC : BIF Single Error Correction.
|
||||
611 RDC_FI_ECC_BIF_DED : BIF Double Error Detection.
|
||||
612 RDC_FI_ECC_HDP_SEC : HDP Single Error Correction.
|
||||
613 RDC_FI_ECC_HDP_DED : HDP Double Error Detection.
|
||||
614 RDC_FI_ECC_XGMI_WAFL_SEC : XGMI WAFL Single Error Correction.
|
||||
615 RDC_FI_ECC_XGMI_WAFL_DED : XGMI WAFL Double Error Detection.
|
||||
616 RDC_FI_ECC_DF_SEC : DF Single Error Correction.
|
||||
617 RDC_FI_ECC_DF_DED : DF Double Error Detection.
|
||||
618 RDC_FI_ECC_SMN_SEC : SMN Single Error Correction.
|
||||
619 RDC_FI_ECC_SMN_DED : SMN Double Error Detection.
|
||||
620 RDC_FI_ECC_SEM_SEC : SEM Single Error Correction.
|
||||
621 RDC_FI_ECC_SEM_DED : SEM Double Error Detection.
|
||||
622 RDC_FI_ECC_MP0_SEC : MP0 Single Error Correction.
|
||||
623 RDC_FI_ECC_MP0_DED : MP0 Double Error Detection.
|
||||
624 RDC_FI_ECC_MP1_SEC : MP1 Single Error Correction.
|
||||
|
||||
625 RDC_FI_ECC_MP1_DED : MP1 Double Error Detection.
|
||||
626 RDC_FI_ECC_FUSE_SEC : FUSE Single Error Correction.
|
||||
627 RDC_FI_ECC_FUSE_DED : FUSE Double Error Detection.
|
||||
628 RDC_FI_ECC_UMC_SEC : UMC Single Error Correction.
|
||||
629 RDC_FI_ECC_UMC_DED : UMC Double Error Detection.
|
||||
... ...
|
||||
```
|
||||
|
||||
To access the ECC correctable and uncorrectable error counters, use the following command:
|
||||
|
||||
```
|
||||
% rdci dmon -i 0 -e 600,601
|
||||
GPU ECC_CORRECT ECC_UNCORRECT
|
||||
0 0 0
|
||||
0 0 0
|
||||
0 0 0
|
||||
```
|
||||
@@ -0,0 +1,53 @@
|
||||
# Introduction to ROCm Data Center Tool User Guide
|
||||
|
||||
The ROCm™ Data Center Tool™ (RDC) simplifies administration and addresses key infrastructure challenges of AMD GPUs in cluster and datacenter environments. The main features are:
|
||||
|
||||
• GPU telemetry
|
||||
|
||||
• GPU statistics for jobs
|
||||
|
||||
• Integration with third-party tools
|
||||
|
||||
• Open source
|
||||
|
||||
You can use the tool in standalone mode if all components are installed. However, the existing management tools can use the same set of features available in a library format.
|
||||
|
||||
For details on different modes of operation, refer to [Starting RDC](install).
|
||||
|
||||
## Objective
|
||||
|
||||
This user guide is intended to:
|
||||
|
||||
• Provide an overview of the RDC tool features.
|
||||
|
||||
• Describe how system administrators and Data Center (or HPC) users can administer and configure AMD GPUs.
|
||||
|
||||
• Describe the components.
|
||||
|
||||
• Provide an overview of the open source developer handbook.
|
||||
|
||||
## Terminology
|
||||
|
||||
Table 1: Terminology and Abbreviations
|
||||
|
||||
| Term | Description |
|
||||
| ------------------------ | ------------------------- |
|
||||
| RDC | ROCm Data Center tool |
|
||||
| Compute node (CN) | One of many nodes containing one or more GPUs in the Data Center on which compute jobs are run |
|
||||
| Management node (MN) or Main console | A machine running system administration applications to administer and manage the Data Center |
|
||||
| GPU Groups | Logical grouping of one or more GPUs in a compute node |
|
||||
| Fields | A metric that can be monitored by the RDC, such as GPU temperature, memory usage, and power usage |
|
||||
| Field Groups | Logical grouping of multiple fields |
|
||||
| Job | A workload that is submitted to one or more compute nodes |
|
||||
|
||||
## Target Audience
|
||||
|
||||
The audience for the AMD RDC tool consists of:
|
||||
|
||||
• Administrators: The tool provides the cluster administrator with the capability of monitoring, validating, and configuring policies.
|
||||
|
||||
• HPC Users: The tool provides GPU-centric feedback for their workload submissions.
|
||||
|
||||
• OEMs: The tool lets them add GPU information to their existing cluster management software.
|
||||
|
||||
• Open source Contributors: RDC is open source and accepts contributions from the community.
|
||||