Add rocmtools support
This commit adds integration with ROCmTools
Additional changes:
- Fix DEB and RPM installation issue when systemd is not present
- Fix typos in rdc.h
- Wrap negative values in parentheses in rdc.h
- CMAKE: Improve rocm_smi searching
- README: Improve formatting, add info about ROCmTools
Metrics added: 700-714
Metrics can be listed with `rdci dmon --list-all`
Majority of the metrics are only supported by Instinct (MI) series GPUs
700 RDC_FI_PROF_ELAPSED_CYCLES should be available on most devices
See README for more information
Change-Id: I907d3eacdc92fc5588ca6c76c2fa1ce0ad900770
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: 861a843ed7]
Этот коммит содержится в:
@@ -1,3 +1,6 @@
|
||||
# my install directory used for testing
|
||||
install/
|
||||
|
||||
# build directories generated by cmake
|
||||
build/
|
||||
cmake/build/
|
||||
|
||||
@@ -49,6 +49,10 @@ option(BUILD_RASLIB "Build targets for raslib" OFF)
|
||||
# which requires the Rocm run time.
|
||||
option(BUILD_ROCRTEST "Build targets for librdc_rocr.so" ON)
|
||||
|
||||
# When cmake -DBUILD_ROCPTEST=off, it will not build the librdc_rocp.so
|
||||
# which requires the Rocm profiler.
|
||||
option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" ON)
|
||||
|
||||
# When cmake -DBUILD_TESTS=off, it will not build RDC tests.
|
||||
option(BUILD_TESTS "Build test suite" OFF)
|
||||
|
||||
@@ -81,9 +85,6 @@ endif()
|
||||
|
||||
set(COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/common")
|
||||
|
||||
set(RSMI_INC_DIR "${ROCM_DIR}/${CMAKE_INSTALL_INCLUDEDIR}" CACHE INTERNAL "ROCm SMI include directory.")
|
||||
set(RSMI_LIB_DIR "${ROCM_DIR}/${CMAKE_INSTALL_LIBDIR}" CACHE INTERNAL "ROCm SMI library directory.")
|
||||
|
||||
set(GRPC_ROOT_DEFAULT "/usr")
|
||||
set(GRPC_ROOT ${GRPC_ROOT_DEFAULT} CACHE STRING "GRPC installation directory.")
|
||||
set(GRPC_DESIRED_VERSION 1.44.0 CACHE STRING "GRPC desired package version.")
|
||||
@@ -141,6 +142,13 @@ if(NOT EXISTS "${CMAKE_SOURCE_DIR}/raslib/.git" AND BUILD_RASLIB)
|
||||
If you do not want to build raslib, use cmake -DBUILD_RASLIB=off")
|
||||
endif()
|
||||
|
||||
find_package(RSMI
|
||||
NAMES rocm_smi
|
||||
HINTS ${ROCM_DIR}/lib/cmake
|
||||
CONFIGURE REQUIRED)
|
||||
set(RSMI_INC_DIR "${ROCM_SMI_INCLUDE_DIR}" CACHE INTERNAL "ROCm SMI include directory.")
|
||||
set(RSMI_LIB_DIR "${ROCM_SMI_LIB_DIR}" CACHE INTERNAL "ROCm SMI library directory.")
|
||||
|
||||
if(NOT EXISTS "${RSMI_INC_DIR}" OR NOT EXISTS "${RSMI_LIB_DIR}")
|
||||
message(FATAL_ERROR "rocm_smi not found in ${RSMI_INC_DIR}. Please
|
||||
make sure rocm_smi is installed and present in ${RSMI_INC_DIR}.")
|
||||
|
||||
+104
-55
@@ -23,6 +23,7 @@ RDC can run on AMD ROCm supported platforms, please refer to [List of Supported
|
||||
Latex (pdfTeX 3.14159265-2.6-1.40.16) ## required to build the latest documentation
|
||||
gRPC and protoc ## required for communication
|
||||
libcap-dev ## required to manage the privileges.
|
||||
rocmtools ## required for profiler metrics
|
||||
|
||||
AMD ROCm platform (https://github.com/RadeonOpenCompute/ROCm)
|
||||
* It is recommended to install the complete AMD ROCm platform.
|
||||
@@ -30,6 +31,8 @@ RDC can run on AMD ROCm supported platforms, please refer to [List of Supported
|
||||
* At the minimum, these two components are required
|
||||
(i) AMD ROCm SMI Library (https://github.com/RadeonOpenCompute/rocm_smi_lib)
|
||||
(ii) AMD ROCk Kernel driver (https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver)
|
||||
* For profiler metrics, this component is required:
|
||||
(i) AMD ROCm Tools (https://github.com/ROCm-Developer-Tools/rocmtools)
|
||||
|
||||
## Building gRPC and protoc
|
||||
**NOTE:** gRPC and protoc compiler must be built when building RDC from source as pre-built packages are not available. When installing RDC from a package, gRPC and protoc will be installed from the package.
|
||||
@@ -40,55 +43,65 @@ The following tools are required for gRPC build & installation
|
||||
|
||||
automake make g++ unzip build-essential autoconf libtool pkg-config libgflags-dev libgtest-dev clang-5.0 libc++-dev curl
|
||||
|
||||
Download and build gRPC
|
||||
Download and build gRPC
|
||||
```bash
|
||||
git clone -b v1.44.0 https://github.com/grpc/grpc
|
||||
cd grpc
|
||||
git submodule update --init
|
||||
mkdir -p cmake/build
|
||||
cd cmake/build
|
||||
```
|
||||
|
||||
$ git clone -b v1.44.0 https://github.com/grpc/grpc
|
||||
$ cd grpc
|
||||
$ git submodule update --init
|
||||
$ mkdir -p cmake/build
|
||||
$ cd cmake/build
|
||||
|
||||
## By default (without using CMAKE_INSTALL_PREFIX option), the following will install to /usr/local lib, include and bin directories
|
||||
|
||||
$ cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON <-DCMAKE_INSTALL_PREFIX=<install dir>> ../..
|
||||
$ make
|
||||
$ sudo make install
|
||||
$ echo "<install dir>/lib" | sudo tee /etc/ld.so.conf.d/grpc.conf
|
||||
By default (without using CMAKE_INSTALL_PREFIX option), the following will install to /usr/local lib, include and bin directories
|
||||
```bash
|
||||
cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON <-DCMAKE_INSTALL_PREFIX=<install dir>> ../..
|
||||
make
|
||||
sudo make install
|
||||
echo "<install dir>/lib" | sudo tee /etc/ld.so.conf.d/grpc.conf
|
||||
```
|
||||
|
||||
## Building RDC
|
||||
|
||||
Clone the RDC source code from GitHub and use CMake to build and install
|
||||
|
||||
$ git clone https://github.com/RadeonOpenCompute/rdc
|
||||
$ cd rdc
|
||||
$ mkdir -p build; cd build
|
||||
$ cmake -DROCM_DIR=/opt/rocm -DGRPC_ROOT="$GRPC_PROTOC_ROOT" <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
|
||||
$ make
|
||||
$ make install ## default installation location is /opt/rocm
|
||||
```bash
|
||||
git clone https://github.com/RadeonOpenCompute/rdc
|
||||
cd rdc
|
||||
mkdir -p build; cd build
|
||||
cmake -DGRPC_ROOT="$GRPC_PROTOC_ROOT" <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
|
||||
make
|
||||
make install ## default installation location is /opt/rocm, specify with -DROCM_DIR
|
||||
```
|
||||
|
||||
## Building RDC library only without gRPC (optional)
|
||||
|
||||
If only the RDC libraries are needed (i.e. only "embedded mode" is required), the user can choose to not build rdci and rdcd. This will eliminate the need for gRPC and protoc. To build in this way, -DBUILD_STANDALONE=off should be passed on the cmake command line:
|
||||
|
||||
$ cmake -DROCM_DIR=/opt/rocm -DBUILD_STANDALONE=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
|
||||
```bash
|
||||
cmake -DBUILD_STANDALONE=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
|
||||
```
|
||||
|
||||
## Building RDC library without ROCM Run time (optional)
|
||||
|
||||
The user can choose to not build RDC diagnostic ROCM Run time. This will eliminate the need for ROCM Run time. To build in this way, -DBUILD_ROCRTEST=off should be passed on the cmake command line:
|
||||
|
||||
$ cmake -DROCM_DIR=/opt/rocm -DBUILD_ROCRTEST=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
|
||||
```bash
|
||||
cmake -DBUILD_ROCRTEST=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
|
||||
```
|
||||
|
||||
## Update System Library Path
|
||||
|
||||
The following commands need to be executed as root (sudo). It may be easiest to put them into a script and then run that script as root:
|
||||
|
||||
$ RDC_LIB_DIR=<RDC install dir>/lib
|
||||
$ GRPC_LIB_DIR=<gRPC install dir>/lib
|
||||
$ echo "$GRPC_LIB_DIR" > /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
$ echo "$GRPC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
$ echo "$RDC_LIB_DIR" >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
$ echo "$RDC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
$ ldconfig
|
||||
```bash
|
||||
RDC_LIB_DIR=<RDC install dir>/lib
|
||||
GRPC_LIB_DIR=<gRPC install dir>/lib
|
||||
echo "$GRPC_LIB_DIR" > /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
echo "$GRPC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
echo "$RDC_LIB_DIR" >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
echo "$RDC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
|
||||
ldconfig
|
||||
```
|
||||
|
||||
# Running RDC
|
||||
RDC supports encrypted communications between clients and servers. The
|
||||
@@ -100,48 +113,84 @@ For an RDC client application to monitor and/or control a remote system, the RDC
|
||||
### Start RDCD from command-line
|
||||
When *rdcd* is started from a command-line the *capabilities* are determined by privilege of the *user* starting *rdcd*
|
||||
|
||||
$ cd rdc_install_prefix ## If specified in Building RDC section
|
||||
```bash
|
||||
## If RDC_FI_PROF_* metrics are required - you MUST export ROCMTOOLS_METRICS_PATH before starting rdcd
|
||||
export ROCMTOOLS_METRICS_PATH=/opt/rocm-<version>/libexec/rocmtools/counters/derived_counters.xml
|
||||
|
||||
## To run with authentication. Ensure SSL keys are setup properly
|
||||
## version will be the version number(ex:3.10.0) of ROCm where RDC was pacakged with
|
||||
$ /opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started with monitor-only capabilities
|
||||
$ sudo /opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started will full-capabilities
|
||||
cd rdc_install_prefix ## If specified in Building RDC section
|
||||
|
||||
## To run without authentication. SSL key & certificates are not required.
|
||||
## version will be the version number(ex:3.10.0) of ROCm where RDC was pacakged with
|
||||
$ /opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started with monitor-only capabilities
|
||||
$ sudo /opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started will full-capabilities
|
||||
## To run with authentication. Ensure SSL keys are setup properly
|
||||
## version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with
|
||||
/opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started with monitor-only capabilities
|
||||
sudo /opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started with full-capabilities
|
||||
|
||||
## To run without authentication. SSL key & certificates are not required.
|
||||
## version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with
|
||||
/opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started with monitor-only capabilities
|
||||
sudo /opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started with full-capabilities
|
||||
```
|
||||
|
||||
### Start RDCD using systemd
|
||||
*rdcd* can be started by using the systemctl command. You can copy /opt/rocm-\<version\>/rdc/lib/rdc.service, which is installed with RDC, to the systemd folder. This file has 2 lines that control what *capabilities* with which *rdcd* will run. If left uncommented, rdcd will run with full-capabilities.
|
||||
|
||||
```bash
|
||||
## file: /opt/rocm-<version>/rdc/lib/rdc.service
|
||||
## Comment the following two lines to run with monitor-only capabilities
|
||||
CapabilityBoundingSet=CAP_DAC_OVERRIDE
|
||||
AmbientCapabilities=CAP_DAC_OVERRIDE
|
||||
```
|
||||
|
||||
## file: /opt/rocm-<version>/rdc/lib/rdc.service
|
||||
## Comment the following two lines to run with monitor-only capabilities
|
||||
CapabilityBoundingSet=CAP_DAC_OVERRIDE
|
||||
AmbientCapabilities=CAP_DAC_OVERRIDE
|
||||
|
||||
systemctl start rdc ## start rdc as systemd service
|
||||
```bash
|
||||
systemctl start rdc ## start rdc as systemd service
|
||||
```
|
||||
|
||||
## Invoke RDC using ROCm™ Data Center Interface (RDCI)
|
||||
RDCI provides command-line interface to all RDC features. This CLI can be run locally or remotely. Refer to [**user guide**](https://docs.amd.com/bundle/ROCm-Data-Center-Tool-User-Guide-v5.1/page/Feature_Overview.html) for the current list of features.
|
||||
|
||||
## sample rdci commands to test RDC functionality
|
||||
## discover devices in a local or remote compute node
|
||||
## NOTE: option -u (for unauthenticated) is required is rdcd was started in this mode
|
||||
```bash
|
||||
## sample rdci commands to test RDC functionality
|
||||
## discover devices in a local or remote compute node
|
||||
## NOTE: option -u (for unauthenticated) is required if rdcd was started in this mode
|
||||
|
||||
$ cd rdc_install_prefix ## If specified in Building RDC section
|
||||
./opt/rocm-<version>/rdc/bin/rdci discovery -l <-u> ## list available GPUs in localhost
|
||||
./opt/rocm-<version>/rdc/bin/rdci discovery <host> -l <-u> ## list available GPUs in host machine
|
||||
cd rdc_install_prefix ## If specified in Building RDC section
|
||||
cd ./opt/rocm-<version>/rdc/bin
|
||||
./rdci discovery -l <-u> ## list available GPUs in localhost
|
||||
./rdci discovery <host> -l <-u> ## list available GPUs in host machine
|
||||
./rdci dmon <host> <-u> -l ## list most GPU counters
|
||||
# assuming rdcd is running locally, using -u instead of <host>
|
||||
./rdci dmon -u --list-all ## list all GPU counters
|
||||
./rdci dmon -u -i 0 -c 1 -e 100 ## monitor field 100 on gpu 0 for count of 1
|
||||
./rdci dmon -u -i 0 -c 1 -e 1,2 ## monitor fields 1,2 on gpu 0 for count of 1
|
||||
# below requires rocmtools to be installed
|
||||
./rdci dmon -u -i 0 -c 5 -e 700 ## monitor field 700 on gpu 0 for count of 5
|
||||
# below is only likely to work on MI series GPUs
|
||||
./rdci dmon -u -i 0 -c 5 -e 700,701,702,706 ## monitor fields 700,701,702,706
|
||||
```
|
||||
|
||||
## Troubleshooting rdcd
|
||||
|
||||
Log messages that can provide useful debug information.
|
||||
- Log messages that can provide useful debug information.
|
||||
|
||||
## If rdcd was started as a systemd service, then use journalctl to view rdcd logs
|
||||
journalctl -u rdc
|
||||
If rdcd was started as a systemd service, then use journalctl to view rdcd logs
|
||||
```bash
|
||||
journalctl -u rdc
|
||||
```
|
||||
|
||||
## To run rdcd with debug log from command-line use
|
||||
## version will be the version number(ex:3.10.0) of ROCm where RDC was pacakged with
|
||||
RDC_LOG=DEBUG /opt/rocm-<version>/rdc/bin/rdcd
|
||||
To run rdcd with debug log from command-line use
|
||||
version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with
|
||||
```bash
|
||||
RDC_LOG=DEBUG /opt/rocm-<version>/rdc/bin/rdcd
|
||||
```
|
||||
|
||||
RDC_LOG=DEBUG also works on rdci
|
||||
|
||||
ERROR, INFO, DEBUG logging levels are supported
|
||||
|
||||
- All `RDC_FI_PROF_*` metrics return N/A
|
||||
|
||||
1. Is `ROCMTOOLS_METRICS_PATH` set?
|
||||
2. Does your GPU support selected fields?
|
||||
Field 700 (`RDC_FI_PROF_ELAPSED_CYCLES`) is supposed to be accessible on most GPUs.
|
||||
Others are mostly intended for MI series.
|
||||
3. Set `RDC_LOG=DEBUG` as stated above
|
||||
4. Is rocmtools installed? Can you find `librocmtools.so`?
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
# Find module for ROCmTools.
#
# This module provides a rocmtools::rocmtools package (imported target).
# You can specify the ROCM directory by setting ROCM_DIR; when ROCM_DIR is
# not defined it defaults to /opt/rocm.
#
# Variables set by this module:
#   rocmtools_LIBRARY      - location of the rocmtools library
#   rocmtools_INCLUDE_DIR  - directory that contains rocmtools.h
#   rocmtools_FOUND        - set by find_package_handle_standard_args when
#                            both of the above were located
#
# A caller may pre-set rocmtools_INCLUDE_DIR to skip the header search.

set(NAME rocmtools)

if(NOT DEFINED ROCM_DIR)
  set(ROCM_DIR "/opt/rocm")
endif()

# Look for librocmtools / librocmtools64 under <ROCM_DIR>/lib.
find_library(
  ${NAME}_LIBRARY
  NAMES ${NAME} ${NAME}64
  HINTS "${ROCM_DIR}"
  REGISTRY_VIEW BOTH
  PATH_SUFFIXES lib)

# Only search for the header when the caller has not supplied a location.
# NOTE: the variable name must follow DEFINED directly. Wrapping it in
# parentheses makes if() evaluate the group first and then test DEFINED on
# the resulting constant, so the guard would never skip the search.
if(NOT DEFINED ${NAME}_INCLUDE_DIR)
  find_path(
    ${NAME}_INCLUDE_DIR
    NAMES ${NAME}.h
    HINTS "${ROCM_DIR}/include"
    PATH_SUFFIXES ${NAME} ${NAME}/inc)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(${NAME}
  FOUND_VAR ${NAME}_FOUND
  REQUIRED_VARS
    ${NAME}_LIBRARY
    ${NAME}_INCLUDE_DIR)

# Keep the low-level result variables out of the default cache view.
mark_as_advanced(${NAME}_LIBRARY ${NAME}_INCLUDE_DIR)

# Expose the result as an imported target; the TARGET check keeps repeated
# find_package() calls from redefining it.
if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME})
  add_library(${NAME}::${NAME} UNKNOWN IMPORTED)
  # PC_rocmtools_CFLAGS_OTHER is not set anywhere in this module; it expands
  # to an empty string unless the including project defines it beforehand.
  set_target_properties(${NAME}::${NAME} PROPERTIES
    IMPORTED_LOCATION "${${NAME}_LIBRARY}"
    INTERFACE_COMPILE_OPTIONS "${PC_${NAME}_CFLAGS_OTHER}"
    INTERFACE_INCLUDE_DIRECTORIES "${${NAME}_INCLUDE_DIR}")
endif()
|
||||
@@ -58,9 +58,11 @@ function(create_library_symlink)
|
||||
endforeach()
|
||||
# Symlink for private libraries
|
||||
set(LIB_RDC_ROCR "librdc_rocr.so")
|
||||
set(LIB_RDC_ROCP "librdc_rocp.so")
|
||||
set(LIB_RDC_RAS "librdc_ras.so")
|
||||
set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so")
|
||||
set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" )
|
||||
# Append the rocp library names; re-passing "${library_files}" keeps the rocr entries already in the list.
set(library_files "${library_files}" "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" )
|
||||
set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" )
|
||||
set(library_files "${library_files}" "${LIB_RDC_RAS}")
|
||||
|
||||
|
||||
@@ -29,6 +29,10 @@ THE SOFTWARE.
|
||||
// 4 bool do or do not display in rdci
|
||||
// rdc_field_t Description rdci label To Display
|
||||
// =========== =========== ========= ==========
|
||||
#ifndef FLD_DESC_ENT
|
||||
#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY)
|
||||
#endif
|
||||
|
||||
FLD_DESC_ENT(RDC_FI_INVALID, "Unknown/Invalid field", "INVALID", false)
|
||||
FLD_DESC_ENT(RDC_FI_GPU_COUNT, "GPU count in the system", "GPU_COUNT", true)
|
||||
FLD_DESC_ENT(RDC_FI_DEV_NAME, "Name of the device", "DEV_NAME", true)
|
||||
@@ -73,6 +77,28 @@ FLD_DESC_ENT(RDC_FI_ECC_FUSE_DED, "FUSE Double Error Detection",
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UMC_SEC, "UMC Single Error Correction", "ECC_UMC_SEC", true)
|
||||
FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection", "ECC_UMC_DED", true)
|
||||
|
||||
// ROCProfiler fields
|
||||
// This doesn't map to rocprofiler counters directly
|
||||
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
|
||||
// See metrics.xml in rocmtools
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "PROF_ELAPSED_COUNT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "PROF_ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "PROF_ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "PROF_CU_OCCUPANCY", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "Active Cycles / total Elapsed Cycles", "PROF_CU_UTILIZATION", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FETCH_SIZE, "kb fetched from video memory", "PROF_FETCH_SIZE", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_WRITE_SIZE, "kb written to video memory", "PROF_WRITE_SIZE", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_16, "Number of fp16 OPS / second", "PROF_FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_32, "Number of fp32 OPS / second", "PROF_FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_64, "Number of fp64 OPS / second", "PROF_FLOPS_64", false)
|
||||
// TODO: uncomment when below are implemented
|
||||
FLD_DESC_ENT(RDC_FI_PROF_GFLOPS_16, "Number of fp16 GOPS / second", "PROF_GFLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_GFLOPS_32, "Number of fp32 GOPS / second", "PROF_GFLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_GFLOPS_64, "Number of fp64 GOPS / second", "PROF_GFLOPS_64", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEMR_BW_KBPNS, "HBM Read Bandwidth in kb/ns", "PROF_MEMR_BW_KBPNS", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEMW_BW_KBPNS, "HBM Write Bandwidth in kb/ns", "PROF_MEMW_BW_KBPNS", false)
|
||||
|
||||
// Events
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_REQ_TX, "Outgoing requests to neighbor 0", "XGMI_REQ_0", false)
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_RESP_TX, "Outgoing responses to neighbor 0", "XGMI_RES_0", false)
|
||||
|
||||
@@ -104,11 +104,11 @@ typedef enum {
|
||||
|
||||
|
||||
//! ID used to represent an invalid GPU
|
||||
#define GPU_ID_INVALID -1
|
||||
#define GPU_ID_INVALID (-1)
|
||||
//! Used to specify all GPUs
|
||||
#define RDC_GROUP_ALL_GPUS -1000
|
||||
#define RDC_GROUP_ALL_GPUS (-1000)
|
||||
//! Used to specify all stats fields
|
||||
#define RDC_JOB_STATS_FIELDS -1000
|
||||
#define RDC_JOB_STATS_FIELDS (-1000)
|
||||
|
||||
/**
|
||||
* @brief The max rdc field string length
|
||||
@@ -223,6 +223,25 @@ typedef enum {
|
||||
RDC_FI_ECC_UMC_SEC, //!< UMC Single Error Correction
|
||||
RDC_FI_ECC_UMC_DED, //!< UMC Double Error Detection
|
||||
|
||||
/**
|
||||
* @brief ROC-profiler related fields
|
||||
*/
|
||||
RDC_FI_PROF_ELAPSED_CYCLES = 700, //!< Number of elapsed cycles over all SMs
|
||||
RDC_FI_PROF_ACTIVE_WAVES, //!< Number of Active Waves
|
||||
RDC_FI_PROF_ACTIVE_CYCLES, //!< Number of Active Cycles
|
||||
RDC_FI_PROF_CU_OCCUPANCY, //!< Active Waves / maximum active Waves supported
|
||||
RDC_FI_PROF_CU_UTILIZATION, //!< Total active cycles / Total elapsed cycles
|
||||
RDC_FI_PROF_FETCH_SIZE, //!< Number of kilobytes fetched from video memory
|
||||
RDC_FI_PROF_WRITE_SIZE, //!< Number of kilobytes written to video memory
|
||||
RDC_FI_PROF_FLOPS_16, //!< Number of fp16 OPS / second
|
||||
RDC_FI_PROF_FLOPS_32, //!< Number of fp32 OPS / second
|
||||
RDC_FI_PROF_FLOPS_64, //!< Number of fp64 OPS / second
|
||||
RDC_FI_PROF_GFLOPS_16, //!< Number of fp16 GOPS / second
|
||||
RDC_FI_PROF_GFLOPS_32, //!< Number of fp32 GOPS / second
|
||||
RDC_FI_PROF_GFLOPS_64, //!< Number of fp64 GOPS / second
|
||||
RDC_FI_PROF_MEMR_BW_KBPNS, //!< HBM Read Bandwidth in kilobytes / nanosecond
|
||||
RDC_FI_PROF_MEMW_BW_KBPNS, //!< HBM Write Bandwidth in kilobytes / nanosecond
|
||||
|
||||
/*
|
||||
* @brief Raw XGMI counter events
|
||||
*/
|
||||
@@ -253,7 +272,7 @@ typedef enum {
|
||||
//!< neighbor 1
|
||||
RDC_EVNT_XGMI_1_BEATS_TX, //!< Data beats sent to
|
||||
//!< neighbor 1; Each beat
|
||||
//!< represnts 32 bytes
|
||||
//!< represents 32 bytes
|
||||
|
||||
// "Composite" events. These events have additional processing beyond
|
||||
// the value provided by the rocm_smi library.
|
||||
@@ -328,7 +347,7 @@ typedef struct {
|
||||
|
||||
uint64_t energy_consumed; //!< GPU Energy consumed
|
||||
uint64_t ecc_correct; //!< Correctable errors
|
||||
uint64_t ecc_uncorrect; //!< Uncorrtable errors
|
||||
uint64_t ecc_uncorrect; //!< Uncorrectable errors
|
||||
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
|
||||
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
|
||||
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
|
||||
@@ -348,7 +367,7 @@ typedef struct {
|
||||
uint32_t num_gpus; //!< Number of GPUs used by job
|
||||
rdc_gpu_usage_info_t summary; //!< Job usage summary statistics
|
||||
//!< (overall)
|
||||
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary staticstics by GPU
|
||||
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU
|
||||
} rdc_job_info_t;
|
||||
|
||||
/**
|
||||
@@ -958,18 +977,18 @@ rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle,
|
||||
/**
|
||||
* @brief Run the diagnostic test cases
|
||||
*
|
||||
* @details Run the diagnostic test cases at differenet levles.
|
||||
* @details Run the diagnostic test cases at different levels.
|
||||
*
|
||||
* @param[in] p_rdc_handle The RDC handler.
|
||||
*
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[in] level The level decides how long the test will run.
|
||||
* The RDC_DIAG_LVL_SHORT only take a few seconds, and the
|
||||
* The RDC_DIAG_LVL_SHORT only takes a few seconds, and
|
||||
* the RDC_DIAG_LVL_LONG may take up to 15 minutes.
|
||||
*
|
||||
*
|
||||
* @param[inout] response The detail results of the tests run.
|
||||
*
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_diagnostic_run(
|
||||
@@ -988,9 +1007,9 @@ rdc_status_t rdc_diagnostic_run(
|
||||
* @param[in] group_id The GPU group id.
|
||||
*
|
||||
* @param[in] test_case The test case to run.
|
||||
*
|
||||
*
|
||||
* @param[inout] result The results of the test.
|
||||
*
|
||||
*
|
||||
* @retval ::RDC_ST_OK is returned upon successful call.
|
||||
*/
|
||||
rdc_status_t rdc_test_case_run(
|
||||
|
||||
@@ -23,21 +23,24 @@ THE SOFTWARE.
|
||||
#define INCLUDE_RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_
|
||||
|
||||
#include <memory>
|
||||
#include "rdc_lib/RdcModuleMgr.h"
|
||||
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
#include "rdc_lib/RdcModuleMgr.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
#include "rdc_lib/impl/RdcRocpLib.h"
|
||||
#include "rdc_lib/impl/RdcRocrLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcModuleMgrImpl: public RdcModuleMgr {
|
||||
class RdcModuleMgrImpl : public RdcModuleMgr {
|
||||
public:
|
||||
RdcTelemetryPtr get_telemetry_module() override;
|
||||
RdcDiagnosticPtr get_diagnostic_module() override;
|
||||
explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher);
|
||||
|
||||
private:
|
||||
// Function module
|
||||
RdcTelemetryPtr rdc_telemetry_module_;
|
||||
@@ -48,10 +51,10 @@ class RdcModuleMgrImpl: public RdcModuleMgr {
|
||||
RdcSmiLibPtr smi_lib_;
|
||||
RdcMetricFetcherPtr fetcher_;
|
||||
RdcRocrLibPtr rocr_lib_;
|
||||
RdcRocpLibPtr rocp_lib_;
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef INCLUDE_RDC_LIB_IMPL_RDCROCPLIB_H_
|
||||
#define INCLUDE_RDC_LIB_IMPL_RDCROCPLIB_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc_lib/RdcLibraryLoader.h"
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
class RdcRocpLib : public RdcTelemetry {
|
||||
public:
|
||||
/* Telemetry */
|
||||
|
||||
// get support field ids
|
||||
rdc_status_t rdc_telemetry_fields_query(
|
||||
uint32_t field_ids[MAX_NUM_FIELDS],
|
||||
uint32_t* field_count) override;
|
||||
|
||||
// Fetch
|
||||
rdc_status_t rdc_telemetry_fields_value_get(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
rdc_field_value_f callback,
|
||||
void* user_data) override;
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_watch(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) override;
|
||||
|
||||
rdc_status_t rdc_telemetry_fields_unwatch(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count) override;
|
||||
|
||||
uint64_t get_profiler_version();
|
||||
|
||||
explicit RdcRocpLib(const char* lib_name);
|
||||
|
||||
~RdcRocpLib();
|
||||
|
||||
private:
|
||||
RdcLibraryLoader lib_loader_;
|
||||
|
||||
rdc_status_t (*telemetry_fields_query_)(
|
||||
uint32_t field_ids[MAX_NUM_FIELDS],
|
||||
uint32_t* field_count);
|
||||
|
||||
rdc_status_t (*telemetry_fields_value_get_)(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
rdc_field_value_f callback,
|
||||
void* user_data);
|
||||
|
||||
rdc_status_t (*telemetry_fields_watch_)(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count);
|
||||
|
||||
rdc_status_t (*telemetry_fields_unwatch_)(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count);
|
||||
};
|
||||
|
||||
using RdcRocpLibPtr = std::shared_ptr<RdcRocpLib>;
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_RDC_LIB_IMPL_RDCROCPLIB_H_
|
||||
@@ -29,6 +29,7 @@ THE SOFTWARE.
|
||||
#include "rdc_lib/RdcTelemetry.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcSmiLib.h"
|
||||
#include "rdc_lib/impl/RdcRocpLib.h"
|
||||
#include "rdc_lib/RdcMetricFetcher.h"
|
||||
|
||||
namespace amd {
|
||||
@@ -50,7 +51,8 @@ class RdcTelemetryModule : public RdcTelemetry {
|
||||
uint32_t fields_count);
|
||||
|
||||
RdcTelemetryModule(const RdcSmiLibPtr& smi_lib,
|
||||
const RdcRasLibPtr& ras_module);
|
||||
const RdcRasLibPtr& ras_module,
|
||||
const RdcRocpLibPtr& rocp_module);
|
||||
|
||||
private:
|
||||
//< Helper function to dispatch fields to module
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
|
||||
#define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
|
||||
#include <rocmtools.h>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <unordered_map>
|
||||
#include "rdc/rdc.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
/**
|
||||
* @brief Map of RDC fields to rocmtools counters
|
||||
*
|
||||
* See metrics.xml in rocmtools for more info.
|
||||
* RDC_CALC fields are calculated over time by RDC.
|
||||
*/
|
||||
static const std::unordered_map<rdc_field_t, const char*> counter_map_k = {
|
||||
{RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"},
|
||||
{RDC_FI_PROF_ACTIVE_WAVES, "SQ_WAVES"},
|
||||
{RDC_FI_PROF_ACTIVE_CYCLES, "SQ_BUSY_CU_CYCLES"},
|
||||
{RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"},
|
||||
{RDC_FI_PROF_CU_UTILIZATION, "CU_UTILIZATION"},
|
||||
{RDC_FI_PROF_FETCH_SIZE, "FETCH_SIZE"},
|
||||
{RDC_FI_PROF_WRITE_SIZE, "WRITE_SIZE"},
|
||||
{RDC_FI_PROF_FLOPS_16, "TOTAL_16_OPS"},
|
||||
{RDC_FI_PROF_FLOPS_32, "TOTAL_32_OPS"},
|
||||
{RDC_FI_PROF_FLOPS_64, "TOTAL_64_OPS"},
|
||||
// fields below require special handling
|
||||
{RDC_FI_PROF_GFLOPS_16, "TOTAL_16_OPS"},
|
||||
{RDC_FI_PROF_GFLOPS_32, "TOTAL_32_OPS"},
|
||||
{RDC_FI_PROF_GFLOPS_64, "TOTAL_64_OPS"},
|
||||
{RDC_FI_PROF_MEMR_BW_KBPNS, "FETCH_SIZE"},
|
||||
{RDC_FI_PROF_MEMW_BW_KBPNS, "WRITE_SIZE"},
|
||||
};
|
||||
|
||||
/// Common interface for RocP tests and samples
|
||||
class RdcRocpBase {
|
||||
typedef std::pair<uint32_t, rdc_field_t> pair_gpu_field_t;
|
||||
typedef struct session_info_t {
|
||||
rocmtools_session_id_t id{};
|
||||
std::chrono::
|
||||
time_point<std::chrono::system_clock, std::chrono::nanoseconds>
|
||||
start_time;
|
||||
std::chrono::
|
||||
time_point<std::chrono::system_clock, std::chrono::nanoseconds>
|
||||
stop_time;
|
||||
} session_info_t;
|
||||
|
||||
public:
|
||||
RdcRocpBase();
|
||||
RdcRocpBase(const RdcRocpBase&) = default;
|
||||
RdcRocpBase(RdcRocpBase&&) = delete;
|
||||
RdcRocpBase& operator=(const RdcRocpBase&) = delete;
|
||||
RdcRocpBase& operator=(RdcRocpBase&&) = delete;
|
||||
~RdcRocpBase();
|
||||
|
||||
/**
|
||||
* @brief Lookup ROCProfiler counter
|
||||
*
|
||||
* @param[in] field An existing field already added to sessions dictionary
|
||||
* @param[out] value A pointer that will be populated with returned value
|
||||
*
|
||||
* @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed
|
||||
* successfully.
|
||||
*/
|
||||
rdc_status_t rocp_lookup(pair_gpu_field_t gpu_field, double* value);
|
||||
|
||||
/**
|
||||
* @brief Destroy ROCmTools session responsible for monitoring a given
|
||||
* field
|
||||
*
|
||||
* @details While rocmtools supports multiple fields per ID - it has a
|
||||
* limit to how many counters it can query internally.
|
||||
* To avoid concerning ourselves with said limit, we limit each session to
|
||||
* 1 field.
|
||||
* In the future this can be optimized to allow for multiple fields per
|
||||
* session.
|
||||
*
|
||||
* @param[in] field A field to start monitoring
|
||||
*
|
||||
* @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed
|
||||
* successfully.
|
||||
*/
|
||||
rdc_status_t create_session(pair_gpu_field_t gpu_field);
|
||||
|
||||
/**
|
||||
* @brief Destroy ROCmTools session responsible for monitoring a given
|
||||
* field
|
||||
*
|
||||
* @param[in] field A field to stop monitoring
|
||||
*
|
||||
* @retval ::ROCMTOOLS_STATUS_SUCCESS The function has been executed
|
||||
* successfully.
|
||||
*/
|
||||
rdc_status_t destroy_session(pair_gpu_field_t gpu_field);
|
||||
|
||||
protected:
|
||||
private:
|
||||
std::map<pair_gpu_field_t, session_info_t> sessions;
|
||||
|
||||
/**
|
||||
* @brief Convert from rocmtools status into RDC status
|
||||
*/
|
||||
rdc_status_t Rocp2RdcError(rocmtools_status_t rocm_status);
|
||||
};
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
#endif // RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
|
||||
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_
|
||||
#define RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcTelemetryLibInterface.h"
|
||||
|
||||
#endif // RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_
|
||||
@@ -59,6 +59,7 @@ set(RDC_LIB_INC_DIR "${INC_DIR}")
|
||||
set(BOOTSTRAP_LIB "rdc_bootstrap")
|
||||
set(RDC_LIB "rdc")
|
||||
set(RDC_ROCR_LIB "rdc_rocr")
|
||||
set(RDC_ROCP_LIB "rdc_rocp")
|
||||
set(RDCCLIENT_LIB "rdc_client")
|
||||
|
||||
## Set RUNPATH if ROCM_RPATH is defined and passed by the environment
|
||||
@@ -90,7 +91,8 @@ set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core")
|
||||
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core")
|
||||
|
||||
# link grpc and ROCm to RSMI
|
||||
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64" "${ROCM_DIR}/lib")
|
||||
#link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64" "${ROCM_DIR}/lib")
|
||||
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64")
|
||||
|
||||
# add librdc_bootstrap.so
|
||||
add_subdirectory(bootstrap)
|
||||
@@ -98,9 +100,12 @@ add_subdirectory(bootstrap)
|
||||
# add librdc.so
|
||||
add_subdirectory(rdc)
|
||||
|
||||
# add librdc_rocr.so to RDC_LIBS_MODULES
|
||||
# add librdc_rocr.so to RDC_LIB_MODULES
|
||||
add_subdirectory(rdc_modules/rdc_rocr)
|
||||
|
||||
# add librdc_rocp.so to RDC_LIB_MODULES
|
||||
add_subdirectory(rdc_modules/rdc_rocp)
|
||||
|
||||
if(BUILD_STANDALONE)
|
||||
# add librdc_client.so
|
||||
add_subdirectory(rdc_client)
|
||||
@@ -125,7 +130,7 @@ else()
|
||||
endif()
|
||||
|
||||
# Add module directives if those exist
|
||||
if(RDC_LIBS_MODULES)
|
||||
if(RDC_LIB_MODULES)
|
||||
install(TARGETS ${RDC_LIB_MODULES}
|
||||
EXPORT rdcTargets
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${RDC}
|
||||
|
||||
@@ -25,11 +25,11 @@ message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}")
|
||||
add_library(${BOOTSTRAP_LIB} SHARED ${BOOTSTRAP_LIB_SRC_LIST} ${BOOTSTRAP_LIB_INC_LIST})
|
||||
target_link_libraries(${BOOTSTRAP_LIB} pthread dl)
|
||||
target_include_directories(${BOOTSTRAP_LIB} PRIVATE
|
||||
"${RSMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include"
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${COMMON_DIR}")
|
||||
"${COMMON_DIR}"
|
||||
"${RSMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include")
|
||||
|
||||
target_include_directories(${BOOTSTRAP_LIB}
|
||||
PUBLIC
|
||||
|
||||
@@ -19,6 +19,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
|
||||
"${SRC_DIR}/RdcNotificationImpl.cc"
|
||||
"${SRC_DIR}/RdcPerfTimer.cc"
|
||||
"${SRC_DIR}/RdcRasLib.cc"
|
||||
"${SRC_DIR}/RdcRocpLib.cc"
|
||||
"${SRC_DIR}/RdcRocrLib.cc"
|
||||
"${SRC_DIR}/RdcSmiDiagnosticImpl.cc"
|
||||
"${SRC_DIR}/RdcSmiLib.cc"
|
||||
@@ -50,6 +51,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
|
||||
"${INC_DIR}/impl/RdcModuleMgrImpl.h"
|
||||
"${INC_DIR}/impl/RdcNotificationImpl.h"
|
||||
"${INC_DIR}/impl/RdcRasLib.h"
|
||||
"${INC_DIR}/impl/RdcRocpLib.h"
|
||||
"${INC_DIR}/impl/RdcRocrLib.h"
|
||||
"${INC_DIR}/impl/RdcSmiDiagnosticImpl.h"
|
||||
"${INC_DIR}/impl/RdcSmiLib.h"
|
||||
|
||||
@@ -20,20 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
|
||||
#include "rdc_lib/impl/RdcTelemetryModule.h"
|
||||
|
||||
#include "rdc_lib/impl/RdcDiagnosticModule.h"
|
||||
#include "rdc_lib/impl/RdcRasLib.h"
|
||||
#include "rdc_lib/impl/RdcRocpLib.h"
|
||||
#include "rdc_lib/impl/RdcRocrLib.h"
|
||||
#include "rdc_lib/impl/RdcTelemetryModule.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher):
|
||||
smi_lib_(std::make_shared<RdcSmiLib>(fetcher)) {
|
||||
RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher)
|
||||
: smi_lib_(std::make_shared<RdcSmiLib>(fetcher)) {
|
||||
// The smi_lib_ always need to be loaded.
|
||||
}
|
||||
|
||||
|
||||
RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
|
||||
if (rdc_telemetry_module_) {
|
||||
return rdc_telemetry_module_;
|
||||
@@ -44,8 +45,13 @@ RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
|
||||
ras_lib_.reset(new RdcRasLib("librdc_ras.so"));
|
||||
}
|
||||
|
||||
if (!rocp_lib_) {
|
||||
rocp_lib_.reset(new RdcRocpLib("librdc_rocp.so"));
|
||||
}
|
||||
|
||||
if (!rdc_telemetry_module_) {
|
||||
rdc_telemetry_module_.reset(new RdcTelemetryModule(smi_lib_, ras_lib_));
|
||||
rdc_telemetry_module_.reset(
|
||||
new RdcTelemetryModule(smi_lib_, ras_lib_, rocp_lib_));
|
||||
}
|
||||
|
||||
return rdc_telemetry_module_;
|
||||
@@ -75,4 +81,3 @@ RdcDiagnosticPtr RdcModuleMgrImpl::get_diagnostic_module() {
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "rdc_lib/impl/RdcRocpLib.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// TODO: Add init and destroy calls support
|
||||
RdcRocpLib::RdcRocpLib(const char* lib_name)
|
||||
: telemetry_fields_query_(nullptr),
|
||||
telemetry_fields_value_get_(nullptr),
|
||||
telemetry_fields_watch_(nullptr),
|
||||
telemetry_fields_unwatch_(nullptr) {
|
||||
rdc_status_t status = lib_loader_.load(lib_name);
|
||||
if (status != RDC_ST_OK) {
|
||||
RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
|
||||
return;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(
|
||||
&telemetry_fields_query_, "rdc_telemetry_fields_query");
|
||||
if (status != RDC_ST_OK) {
|
||||
telemetry_fields_query_ = nullptr;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(
|
||||
&telemetry_fields_value_get_, "rdc_telemetry_fields_value_get");
|
||||
if (status != RDC_ST_OK) {
|
||||
telemetry_fields_value_get_ = nullptr;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(
|
||||
&telemetry_fields_watch_, "rdc_telemetry_fields_watch");
|
||||
if (status != RDC_ST_OK) {
|
||||
telemetry_fields_watch_ = nullptr;
|
||||
}
|
||||
|
||||
status = lib_loader_.load_symbol(
|
||||
&telemetry_fields_unwatch_, "rdc_telemetry_fields_unwatch");
|
||||
if (status != RDC_ST_OK) {
|
||||
telemetry_fields_unwatch_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
RdcRocpLib::~RdcRocpLib() = default;
|
||||
|
||||
// get support field ids
|
||||
rdc_status_t RdcRocpLib::rdc_telemetry_fields_query(
|
||||
uint32_t field_ids[MAX_NUM_FIELDS],
|
||||
uint32_t* field_count) {
|
||||
if (field_count == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
return telemetry_fields_query_(field_ids, field_count);
|
||||
}
|
||||
|
||||
// Fetch
|
||||
rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
rdc_field_value_f callback,
|
||||
void* user_data) {
|
||||
if (fields == nullptr) {
|
||||
return RDC_ST_BAD_PARAMETER;
|
||||
}
|
||||
|
||||
RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocp_lib.");
|
||||
|
||||
return telemetry_fields_value_get_(
|
||||
fields, fields_count, callback, user_data);
|
||||
}
|
||||
|
||||
// Start watching fields by forwarding to the module's
// rdc_telemetry_fields_watch entry point.
//
// Returns RDC_ST_BAD_PARAMETER when fields is null and
// RDC_ST_FAIL_LOAD_MODULE when the entry point was never resolved;
// otherwise returns whatever the module returns.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Calling through an unresolved entry point would be a null call.
  if (!telemetry_fields_watch_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_watch_(fields, fields_count);
}
|
||||
|
||||
// Stop watching fields by forwarding to the module's
// rdc_telemetry_fields_unwatch entry point.
//
// Returns RDC_ST_BAD_PARAMETER when fields is null and
// RDC_ST_FAIL_LOAD_MODULE when the entry point was never resolved;
// otherwise returns whatever the module returns.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // Calling through an unresolved entry point would be a null call.
  if (!telemetry_fields_unwatch_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_unwatch_(fields, fields_count);
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -94,11 +94,15 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(
|
||||
|
||||
RdcTelemetryModule::RdcTelemetryModule(
|
||||
const RdcSmiLibPtr& smi_lib,
|
||||
const RdcRasLibPtr& ras_module) {
|
||||
const RdcRasLibPtr& ras_module,
|
||||
const RdcRocpLibPtr& rocp_module) {
|
||||
telemetry_modules_.push_back(smi_lib);
|
||||
if (ras_module) {
|
||||
telemetry_modules_.push_back(ras_module);
|
||||
}
|
||||
if (rocp_module) {
|
||||
telemetry_modules_.push_back(rocp_module);
|
||||
}
|
||||
|
||||
auto ite = telemetry_modules_.begin();
|
||||
for (; ite != telemetry_modules_.end(); ite++) {
|
||||
@@ -166,4 +170,3 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
# Build script for the rocp telemetry module (lib${RDC_ROCP_LIB}.so).
# The target is only created when BUILD_ROCPTEST is enabled; it links
# against rocmtools and the HSA runtime.
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Cmake RDC Lib-ROCP ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")

set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(INC_DIR "${PROJECT_SOURCE_DIR}/include/rdc_modules/rdc_rocp")

set(RDC_ROCP_LIB_COMPONENT "lib${RDC_ROCP_LIB}")

# Sources of the module. RdcLogger.cc is compiled in from the bootstrap
# library's source directory.
set(RDC_ROCP_LIB_SRC_LIST
    "${BOOTSTRAP_LIB_SRC_DIR}/RdcLogger.cc"
    "${SRC_DIR}/RdcTelemetryLib.cc"
    "${SRC_DIR}/RdcRocpBase.cc")

# Headers attached to the target.
# NOTE(review): RdcDiagnosticLibInterface.h is listed here although this is
# a telemetry module - confirm whether RdcTelemetryLibInterface.h was meant.
set(RDC_ROCP_LIB_INC_LIST
    "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
    "${RDC_LIB_INC_DIR}/RdcDiagnosticLibInterface.h"
    "${RDC_LIB_INC_DIR}/rdc_common.h"
    "${RDC_LIB_INC_DIR}/RdcLogger.h"
    "${INC_DIR}/RdcTelemetryLib.h"
    "${INC_DIR}/RdcRocpBase.h")

if(BUILD_ROCPTEST)
    message("Build librdc_rocp.so is enabled, make sure ROCmTools is installed.")

    message("RDC_ROCP_LIB_INC_LIST=${RDC_ROCP_LIB_INC_LIST}")

    set(ROCMTOOLS_LIB rocmtools::rocmtools)
    # below provides rocmtools::rocmtools package
    include(Findrocmtools)

    set(HSA_LIB "hsa-runtime64")
    # Register the module with the parent directory so it gets installed.
    set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_ROCP_LIB} PARENT_SCOPE)
    add_library(${RDC_ROCP_LIB} SHARED ${RDC_ROCP_LIB_SRC_LIST} ${RDC_ROCP_LIB_INC_LIST})
    target_link_libraries(${RDC_ROCP_LIB} ${RDC_LIB} ${BOOTSTRAP_LIB} ${HSA_LIB} ${ROCMTOOLS_LIB} pthread dl)
    target_include_directories(${RDC_ROCP_LIB} PRIVATE
        "${PROJECT_SOURCE_DIR}"
        "${PROJECT_SOURCE_DIR}/include"
        "${COMMON_DIR}"
        "${RSMI_INC_DIR}"
        "${ROCM_DIR}/include"
        "${ROCM_DIR}/include/hsa")

    # Set the VERSION and SOVERSION values
    set_target_properties(${RDC_ROCP_LIB} PROPERTIES
        SOVERSION "${VERSION_MAJOR}"
        VERSION "${SO_VERSION_STRING}")

    # If the library is a release, strip the target library.
    # $<TARGET_FILE:...> names the file the target actually produced, so the
    # command does not depend on the working directory or on a hard-coded
    # "lib<name>.so" file name.
    if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
        add_custom_command(
            TARGET ${RDC_ROCP_LIB}
            POST_BUILD
            COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${RDC_ROCP_LIB}>
            VERBATIM)
    endif()
endif()
|
||||
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <rocmtools.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
|
||||
|
||||
namespace amd {
|
||||
namespace rdc {
|
||||
|
||||
// Initializes rocmtools. The returned status is only logged; it is not
// stored or acted upon.
RdcRocpBase::RdcRocpBase() {
  const auto init_status = rocmtools_initialize();
  RDC_LOG(RDC_INFO, "rocmtools_initialize status: " << init_status);
}
|
||||
|
||||
// Destroys every remaining session, then finalizes rocmtools.
//
// destroy_session() erases the entry from `sessions` on success, so the map
// must not be walked with iterators while it is being emptied. Instead the
// first key is copied out and destroyed until the map is empty.
RdcRocpBase::~RdcRocpBase() {
  while (!sessions.empty()) {
    const pair_gpu_field_t key = sessions.begin()->first;
    const rdc_status_t destroy_status = destroy_session(key);
    if (destroy_status != RDC_ST_OK) {
      RDC_LOG(
          RDC_ERROR, "Failed to destroy session for field ("
                         << key.second << ") on GPU [" << key.first
                         << "], status: " << destroy_status);
      // destroy_session() keeps the entry when rocmtools reports an error;
      // drop it here so the loop always terminates.
      sessions.erase(key);
    }
  }

  auto status = rocmtools_finalize();
  RDC_LOG(RDC_INFO, "rocmtools_finalize status: " << status);
}
|
||||
|
||||
// Polls the rocmtools session that monitors `gpu_field` and stores the
// result in `*value`.
//
// Returns RDC_ST_NOT_FOUND when no session exists for `gpu_field`,
// RDC_ST_BAD_PARAMETER when `value` is null, the translated rocmtools status
// when polling fails, and RDC_ST_OK otherwise.
//
// The GFLOPS and memory bandwidth fields are reported as the polled counter
// divided by the nanoseconds elapsed since the session's start_time; all
// other fields are reported as the polled counter itself.
rdc_status_t RdcRocpBase::rocp_lookup(
    pair_gpu_field_t gpu_field,
    double* value) {
  if (sessions.empty()) {
    return RDC_ST_NOT_FOUND;
  }

  if (value == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // find() instead of at(): an unknown key is reported as a status rather
  // than as a std::out_of_range exception.
  const auto found = sessions.find(gpu_field);
  if (found == sessions.end()) {
    return RDC_ST_NOT_FOUND;
  }
  // Work on the stored entry so the poll time is kept with the session.
  session_info_t& session = found->second;

  rocmtools_device_profile_metric_t counter;
  const rocmtools_status_t status =
      rocmtools_device_profiling_session_poll(session.id, &counter);
  session.stop_time = std::chrono::high_resolution_clock::now();
  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    return Rocp2RdcError(status);
  }

  const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
                           session.stop_time - session.start_time)
                           .count();

  // some metrics are derived from others and depend on time passed
  switch (gpu_field.second) {
    case RDC_FI_PROF_GFLOPS_16:
    case RDC_FI_PROF_GFLOPS_32:
    case RDC_FI_PROF_GFLOPS_64:
    case RDC_FI_PROF_MEMR_BW_KBPNS:
    case RDC_FI_PROF_MEMW_BW_KBPNS:
      // No time has passed (or the clock stepped backwards): report a zero
      // rate instead of dividing by a non-positive duration.
      *value = (elapsed > 0) ? counter.value.value / elapsed : 0.0;
      break;
    default:
      *value = counter.value.value;
      break;
  }

  return RDC_ST_OK;
}
|
||||
|
||||
// Creates and starts a rocmtools device-profiling session that monitors the
// single counter backing `gpu_field`, and records it in `sessions`.
//
// Returns RDC_ST_ALREADY_EXIST when a session for `gpu_field` is already
// stored, RDC_ST_NOT_SUPPORTED when the field has no entry in counter_map_k,
// the translated rocmtools status when creating or starting the session
// fails, and RDC_ST_OK otherwise. A session is only stored once it has been
// started successfully.
rdc_status_t RdcRocpBase::create_session(pair_gpu_field_t gpu_field) {
  if (sessions.count(gpu_field) != 0) {
    RDC_LOG(
        RDC_DEBUG, "Session for field (" << gpu_field.second << ") on GPU ["
                                         << gpu_field.first
                                         << "] already exists!");
    return RDC_ST_ALREADY_EXIST;
  }

  // find() instead of at(): a field without a counter is reported as a
  // status rather than as a std::out_of_range exception.
  const auto counter = counter_map_k.find(gpu_field.second);
  if (counter == counter_map_k.end()) {
    RDC_LOG(
        RDC_DEBUG,
        "Field (" << gpu_field.second << ") has no rocmtools counter");
    return RDC_ST_NOT_SUPPORTED;
  }

  session_info_t session = {};

  std::vector<const char*> rocmtools_fields = {counter->second};
  // create session
  rocmtools_status_t status = rocmtools_device_profiling_session_create(
      rocmtools_fields.data(), rocmtools_fields.size(), &session.id, 0,
      gpu_field.first);

  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    return Rocp2RdcError(status);
  }

  // add start time
  session.start_time = std::chrono::high_resolution_clock::now();

  // start session
  status = rocmtools_device_profiling_session_start(session.id);
  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    // Do not keep a session that never started: release it and report the
    // start failure. The destroy status is intentionally not propagated.
    rocmtools_device_profiling_session_destroy(session.id);
    return Rocp2RdcError(status);
  }

  sessions.emplace(gpu_field, session);

  return RDC_ST_OK;
}
|
||||
|
||||
// Destroys the rocmtools session stored for `gpu_field`.
//
// Having nothing to destroy is not an error: RDC_ST_OK is returned both when
// no sessions exist at all and when there is none for `gpu_field`. The entry
// is removed from `sessions` only if rocmtools reports success; the
// translated rocmtools status is returned.
rdc_status_t RdcRocpBase::destroy_session(pair_gpu_field_t gpu_field) {
  if (sessions.empty()) {
    RDC_LOG(RDC_DEBUG, "Cannot destroy empty session...");
    return RDC_ST_OK;
  }

  // no session with field
  const auto entry = sessions.find(gpu_field);
  if (entry == sessions.end()) {
    RDC_LOG(
        RDC_DEBUG, "Cannot destroy session with field ("
                       << gpu_field.second << ") on GPU ["
                       << gpu_field.first
                       << "] because it doesn't exist...");
    return RDC_ST_OK;
  }

  const rocmtools_session_id_t doomed_id = entry->second.id;
  const rocmtools_status_t rocm_result =
      rocmtools_device_profiling_session_destroy(doomed_id);

  if (rocm_result == ROCMTOOLS_STATUS_SUCCESS) {
    const auto erased_count = sessions.erase(gpu_field);
    RDC_LOG(RDC_DEBUG, "destroyed (" << erased_count << ") sessions");
  }

  return Rocp2RdcError(rocm_result);
}
|
||||
|
||||
// Translates a rocmtools status into the matching RDC status:
//   success                               -> RDC_ST_OK
//   an active session already exists      -> RDC_ST_ALREADY_EXIST
//   filter mismatch / missing filter /
//   session not found                     -> RDC_ST_BAD_PARAMETER
//   anything else                         -> RDC_ST_UNKNOWN_ERROR
rdc_status_t RdcRocpBase::Rocp2RdcError(rocmtools_status_t rocm_status) {
  if (rocm_status == ROCMTOOLS_STATUS_SUCCESS) {
    return RDC_ST_OK;
  }

  if (rocm_status == ROCMTOOLS_STATUS_ERROR_HAS_ACTIVE_SESSION) {
    return RDC_ST_ALREADY_EXIST;
  }

  const bool caller_error =
      rocm_status == ROCMTOOLS_STATUS_ERROR_SESSION_FILTER_DATA_MISMATCH ||
      rocm_status == ROCMTOOLS_STATUS_ERROR_SESSION_MISSING_FILTER ||
      rocm_status == ROCMTOOLS_STATUS_ERROR_SESSION_NOT_FOUND;

  return caller_error ? RDC_ST_BAD_PARAMETER : RDC_ST_UNKNOWN_ERROR;
}
|
||||
|
||||
} // namespace rdc
|
||||
} // namespace amd
|
||||
@@ -0,0 +1,143 @@
|
||||
/*
|
||||
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#include "rdc/rdc.h"
|
||||
#include "rdc_lib/RdcLogger.h"
|
||||
#include "rdc_lib/RdcTelemetryLibInterface.h"
|
||||
#include "rdc_lib/rdc_common.h"
|
||||
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
|
||||
|
||||
// Single profiler instance shared by the rdc_telemetry_* entry points in
// this file. Its constructor calls rocmtools_initialize() and its
// destructor destroys the remaining sessions and calls
// rocmtools_finalize().
amd::rdc::RdcRocpBase rocp;
|
||||
|
||||
// get supported field ids
|
||||
// TODO: Query fields with rocprofiler
|
||||
rdc_status_t rdc_telemetry_fields_query(
|
||||
uint32_t field_ids[MAX_NUM_FIELDS],
|
||||
uint32_t* field_count) {
|
||||
// extract all keys from counter_map
|
||||
std::vector<uint32_t> counter_keys;
|
||||
counter_keys.reserve(amd::rdc::counter_map_k.size());
|
||||
for (auto it : amd::rdc::counter_map_k) {
|
||||
counter_keys.push_back(it.first);
|
||||
}
|
||||
|
||||
*field_count = counter_keys.size();
|
||||
// copy from vector into array
|
||||
std::copy(counter_keys.begin(), counter_keys.end(), field_ids);
|
||||
|
||||
return RDC_ST_OK;
|
||||
}
|
||||
|
||||
// Fetch
|
||||
rdc_status_t rdc_telemetry_fields_value_get(
|
||||
rdc_gpu_field_t* fields,
|
||||
uint32_t fields_count,
|
||||
rdc_field_value_f callback,
|
||||
void* user_data) {
|
||||
//
|
||||
// Bulk fetch fields
|
||||
std::vector<rdc_gpu_field_value_t> bulk_results;
|
||||
|
||||
struct timeval tv {};
|
||||
gettimeofday(&tv, nullptr);
|
||||
const uint64_t curTime =
|
||||
static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
|
||||
|
||||
// Fetch it one by one for left fields
|
||||
const int BULK_FIELDS_MAX = 16;
|
||||
rdc_gpu_field_value_t values[BULK_FIELDS_MAX];
|
||||
uint32_t bulk_count = 0;
|
||||
rdc_status_t status = RDC_ST_UNKNOWN_ERROR;
|
||||
double value = 0;
|
||||
for (uint32_t i = 0; i < fields_count; i++) {
|
||||
if (bulk_count >= BULK_FIELDS_MAX) {
|
||||
status = callback(values, bulk_count, user_data);
|
||||
// When the callback returns errors, stop processing and return.
|
||||
if (status != RDC_ST_OK) {
|
||||
return status;
|
||||
}
|
||||
bulk_count = 0;
|
||||
}
|
||||
|
||||
RDC_LOG(RDC_DEBUG, "ID: " << fields[i].field_id);
|
||||
|
||||
status = rocp.rocp_lookup(
|
||||
std::make_pair(fields[i].gpu_index, fields[i].field_id), &value);
|
||||
|
||||
// get value
|
||||
values[bulk_count].gpu_index = fields[i].gpu_index;
|
||||
values[bulk_count].field_value.type = DOUBLE;
|
||||
values[bulk_count].field_value.status = status;
|
||||
values[bulk_count].field_value.ts = curTime;
|
||||
values[bulk_count].field_value.value.dbl = value;
|
||||
values[bulk_count].field_value.field_id = fields[i].field_id;
|
||||
RDC_LOG(RDC_DEBUG, "VALUE: " << value);
|
||||
bulk_count++;
|
||||
}
|
||||
if (bulk_count != 0) {
|
||||
rdc_status_t status = callback(values, bulk_count, user_data);
|
||||
if (status != RDC_ST_OK) {
|
||||
return status;
|
||||
}
|
||||
bulk_count = 0;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Creates a profiling session for each of the given fields.
// Every field is attempted even if an earlier one failed; the status of the
// last failing field is returned, or RDC_ST_OK when none failed.
rdc_status_t rdc_telemetry_fields_watch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  rdc_status_t result = RDC_ST_OK;

  rdc_gpu_field_t* const last = fields + fields_count;
  for (rdc_gpu_field_t* field = fields; field != last; ++field) {
    RDC_LOG(RDC_DEBUG, "WATCH: " << field->field_id);

    const rdc_status_t rc = rocp.create_session(
        std::make_pair(field->gpu_index, field->field_id));
    if (rc == RDC_ST_OK) {
      continue;
    }
    result = rc;
  }

  return result;
}
|
||||
|
||||
// Destroys the profiling session of each of the given fields.
// Every field is attempted even if an earlier one failed; the status of the
// last failing field is returned, or RDC_ST_OK when none failed.
rdc_status_t rdc_telemetry_fields_unwatch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  rdc_status_t result = RDC_ST_OK;

  rdc_gpu_field_t* const last = fields + fields_count;
  for (rdc_gpu_field_t* field = fields; field != last; ++field) {
    RDC_LOG(RDC_DEBUG, "UNWATCH: " << field->field_id);

    const rdc_status_t rc = rocp.destroy_session(
        std::make_pair(field->gpu_index, field->field_id));
    // return last non-ok status
    if (rc == RDC_ST_OK) {
      continue;
    }
    result = rc;
  }

  return result;
}
|
||||
@@ -36,15 +36,15 @@ if(BUILD_ROCRTEST)
|
||||
message("RDC_ROCR_LIB_INC_LIST=${RDC_ROCR_LIB_INC_LIST}")
|
||||
|
||||
set(HSA_LIB "hsa-runtime64")
|
||||
set(RDC_LIBS_MODULES ${RDC_LIBS_MODULES} ${RDC_ROCR_LIB} PARENT_SCOPE)
|
||||
set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_ROCR_LIB} PARENT_SCOPE)
|
||||
add_library(${RDC_ROCR_LIB} SHARED ${RDC_ROCR_LIB_SRC_LIST} ${RDC_ROCR_LIB_INC_LIST})
|
||||
target_link_libraries(${RDC_ROCR_LIB} ${RDC_LIB} ${BOOTSTRAP_LIB} ${HSA_LIB} pthread dl)
|
||||
target_include_directories(${RDC_ROCR_LIB} PRIVATE
|
||||
"${RSMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include"
|
||||
"${PROJECT_SOURCE_DIR}"
|
||||
"${PROJECT_SOURCE_DIR}/include"
|
||||
"${COMMON_DIR}")
|
||||
"${COMMON_DIR}"
|
||||
"${RSMI_INC_DIR}"
|
||||
"${ROCM_DIR}/include")
|
||||
|
||||
# Set the VERSION and SOVERSION values
|
||||
set_property(TARGET ${RDC_ROCR_LIB} PROPERTY
|
||||
|
||||
@@ -1,9 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# This will return 0 if an id is created and non-zero if
|
||||
# it already exists
|
||||
# https://www.debian.org/doc/debian-policy/ch-opersys.html#users-and-groups
|
||||
do_create_rdc_user() {
|
||||
useradd -r -s /bin/nologin rdc
|
||||
adduser \
|
||||
--system \
|
||||
--quiet \
|
||||
--home /nonexistent \
|
||||
--no-create-home \
|
||||
--disabled-password \
|
||||
rdc
|
||||
if [ $(getent group render) ]; then
|
||||
usermod -a -G render rdc
|
||||
else
|
||||
@@ -13,16 +22,25 @@ do_create_rdc_user() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# Symlink the packaged rdc.service into /lib/systemd/system.
# Does nothing when /run/systemd/system is not a directory.
create_rdc_service() {
    [ -d /run/systemd/system ] || return 0
    ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service /lib/systemd/system/rdc.service
}
|
||||
|
||||
reload_systemd() {
|
||||
systemctl daemon-reload
|
||||
if [ -d /run/systemd/system ]; then
|
||||
systemctl daemon-reload
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
case "$1" in
|
||||
configure)
|
||||
do_create_rdc_user
|
||||
#Symlink RDC Service
|
||||
ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service /lib/systemd/system/rdc.service
|
||||
create_rdc_service
|
||||
reload_systemd
|
||||
exit 0
|
||||
;;
|
||||
|
||||
@@ -3,13 +3,24 @@
|
||||
set -e
|
||||
|
||||
stop_rdc() {
|
||||
#stop RDC if running
|
||||
systemctl stop rdc
|
||||
if [ -d /run/systemd/system ]; then
|
||||
#stop RDC if running
|
||||
systemctl stop rdc
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
# Remove the rdc.service symlink from /lib/systemd/system.
# Does nothing when /run/systemd/system is not a directory (same test the
# other systemd helpers in this script use). `rm -f` is used instead of
# `unlink` so a link that was never created does not produce an error,
# which would abort this script because of `set -e`.
rm_rdc_service() {
    if [ -d /run/systemd/system ]; then
        rm -f /lib/systemd/system/rdc.service
    fi
    return 0
}
|
||||
|
||||
reload_systemd() {
|
||||
systemctl daemon-reload
|
||||
if [ -d /run/systemd/system ]; then
|
||||
systemctl daemon-reload
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -21,7 +32,7 @@ rm_pyc() {
|
||||
case "$1" in
|
||||
remove | upgrade )
|
||||
stop_rdc
|
||||
unlink /lib/systemd/system/rdc.service
|
||||
rm_rdc_service
|
||||
reload_systemd
|
||||
rm_pyc
|
||||
;;
|
||||
|
||||
@@ -1,17 +1,30 @@
|
||||
#!/bin/bash
|
||||
|
||||
stop_rdc() {
|
||||
#stop RDC if running
|
||||
systemctl stop rdc
|
||||
if [ -d /run/systemd/system ]; then
|
||||
systemctl stop rdc
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
# Remove the rdc.service symlink from @DISTRO_ROOT@.
# Does nothing when /run/systemd/system is not a directory (same test the
# other systemd helpers in this script use). `rm -f` is used instead of
# `unlink` so a link that was never created does not produce an error.
rm_rdc_service() {
    if [ -d /run/systemd/system ]; then
        rm -f @DISTRO_ROOT@/rdc.service
    fi
    return 0
}
|
||||
|
||||
reload_systemd() {
|
||||
systemctl daemon-reload
|
||||
if [ -d /run/systemd/system ]; then
|
||||
systemctl daemon-reload
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
if [ $1 -le 1 ]; then
|
||||
# perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
|
||||
stop_rdc
|
||||
unlink @DISTRO_ROOT@/rdc.service
|
||||
rm_rdc_service
|
||||
reload_systemd
|
||||
fi
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
# https://fedoraproject.org/wiki/Packaging%3aUsersAndGroups
|
||||
do_create_rdc_user() {
|
||||
useradd -r -s /bin/nologin rdc
|
||||
useradd -r -s /sbin/nologin rdc
|
||||
if [ $(getent group render) ]; then
|
||||
usermod -a -G render rdc
|
||||
else
|
||||
@@ -12,14 +13,23 @@ do_create_rdc_user() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# Symlink the packaged rdc.service into @DISTRO_ROOT@.
# Does nothing when /run/systemd/system is not a directory.
create_rdc_service() {
    [ -d /run/systemd/system ] || return 0
    ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service @DISTRO_ROOT@/rdc.service
}
|
||||
|
||||
reload_systemd() {
|
||||
systemctl daemon-reload
|
||||
if [ -d /run/systemd/system ]; then
|
||||
systemctl daemon-reload
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
do_create_rdc_user
|
||||
|
||||
#Symlink RDC Service
|
||||
ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service @DISTRO_ROOT@/rdc.service
|
||||
create_rdc_service
|
||||
|
||||
#Request systemctl to reload file since RDC is adding new file/service
|
||||
reload_systemd
|
||||
|
||||
Ссылка в новой задаче
Block a user