This commit adds integration with ROCmTools

Additional changes:
- Fix DEB and RPM installation issue when systemd is not present
- Fix typos in rdc.h
- Wrap negative values in parentheses in rdc.h
- CMAKE: Improve rocm_smi searching
- README: Improve formatting, add info about ROCmTools

Metrics added: 700-714
Metrics can be listed with `rdci dmon --list-all`
Majority of the metrics are only supported by Instinct (MI) series GPUs
700 RDC_FI_PROF_ELAPSED_CYCLES should be available on most devices
See README for more information

Change-Id: I907d3eacdc92fc5588ca6c76c2fa1ce0ad900770
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: 861a843ed7]
Этот коммит содержится в:
Galantsev, Dmitrii
2022-12-13 14:37:59 -06:00
родитель 001461e975
Коммит eccb4e202c
26 изменённых файлов: 1061 добавлений и 107 удалений
+3
Просмотреть файл
@@ -1,3 +1,6 @@
# my install directory used for testing
install/
# build directories generated by cmake
build/
cmake/build/
+11 -3
Просмотреть файл
@@ -49,6 +49,10 @@ option(BUILD_RASLIB "Build targets for raslib" OFF)
# which requires the Rocm run time.
option(BUILD_ROCRTEST "Build targets for librdc_rocr.so" ON)
# When cmake -DBUILD_ROCPTEST=off, it will not build the librdc_rocp.so
# which requires the Rocm profiler.
option(BUILD_ROCPTEST "Build targets for librdc_rocp.so" ON)
# When cmake -DBUILD_TESTS=off, it will not build RDC tests.
option(BUILD_TESTS "Build test suite" OFF)
@@ -81,9 +85,6 @@ endif()
set(COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/common")
set(RSMI_INC_DIR "${ROCM_DIR}/${CMAKE_INSTALL_INCLUDEDIR}" CACHE INTERNAL "ROCm SMI include directory.")
set(RSMI_LIB_DIR "${ROCM_DIR}/${CMAKE_INSTALL_LIBDIR}" CACHE INTERNAL "ROCm SMI library directory.")
set(GRPC_ROOT_DEFAULT "/usr")
set(GRPC_ROOT ${GRPC_ROOT_DEFAULT} CACHE STRING "GRPC installation directory.")
set(GRPC_DESIRED_VERSION 1.44.0 CACHE STRING "GRPC desired package version.")
@@ -141,6 +142,13 @@ if(NOT EXISTS "${CMAKE_SOURCE_DIR}/raslib/.git" AND BUILD_RASLIB)
If you do not want to build raslib, use cmake -DBUILD_RASLIB=off")
endif()
# Locate the rocm_smi package configuration file.
# NAMES and HINTS are config-mode options of find_package(); CONFIG states that
# mode explicitly. (The previous "CONFIGURE" is not a find_package keyword and,
# placed after HINTS, was consumed as one more hint path.)
find_package(RSMI
    NAMES rocm_smi
    HINTS "${ROCM_DIR}/lib/cmake"
    CONFIG REQUIRED)
set(RSMI_INC_DIR "${ROCM_SMI_INCLUDE_DIR}" CACHE INTERNAL "ROCm SMI include directory.")
set(RSMI_LIB_DIR "${ROCM_SMI_LIB_DIR}" CACHE INTERNAL "ROCm SMI library directory.")
if(NOT EXISTS "${RSMI_INC_DIR}" OR NOT EXISTS "${RSMI_LIB_DIR}")
message(FATAL_ERROR "rocm_smi not found in ${RSMI_INC_DIR}. Please
make sure rocm_smi is installed and present in ${RSMI_INC_DIR}.")
+104 -55
Просмотреть файл
@@ -23,6 +23,7 @@ RDC can run on AMD ROCm supported platforms, please refer to [List of Supported
Latex (pdfTeX 3.14159265-2.6-1.40.16) ## required to build the latest documentation
gRPC and protoc ## required for communication
libcap-dev ## required to manage the privileges.
rocmtools ## required for profiler metrics
AMD ROCm platform (https://github.com/RadeonOpenCompute/ROCm)
* It is recommended to install the complete AMD ROCm platform.
@@ -30,6 +31,8 @@ RDC can run on AMD ROCm supported platforms, please refer to [List of Supported
* At the minimum, these two components are required
(i) AMD ROCm SMI Library (https://github.com/RadeonOpenCompute/rocm_smi_lib)
(ii) AMD ROCk Kernel driver (https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver)
* For profiler metrics, this component is required:
(i) AMD ROCm Tools (https://github.com/ROCm-Developer-Tools/rocmtools)
## Building gRPC and protoc
**NOTE:** gRPC and protoc compiler must be built when building RDC from source as pre-built packages are not available. When installing RDC from a package, gRPC and protoc will be installed from the package.
@@ -40,55 +43,65 @@ The following tools are required for gRPC build & installation
automake make g++ unzip build-essential autoconf libtool pkg-config libgflags-dev libgtest-dev clang-5.0 libc++-dev curl
Download and build gRPC
Download and build gRPC
```bash
git clone -b v1.44.0 https://github.com/grpc/grpc
cd grpc
git submodule update --init
mkdir -p cmake/build
cd cmake/build
```
$ git clone -b v1.44.0 https://github.com/grpc/grpc
$ cd grpc
$ git submodule update --init
$ mkdir -p cmake/build
$ cd cmake/build
## By default (without using CMAKE_INSTALL_PREFIX option), the following will install to /usr/local lib, include and bin directories
$ cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON <-DCMAKE_INSTALL_PREFIX=<install dir>> ../..
$ make
$ sudo make install
$ echo "<install dir>/lib" | sudo tee /etc/ld.so.conf.d/grpc.conf
By default (without using CMAKE_INSTALL_PREFIX option), the following will install to /usr/local lib, include and bin directories
```bash
cmake -DgRPC_INSTALL=ON -DBUILD_SHARED_LIBS=ON <-DCMAKE_INSTALL_PREFIX=<install dir>> ../..
make
sudo make install
echo "<install dir>/lib" | sudo tee /etc/ld.so.conf.d/grpc.conf
```
## Building RDC
Clone the RDC source code from GitHub and use CMake to build and install
$ git clone https://github.com/RadeonOpenCompute/rdc
$ cd rdc
$ mkdir -p build; cd build
$ cmake -DROCM_DIR=/opt/rocm -DGRPC_ROOT="$GRPC_PROTOC_ROOT" <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
$ make
$ make install ## default installation location is /opt/rocm
```bash
git clone https://github.com/RadeonOpenCompute/rdc
cd rdc
mkdir -p build; cd build
cmake -DGRPC_ROOT="$GRPC_PROTOC_ROOT" <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
make
make install ## default installation location is /opt/rocm, specify with -DROCM_DIR
```
## Building RDC library only without gRPC (optional)
If only the RDC libraries are needed (i.e. only "embedded mode" is required), the user can choose to not build rdci and rdcd. This will eliminate the need for gRPC and protoc. To build in this way, -DBUILD_STANDALONE=off should be passed on the cmake command line:
$ cmake -DROCM_DIR=/opt/rocm -DBUILD_STANDALONE=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
```bash
cmake -DBUILD_STANDALONE=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
```
## Building RDC library without ROCM Run time (optional)
The user can choose to not build RDC diagnostic ROCM Run time. This will eliminate the need for ROCM Run time. To build in this way, -DBUILD_ROCRTEST=off should be passed on the cmake command line:
$ cmake -DROCM_DIR=/opt/rocm -DBUILD_ROCRTEST=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
```bash
cmake -DBUILD_ROCRTEST=off <-DCMAKE_INSTALL_PREFIX=<install dir>> ..
```
## Update System Library Path
The following commands need to be executed as root (sudo). It may be easiest to put them into a script and then run that script as root:
$ RDC_LIB_DIR=<RDC install dir>/lib
$ GRPC_LIB_DIR=<gRPC install dir>/lib
$ echo "$GRPC_LIB_DIR" > /etc/ld.so.conf.d/x86_64-librdc_client.conf
$ echo "$GRPC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
$ echo "$RDC_LIB_DIR" >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
$ echo "$RDC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
$ ldconfig
```bash
RDC_LIB_DIR=<RDC install dir>/lib
GRPC_LIB_DIR=<gRPC install dir>/lib
echo "$GRPC_LIB_DIR" > /etc/ld.so.conf.d/x86_64-librdc_client.conf
echo "$GRPC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
echo "$RDC_LIB_DIR" >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
echo "$RDC_LIB_DIR"64 >> /etc/ld.so.conf.d/x86_64-librdc_client.conf
ldconfig
```
# Running RDC
RDC supports encrypted communications between clients and servers. The
@@ -100,48 +113,84 @@ For an RDC client application to monitor and/or control a remote system, the RDC
### Start RDCD from command-line
When *rdcd* is started from a command-line the *capabilities* are determined by privilege of the *user* starting *rdcd*
$ cd rdc_install_prefix ## If specified in Building RDC section
```bash
## If RDC_FI_PROF_* metrics are required - you MUST export ROCMTOOLS_METRICS_PATH before starting rdcd
export ROCMTOOLS_METRICS_PATH=/opt/rocm-<version>/libexec/rocmtools/counters/derived_counters.xml
## To run with authentication. Ensure SSL keys are setup properly
## version will be the version number(ex:3.10.0) of ROCm where RDC was pacakged with
$ /opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started with monitor-only capabilities
$ sudo /opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started will full-capabilities
cd rdc_install_prefix ## If specified in Building RDC section
## To run without authentication. SSL key & certificates are not required.
## version will be the version number(ex:3.10.0) of ROCm where RDC was pacakged with
$ /opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started with monitor-only capabilities
$ sudo /opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started will full-capabilities
## To run with authentication. Ensure SSL keys are set up properly
## version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with
/opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started with monitor-only capabilities
sudo /opt/rocm-<version>/rdc/bin/rdcd ## rdcd is started with full-capabilities
## To run without authentication. SSL key & certificates are not required.
## version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with
/opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started with monitor-only capabilities
sudo /opt/rocm-<version>/rdc/bin/rdcd -u ## rdcd is started with full-capabilities
```
### Start RDCD using systemd
*rdcd* can be started by using the systemctl command. You can copy /opt/rocm-\<version\>/rdc/lib/rdc.service, which is installed with RDC, to the systemd folder. This file has 2 lines that control the *capabilities* with which *rdcd* will run. If left uncommented, rdcd will run with full-capabilities.
```bash
## file: /opt/rocm-<version>/rdc/lib/rdc.service
## Comment the following two lines to run with monitor-only capabilities
CapabilityBoundingSet=CAP_DAC_OVERRIDE
AmbientCapabilities=CAP_DAC_OVERRIDE
```
## file: /opt/rocm-<version>/rdc/lib/rdc.service
## Comment the following two lines to run with monitor-only capabilities
CapabilityBoundingSet=CAP_DAC_OVERRIDE
AmbientCapabilities=CAP_DAC_OVERRIDE
systemctl start rdc ## start rdc as systemd service
```bash
systemctl start rdc ## start rdc as systemd service
```
## Invoke RDC using ROCm™ Data Center Interface (RDCI)
RDCI provides command-line interface to all RDC features. This CLI can be run locally or remotely. Refer to [**user guide**](https://docs.amd.com/bundle/ROCm-Data-Center-Tool-User-Guide-v5.1/page/Feature_Overview.html) for the current list of features.
## sample rdci commands to test RDC functionality
## discover devices in a local or remote compute node
## NOTE: option -u (for unauthenticated) is required is rdcd was started in this mode
```bash
## sample rdci commands to test RDC functionality
## discover devices in a local or remote compute node
## NOTE: option -u (for unauthenticated) is required if rdcd was started in this mode
$ cd rdc_install_prefix ## If specified in Building RDC section
./opt/rocm-<version>/rdc/bin/rdci discovery -l <-u> ## list available GPUs in localhost
./opt/rocm-<version>/rdc/bin/rdci discovery <host> -l <-u> ## list available GPUs in host machine
cd rdc_install_prefix ## If specified in Building RDC section
cd ./opt/rocm-<version>/rdc/bin
./rdci discovery -l <-u> ## list available GPUs in localhost
./rdci discovery <host> -l <-u> ## list available GPUs in host machine
./rdci dmon <host> <-u> -l ## list most GPU counters
# assuming rdcd is running locally, using -u instead of <host>
./rdci dmon -u --list-all ## list all GPU counters
./rdci dmon -u -i 0 -c 1 -e 100 ## monitor field 100 on gpu 0 for count of 1
./rdci dmon -u -i 0 -c 1 -e 1,2 ## monitor fields 1,2 on gpu 0 for count of 1
# below requires rocmtools to be installed
./rdci dmon -u -i 0 -c 5 -e 700 ## monitor field 700 on gpu 0 for count of 5
# below is only likely to work on MI series GPUs
./rdci dmon -u -i 0 -c 5 -e 700,701,702,706 ## monitor fields 700,701,702,706
```
## Troubleshooting rdcd
Log messages that can provide useful debug information.
- Log messages that can provide useful debug information.
## If rdcd was started as a systemd service, then use journalctl to view rdcd logs
journalctl -u rdc
If rdcd was started as a systemd service, then use journalctl to view rdcd logs
```bash
journalctl -u rdc
```
## To run rdcd with debug log from command-line use
## version will be the version number(ex:3.10.0) of ROCm where RDC was pacakged with
RDC_LOG=DEBUG /opt/rocm-<version>/rdc/bin/rdcd
To run rdcd with debug log from command-line use
version will be the version number(ex:3.10.0) of ROCm where RDC was packaged with
```bash
RDC_LOG=DEBUG /opt/rocm-<version>/rdc/bin/rdcd
```
RDC_LOG=DEBUG also works on rdci
ERROR, INFO, DEBUG logging levels are supported
- All `RDC_FI_PROF_*` metrics return N/A
1. Is `ROCMTOOLS_METRICS_PATH` set?
2. Does your GPU support selected fields?
Field 700 (`RDC_FI_PROF_ELAPSED_CYCLES`) is supposed to be accessible on most GPUs.
Others are mostly intended for MI series.
3. Set `RDC_LOG=DEBUG` as stated above
4. Is rocmtools installed? Can you find `librocmtools.so`?
+38
Просмотреть файл
@@ -0,0 +1,38 @@
# Find module for ROCmTools; provides the rocmtools::rocmtools imported target.
#
# Input:
#   ROCM_DIR               - ROCm installation prefix; set to /opt/rocm here
#                            when the caller has not defined it.
#   rocmtools_INCLUDE_DIR  - may be predefined to skip the header search.
#
# Output:
#   rocmtools_FOUND        - result of find_package_handle_standard_args()
#   rocmtools_LIBRARY      - full path of the rocmtools library
#   rocmtools_INCLUDE_DIR  - directory that contains rocmtools.h
#   rocmtools::rocmtools   - UNKNOWN IMPORTED library carrying both of the above
set(NAME rocmtools)

if(NOT DEFINED ROCM_DIR)
    set(ROCM_DIR "/opt/rocm")
endif()

find_library(
    ${NAME}_LIBRARY
    NAMES ${NAME} ${NAME}64
    HINTS "${ROCM_DIR}"
    REGISTRY_VIEW BOTH
    PATH_SUFFIXES lib)

# DEFINED expects a bare variable name. Wrapping the name in parentheses makes
# CMake treat it as a nested condition rather than a name, so the guard no
# longer tested whether the include directory had been provided.
if(NOT DEFINED ${NAME}_INCLUDE_DIR)
    find_path(
        ${NAME}_INCLUDE_DIR
        NAMES ${NAME}.h
        HINTS "${ROCM_DIR}/include"
        PATH_SUFFIXES ${NAME} ${NAME}/inc)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(${NAME}
    FOUND_VAR ${NAME}_FOUND
    REQUIRED_VARS
        ${NAME}_LIBRARY
        ${NAME}_INCLUDE_DIR)

# Create the imported target once, even if this module is included repeatedly.
if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME})
    add_library(${NAME}::${NAME} UNKNOWN IMPORTED)
    # PC_rocmtools_CFLAGS_OTHER is not set anywhere in this module (no
    # pkg-config lookup is performed), so the compile options are empty unless
    # the including project defines that variable.
    set_target_properties(${NAME}::${NAME} PROPERTIES
        IMPORTED_LOCATION "${${NAME}_LIBRARY}"
        INTERFACE_COMPILE_OPTIONS "${PC_${NAME}_CFLAGS_OTHER}"
        INTERFACE_INCLUDE_DIRECTORIES "${${NAME}_INCLUDE_DIR}")
endif()
+2
Просмотреть файл
@@ -58,9 +58,11 @@ function(create_library_symlink)
endforeach()
# Symlink for private libraries
# File names of the private module libraries that receive symlinks.
set(LIB_RDC_ROCR "librdc_rocr.so")
set(LIB_RDC_ROCP "librdc_rocp.so")
set(LIB_RDC_RAS "librdc_ras.so")
set(LIB_RDC_CLIENT_SMI "librdc_client_smi.so")
# The first set() starts the list; every later one must carry
# "${library_files}" forward. The ROCP line previously omitted it and thereby
# dropped the three ROCR entries from the list.
set(library_files "${LIB_RDC_ROCR}" "${LIB_RDC_ROCR}.${MAJ_VERSION}" "${LIB_RDC_ROCR}.${SO_VERSION}" )
set(library_files "${library_files}" "${LIB_RDC_ROCP}" "${LIB_RDC_ROCP}.${MAJ_VERSION}" "${LIB_RDC_ROCP}.${SO_VERSION}" )
set(library_files "${library_files}" "${LIB_RDC_CLIENT_SMI}" "${LIB_RDC_CLIENT_SMI}.${MAJ_VERSION}" "${LIB_RDC_CLIENT_SMI}.${SO_VERSION}" )
set(library_files "${library_files}" "${LIB_RDC_RAS}")
+26
Просмотреть файл
@@ -29,6 +29,10 @@ THE SOFTWARE.
// 4 bool do or do not display in rdci
// rdc_field_t Description rdci label To Display
// =========== =========== ========= ==========
#ifndef FLD_DESC_ENT
#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY)
#endif
FLD_DESC_ENT(RDC_FI_INVALID, "Unknown/Invalid field", "INVALID", false)
FLD_DESC_ENT(RDC_FI_GPU_COUNT, "GPU count in the system", "GPU_COUNT", true)
FLD_DESC_ENT(RDC_FI_DEV_NAME, "Name of the device", "DEV_NAME", true)
@@ -73,6 +77,28 @@ FLD_DESC_ENT(RDC_FI_ECC_FUSE_DED, "FUSE Double Error Detection",
FLD_DESC_ENT(RDC_FI_ECC_UMC_SEC, "UMC Single Error Correction", "ECC_UMC_SEC", true)
FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection", "ECC_UMC_DED", true)
// ROCProfiler fields
// These field ids don't map to rocprofiler counters directly.
// See counter_map_k in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocmtools
// All entries in this group pass `false` as the last argument
// (do not display in rdci).
// NOTE(review): the label of the first entry is "PROF_ELAPSED_COUNT" while the
// field is RDC_FI_PROF_ELAPSED_CYCLES - confirm whether that is intended.
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "PROF_ELAPSED_COUNT", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "PROF_ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "PROF_ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_CU_OCCUPANCY, "Active Waves / maximum Active Waves per CU", "PROF_CU_OCCUPANCY", false)
FLD_DESC_ENT(RDC_FI_PROF_CU_UTILIZATION, "Active Cycles / total Elapsed Cycles", "PROF_CU_UTILIZATION", false)
FLD_DESC_ENT(RDC_FI_PROF_FETCH_SIZE, "kb fetched from video memory", "PROF_FETCH_SIZE", false)
FLD_DESC_ENT(RDC_FI_PROF_WRITE_SIZE, "kb written to video memory", "PROF_WRITE_SIZE", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_16, "Number of fp16 OPS / second", "PROF_FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_32, "Number of fp32 OPS / second", "PROF_FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_FLOPS_64, "Number of fp64 OPS / second", "PROF_FLOPS_64", false)
// NOTE(review): an earlier TODO here said to uncomment the entries below once
// they are implemented, but they are already active - confirm whether the
// GFLOPS and bandwidth fields are implemented or should be disabled.
FLD_DESC_ENT(RDC_FI_PROF_GFLOPS_16, "Number of fp16 GOPS / second", "PROF_GFLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_GFLOPS_32, "Number of fp32 GOPS / second", "PROF_GFLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_GFLOPS_64, "Number of fp64 GOPS / second", "PROF_GFLOPS_64", false)
FLD_DESC_ENT(RDC_FI_PROF_MEMR_BW_KBPNS, "HBM Read Bandwidth in kb/ns", "PROF_MEMR_BW_KBPNS", false)
FLD_DESC_ENT(RDC_FI_PROF_MEMW_BW_KBPNS, "HBM Write Bandwidth in kb/ns", "PROF_MEMW_BW_KBPNS", false)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_REQ_TX, "Outgoing requests to neighbor 0", "XGMI_REQ_0", false)
FLD_DESC_ENT(RDC_EVNT_XGMI_0_RESP_TX, "Outgoing responses to neighbor 0", "XGMI_RES_0", false)
+31 -12
Просмотреть файл
@@ -104,11 +104,11 @@ typedef enum {
//! ID used to represent an invalid GPU
#define GPU_ID_INVALID -1
#define GPU_ID_INVALID (-1)
//! Used to specify all GPUs
#define RDC_GROUP_ALL_GPUS -1000
#define RDC_GROUP_ALL_GPUS (-1000)
//! Used to specify all stats fields
#define RDC_JOB_STATS_FIELDS -1000
#define RDC_JOB_STATS_FIELDS (-1000)
/**
* @brief The max rdc field string length
@@ -223,6 +223,25 @@ typedef enum {
RDC_FI_ECC_UMC_SEC, //!< UMC Single Error Correction
RDC_FI_ECC_UMC_DED, //!< UMC Double Error Detection
/**
* @brief ROC-profiler related fields
*/
RDC_FI_PROF_ELAPSED_CYCLES = 700, //!< Number of elapsed cycles over all SMs
RDC_FI_PROF_ACTIVE_WAVES, //!< Number of Active Waves
RDC_FI_PROF_ACTIVE_CYCLES, //!< Number of Active Cycles
RDC_FI_PROF_CU_OCCUPANCY, //!< Active Waves / maximum active Waves supported
RDC_FI_PROF_CU_UTILIZATION, //!< Total active cycles / Total elapsed cycles
RDC_FI_PROF_FETCH_SIZE, //!< Number of kilobytes fetched from video memory
RDC_FI_PROF_WRITE_SIZE, //!< Number of kilobytes written to video memory
RDC_FI_PROF_FLOPS_16, //!< Number of fp16 OPS / second
RDC_FI_PROF_FLOPS_32, //!< Number of fp32 OPS / second
RDC_FI_PROF_FLOPS_64, //!< Number of fp64 OPS / second
RDC_FI_PROF_GFLOPS_16, //!< Number of fp16 GOPS / second
RDC_FI_PROF_GFLOPS_32, //!< Number of fp32 GOPS / second
RDC_FI_PROF_GFLOPS_64, //!< Number of fp64 GOPS / second
RDC_FI_PROF_MEMR_BW_KBPNS, //!< HBM Read Bandwidth in kilobytes / nanosecond
RDC_FI_PROF_MEMW_BW_KBPNS, //!< HBM Write Bandwidth in kilobytes / nanosecond
/*
* @brief Raw XGMI counter events
*/
@@ -253,7 +272,7 @@ typedef enum {
//!< neighbor 1
RDC_EVNT_XGMI_1_BEATS_TX, //!< Data beats sent to
//!< neighbor 1; Each beat
//!< represnts 32 bytes
//!< represents 32 bytes
// "Composite" events. These events have additional processing beyond
// the value provided by the rocm_smi library.
@@ -328,7 +347,7 @@ typedef struct {
uint64_t energy_consumed; //!< GPU Energy consumed
uint64_t ecc_correct; //!< Correctable errors
uint64_t ecc_uncorrect; //!< Uncorrtable errors
uint64_t ecc_uncorrect; //!< Uncorrectable errors
rdc_stats_summary_t pcie_tx; //!< Bytes sent over PCIe stats
rdc_stats_summary_t pcie_rx; //!< Bytes received over PCIe stats
rdc_stats_summary_t power_usage; //!< GPU Power usage stats
@@ -348,7 +367,7 @@ typedef struct {
uint32_t num_gpus; //!< Number of GPUs used by job
rdc_gpu_usage_info_t summary; //!< Job usage summary statistics
//!< (overall)
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary staticstics by GPU
rdc_gpu_usage_info_t gpus[16]; //!< Job usage summary statistics by GPU
} rdc_job_info_t;
/**
@@ -958,18 +977,18 @@ rdc_status_t rdc_field_unwatch(rdc_handle_t p_rdc_handle,
/**
* @brief Run the diagnostic test cases
*
* @details Run the diagnostic test cases at differenet levles.
* @details Run the diagnostic test cases at different levels.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] level The level decides how long the test will run.
* The RDC_DIAG_LVL_SHORT only take a few seconds, and the
* The RDC_DIAG_LVL_SHORT only takes a few seconds, and
* the RDC_DIAG_LVL_LONG may take up to 15 minutes.
*
*
* @param[inout] response The detail results of the tests run.
*
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_diagnostic_run(
@@ -988,9 +1007,9 @@ rdc_status_t rdc_diagnostic_run(
* @param[in] group_id The GPU group id.
*
* @param[in] test_case The test case to run.
*
*
* @param[inout] result The results of the test.
*
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_test_case_run(
+7 -4
Просмотреть файл
@@ -23,21 +23,24 @@ THE SOFTWARE.
#define INCLUDE_RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_
#include <memory>
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcMetricFetcher.h"
#include "rdc_lib/RdcModuleMgr.h"
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rdc_lib/impl/RdcRocpLib.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
namespace amd {
namespace rdc {
class RdcModuleMgrImpl: public RdcModuleMgr {
class RdcModuleMgrImpl : public RdcModuleMgr {
public:
RdcTelemetryPtr get_telemetry_module() override;
RdcDiagnosticPtr get_diagnostic_module() override;
explicit RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher);
private:
// Function module
RdcTelemetryPtr rdc_telemetry_module_;
@@ -48,10 +51,10 @@ class RdcModuleMgrImpl: public RdcModuleMgr {
RdcSmiLibPtr smi_lib_;
RdcMetricFetcherPtr fetcher_;
RdcRocrLibPtr rocr_lib_;
RdcRocpLibPtr rocp_lib_;
};
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCMODULEMGRIMPL_H_
+92
Просмотреть файл
@@ -0,0 +1,92 @@
/*
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef INCLUDE_RDC_LIB_IMPL_RDCROCPLIB_H_
#define INCLUDE_RDC_LIB_IMPL_RDCROCPLIB_H_
#include <cstdint>
#include <memory>
#include <vector>
#include "rdc_lib/RdcLibraryLoader.h"
#include "rdc_lib/RdcTelemetry.h"
namespace amd {
namespace rdc {
/**
 * @brief RdcTelemetry implementation for the profiler (rocp) module.
 *
 * Holds an RdcLibraryLoader plus one function pointer per telemetry entry
 * point. Presumably the pointers are resolved from the shared library named by
 * the constructor argument and each override forwards to the matching pointer
 * - confirm in the implementation file.
 */
class RdcRocpLib : public RdcTelemetry {
public:
/* Telemetry */
// Query the field ids supported by this module.
// field_ids holds up to MAX_NUM_FIELDS entries; field_count is an output.
rdc_status_t rdc_telemetry_fields_query(
uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count) override;
// Fetch values for `fields_count` entries of `fields`; results are reported
// through `callback`, which also receives `user_data`.
rdc_status_t rdc_telemetry_fields_value_get(
rdc_gpu_field_t* fields,
uint32_t fields_count,
rdc_field_value_f callback,
void* user_data) override;
// Start watching the given fields.
rdc_status_t rdc_telemetry_fields_watch(
rdc_gpu_field_t* fields,
uint32_t fields_count) override;
// Stop watching the given fields.
rdc_status_t rdc_telemetry_fields_unwatch(
rdc_gpu_field_t* fields,
uint32_t fields_count) override;
// Not part of the RdcTelemetry overrides (declared without `override`).
uint64_t get_profiler_version();
// lib_name: presumably the file name of the module library to load
// (e.g. librdc_rocp.so) - TODO confirm.
explicit RdcRocpLib(const char* lib_name);
~RdcRocpLib();
private:
RdcLibraryLoader lib_loader_;
// Function pointers mirroring the signatures of the four telemetry overrides
// above. They have no default initializer in this declaration.
rdc_status_t (*telemetry_fields_query_)(
uint32_t field_ids[MAX_NUM_FIELDS],
uint32_t* field_count);
rdc_status_t (*telemetry_fields_value_get_)(
rdc_gpu_field_t* fields,
uint32_t fields_count,
rdc_field_value_f callback,
void* user_data);
rdc_status_t (*telemetry_fields_watch_)(
rdc_gpu_field_t* fields,
uint32_t fields_count);
rdc_status_t (*telemetry_fields_unwatch_)(
rdc_gpu_field_t* fields,
uint32_t fields_count);
};
using RdcRocpLibPtr = std::shared_ptr<RdcRocpLib>;
} // namespace rdc
} // namespace amd
#endif // INCLUDE_RDC_LIB_IMPL_RDCROCPLIB_H_
+3 -1
Просмотреть файл
@@ -29,6 +29,7 @@ THE SOFTWARE.
#include "rdc_lib/RdcTelemetry.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcSmiLib.h"
#include "rdc_lib/impl/RdcRocpLib.h"
#include "rdc_lib/RdcMetricFetcher.h"
namespace amd {
@@ -50,7 +51,8 @@ class RdcTelemetryModule : public RdcTelemetry {
uint32_t fields_count);
RdcTelemetryModule(const RdcSmiLibPtr& smi_lib,
const RdcRasLibPtr& ras_module);
const RdcRasLibPtr& ras_module,
const RdcRocpLibPtr& rocp_module);
private:
//< Helper function to dispatch fields to module
+137
Просмотреть файл
@@ -0,0 +1,137 @@
/*
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
#define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
#include <rocmtools.h>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <typeinfo>
#include <unordered_map>
#include "rdc/rdc.h"
namespace amd {
namespace rdc {
/**
 * @brief Map of RDC fields to rocmtools counter names
 *
 * See metrics.xml in rocmtools for more info.
 * Several RDC fields share one counter name: the GFLOPS_* and *_BW_KBPNS
 * fields reuse the counters of the FLOPS_* and *_SIZE fields and are marked
 * below as requiring special handling (presumably the value is derived by RDC
 * over time - confirm against the implementation).
 *
 * NOTE(review): declared `static` in a header, so every translation unit that
 * includes it gets its own copy of the map.
 */
static const std::unordered_map<rdc_field_t, const char*> counter_map_k = {
{RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"},
{RDC_FI_PROF_ACTIVE_WAVES, "SQ_WAVES"},
{RDC_FI_PROF_ACTIVE_CYCLES, "SQ_BUSY_CU_CYCLES"},
{RDC_FI_PROF_CU_OCCUPANCY, "CU_OCCUPANCY"},
{RDC_FI_PROF_CU_UTILIZATION, "CU_UTILIZATION"},
{RDC_FI_PROF_FETCH_SIZE, "FETCH_SIZE"},
{RDC_FI_PROF_WRITE_SIZE, "WRITE_SIZE"},
{RDC_FI_PROF_FLOPS_16, "TOTAL_16_OPS"},
{RDC_FI_PROF_FLOPS_32, "TOTAL_32_OPS"},
{RDC_FI_PROF_FLOPS_64, "TOTAL_64_OPS"},
// fields below require special handling
{RDC_FI_PROF_GFLOPS_16, "TOTAL_16_OPS"},
{RDC_FI_PROF_GFLOPS_32, "TOTAL_32_OPS"},
{RDC_FI_PROF_GFLOPS_64, "TOTAL_64_OPS"},
{RDC_FI_PROF_MEMR_BW_KBPNS, "FETCH_SIZE"},
{RDC_FI_PROF_MEMW_BW_KBPNS, "WRITE_SIZE"},
};
/// Keeps one rocmtools session record per (uint32_t, rdc_field_t) pair and
/// exposes create / lookup / destroy operations on those sessions.
/// The uint32_t half of the key is presumably the GPU index - TODO confirm.
class RdcRocpBase {
// Key of the sessions map.
typedef std::pair<uint32_t, rdc_field_t> pair_gpu_field_t;
// Per-session bookkeeping: the rocmtools session id plus start and stop
// timestamps with nanosecond resolution on the system clock.
typedef struct session_info_t {
rocmtools_session_id_t id{};
std::chrono::
time_point<std::chrono::system_clock, std::chrono::nanoseconds>
start_time;
std::chrono::
time_point<std::chrono::system_clock, std::chrono::nanoseconds>
stop_time;
} session_info_t;
public:
RdcRocpBase();
// NOTE(review): the copy constructor is defaulted while the move constructor
// and both assignment operators are deleted - confirm that copying an object
// that owns sessions is intended.
RdcRocpBase(const RdcRocpBase&) = default;
RdcRocpBase(RdcRocpBase&&) = delete;
RdcRocpBase& operator=(const RdcRocpBase&) = delete;
RdcRocpBase& operator=(RdcRocpBase&&) = delete;
~RdcRocpBase();
/**
* @brief Lookup ROCProfiler counter
*
* @param[in] gpu_field An existing (gpu, field) pair already added to the
* sessions dictionary
* @param[out] value A pointer that will be populated with returned value
*
* @return An rdc_status_t code; presumably ::RDC_ST_OK on success
* - TODO confirm.
*/
rdc_status_t rocp_lookup(pair_gpu_field_t gpu_field, double* value);
/**
* @brief Create ROCmTools session responsible for monitoring a given
* field
*
* @details While rocmtools supports multiple fields per ID - it has a
* limit to how many counters it can query internally.
* To avoid concerning ourselves with said limit, we limit each session to
* 1 field.
* In the future this can be optimized to allow for multiple fields per
* session.
*
* @param[in] gpu_field A (gpu, field) pair to start monitoring
*
* @return An rdc_status_t code; presumably ::RDC_ST_OK on success
* - TODO confirm.
*/
rdc_status_t create_session(pair_gpu_field_t gpu_field);
/**
* @brief Destroy ROCmTools session responsible for monitoring a given
* field
*
* @param[in] gpu_field A (gpu, field) pair to stop monitoring
*
* @return An rdc_status_t code; presumably ::RDC_ST_OK on success
* - TODO confirm.
*/
rdc_status_t destroy_session(pair_gpu_field_t gpu_field);
// NOTE(review): this protected section is empty.
protected:
private:
// Session records, keyed by pair_gpu_field_t.
std::map<pair_gpu_field_t, session_info_t> sessions;
/**
* @brief Convert from rocmtools status into RDC status
*/
rdc_status_t Rocp2RdcError(rocmtools_status_t rocm_status);
};
} // namespace rdc
} // namespace amd
#endif // RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_
+27
Просмотреть файл
@@ -0,0 +1,27 @@
/*
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// NOTE(review): the guard macro below appears to be named after the diagnostic
// module's RdcDiagnosticLib.h. If this header is a different file, two headers
// sharing one guard means whichever is included second expands to nothing -
// confirm the guard matches this file's own path.
#ifndef RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_
#define RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_
#include "rdc/rdc.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
#endif // RDC_MODULES_RDC_DIAGNOSTIC_RDCDIAGNOSTICLIB_H_
+8 -3
Просмотреть файл
@@ -59,6 +59,7 @@ set(RDC_LIB_INC_DIR "${INC_DIR}")
set(BOOTSTRAP_LIB "rdc_bootstrap")
set(RDC_LIB "rdc")
set(RDC_ROCR_LIB "rdc_rocr")
set(RDC_ROCP_LIB "rdc_rocp")
set(RDCCLIENT_LIB "rdc_client")
## Set RUNPATH if ROCM_RPATH is defined and passed by the environment
@@ -90,7 +91,8 @@ set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-core")
set(CPACK_RPM_PACKAGE_REQUIRES "rocm-core")
# link grpc and ROCm to RSMI
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64" "${ROCM_DIR}/lib")
link_directories(${RSMI_LIB_DIR} "${GRPC_ROOT}/lib" "${GRPC_ROOT}/lib64")
# add librdc_bootstrap.so
add_subdirectory(bootstrap)
@@ -98,9 +100,12 @@ add_subdirectory(bootstrap)
# add librdc.so
add_subdirectory(rdc)
# add librdc_rocr.so to RDC_LIBS_MODULES
# add librdc_rocr.so to RDC_LIB_MODULES
add_subdirectory(rdc_modules/rdc_rocr)
# add librdc_rocp.so to RDC_LIB_MODULES
add_subdirectory(rdc_modules/rdc_rocp)
if(BUILD_STANDALONE)
# add librdc_client.so
add_subdirectory(rdc_client)
@@ -125,7 +130,7 @@ else()
endif()
# Add module directives if those exist
if(RDC_LIBS_MODULES)
if(RDC_LIB_MODULES)
install(TARGETS ${RDC_LIB_MODULES}
EXPORT rdcTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${RDC}
+3 -3
Просмотреть файл
@@ -25,11 +25,11 @@ message("BOOTSTRAP_LIB_INC_LIST=${BOOTSTRAP_LIB_INC_LIST}")
add_library(${BOOTSTRAP_LIB} SHARED ${BOOTSTRAP_LIB_SRC_LIST} ${BOOTSTRAP_LIB_INC_LIST})
target_link_libraries(${BOOTSTRAP_LIB} pthread dl)
target_include_directories(${BOOTSTRAP_LIB} PRIVATE
"${RSMI_INC_DIR}"
"${ROCM_DIR}/include"
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${COMMON_DIR}")
"${COMMON_DIR}"
"${RSMI_INC_DIR}"
"${ROCM_DIR}/include")
target_include_directories(${BOOTSTRAP_LIB}
PUBLIC
+2
Просмотреть файл
@@ -19,6 +19,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST}
"${SRC_DIR}/RdcNotificationImpl.cc"
"${SRC_DIR}/RdcPerfTimer.cc"
"${SRC_DIR}/RdcRasLib.cc"
"${SRC_DIR}/RdcRocpLib.cc"
"${SRC_DIR}/RdcRocrLib.cc"
"${SRC_DIR}/RdcSmiDiagnosticImpl.cc"
"${SRC_DIR}/RdcSmiLib.cc"
@@ -50,6 +51,7 @@ set(RDC_LIB_INC_LIST ${RDC_LIB_INC_LIST}
"${INC_DIR}/impl/RdcModuleMgrImpl.h"
"${INC_DIR}/impl/RdcNotificationImpl.h"
"${INC_DIR}/impl/RdcRasLib.h"
"${INC_DIR}/impl/RdcRocpLib.h"
"${INC_DIR}/impl/RdcRocrLib.h"
"${INC_DIR}/impl/RdcSmiDiagnosticImpl.h"
"${INC_DIR}/impl/RdcSmiLib.h"
+11 -6
Просмотреть файл
@@ -20,20 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcModuleMgrImpl.h"
#include "rdc_lib/impl/RdcTelemetryModule.h"
#include "rdc_lib/impl/RdcDiagnosticModule.h"
#include "rdc_lib/impl/RdcRasLib.h"
#include "rdc_lib/impl/RdcRocpLib.h"
#include "rdc_lib/impl/RdcRocrLib.h"
#include "rdc_lib/impl/RdcTelemetryModule.h"
namespace amd {
namespace rdc {
RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher):
smi_lib_(std::make_shared<RdcSmiLib>(fetcher)) {
RdcModuleMgrImpl::RdcModuleMgrImpl(const RdcMetricFetcherPtr& fetcher)
: smi_lib_(std::make_shared<RdcSmiLib>(fetcher)) {
// The smi_lib_ always need to be loaded.
}
RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
if (rdc_telemetry_module_) {
return rdc_telemetry_module_;
@@ -44,8 +45,13 @@ RdcTelemetryPtr RdcModuleMgrImpl::get_telemetry_module() {
ras_lib_.reset(new RdcRasLib("librdc_ras.so"));
}
if (!rocp_lib_) {
rocp_lib_.reset(new RdcRocpLib("librdc_rocp.so"));
}
if (!rdc_telemetry_module_) {
rdc_telemetry_module_.reset(new RdcTelemetryModule(smi_lib_, ras_lib_));
rdc_telemetry_module_.reset(
new RdcTelemetryModule(smi_lib_, ras_lib_, rocp_lib_));
}
return rdc_telemetry_module_;
@@ -75,4 +81,3 @@ RdcDiagnosticPtr RdcModuleMgrImpl::get_diagnostic_module() {
} // namespace rdc
} // namespace amd
+118
Просмотреть файл
@@ -0,0 +1,118 @@
/*
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "rdc_lib/impl/RdcRocpLib.h"
#include <cstdint>
#include <functional>
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
namespace amd {
namespace rdc {
// TODO: Add init and destroy calls support
// Loads the shared library named by lib_name and binds the four telemetry
// entry points. Every function pointer starts out null; a pointer whose
// symbol cannot be resolved is reset to null. If the library itself cannot be
// loaded, an error is logged and no symbol lookup is attempted.
RdcRocpLib::RdcRocpLib(const char* lib_name)
    : telemetry_fields_query_(nullptr),
      telemetry_fields_value_get_(nullptr),
      telemetry_fields_watch_(nullptr),
      telemetry_fields_unwatch_(nullptr) {
  if (lib_loader_.load(lib_name) != RDC_ST_OK) {
    RDC_LOG(RDC_ERROR, "Rocp related function will not work.");
    return;
  }

  // Resolve each entry point independently: one missing symbol does not
  // prevent the remaining ones from being bound.
  if (lib_loader_.load_symbol(&telemetry_fields_query_,
                              "rdc_telemetry_fields_query") != RDC_ST_OK) {
    telemetry_fields_query_ = nullptr;
  }

  if (lib_loader_.load_symbol(&telemetry_fields_value_get_,
                              "rdc_telemetry_fields_value_get") != RDC_ST_OK) {
    telemetry_fields_value_get_ = nullptr;
  }

  if (lib_loader_.load_symbol(&telemetry_fields_watch_,
                              "rdc_telemetry_fields_watch") != RDC_ST_OK) {
    telemetry_fields_watch_ = nullptr;
  }

  if (lib_loader_.load_symbol(&telemetry_fields_unwatch_,
                              "rdc_telemetry_fields_unwatch") != RDC_ST_OK) {
    telemetry_fields_unwatch_ = nullptr;
  }
}
// Compiler-generated destructor; this class adds no explicit teardown of its
// own (see the TODO above about init/destroy call support).
RdcRocpLib::~RdcRocpLib() = default;
// Get the supported field ids by forwarding to the loaded ROCP module.
//
// Returns RDC_ST_BAD_PARAMETER when field_count is null and
// RDC_ST_FAIL_LOAD_MODULE when the module (or this symbol) could not be
// loaded, instead of calling through a null function pointer.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_query(
    uint32_t field_ids[MAX_NUM_FIELDS],
    uint32_t* field_count) {
  if (field_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // The constructor leaves this pointer null when librdc_rocp.so is missing.
  if (!telemetry_fields_query_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_query_(field_ids, field_count);
}
// Fetch the values of the given fields by forwarding to the loaded ROCP
// module; results are delivered through callback.
//
// Returns RDC_ST_BAD_PARAMETER when fields is null and
// RDC_ST_FAIL_LOAD_MODULE when the module (or this symbol) could not be
// loaded, instead of calling through a null function pointer.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_value_get(
    rdc_gpu_field_t* fields,
    uint32_t fields_count,
    rdc_field_value_f callback,
    void* user_data) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // The constructor leaves this pointer null when librdc_rocp.so is missing.
  if (!telemetry_fields_value_get_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  RDC_LOG(RDC_DEBUG, "Fetch " << fields_count << " fields from rocp_lib.");
  return telemetry_fields_value_get_(
      fields, fields_count, callback, user_data);
}
// Start watching the given fields by forwarding to the loaded ROCP module.
//
// Returns RDC_ST_BAD_PARAMETER when fields is null and
// RDC_ST_FAIL_LOAD_MODULE when the module (or this symbol) could not be
// loaded, instead of calling through a null function pointer.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_watch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // The constructor leaves this pointer null when librdc_rocp.so is missing.
  if (!telemetry_fields_watch_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_watch_(fields, fields_count);
}
// Stop watching the given fields by forwarding to the loaded ROCP module.
//
// Returns RDC_ST_BAD_PARAMETER when fields is null and
// RDC_ST_FAIL_LOAD_MODULE when the module (or this symbol) could not be
// loaded, instead of calling through a null function pointer.
rdc_status_t RdcRocpLib::rdc_telemetry_fields_unwatch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  if (fields == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  // The constructor leaves this pointer null when librdc_rocp.so is missing.
  if (!telemetry_fields_unwatch_) {
    return RDC_ST_FAIL_LOAD_MODULE;
  }

  return telemetry_fields_unwatch_(fields, fields_count);
}
} // namespace rdc
} // namespace amd
+5 -2
Просмотреть файл
@@ -94,11 +94,15 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_unwatch(
RdcTelemetryModule::RdcTelemetryModule(
const RdcSmiLibPtr& smi_lib,
const RdcRasLibPtr& ras_module) {
const RdcRasLibPtr& ras_module,
const RdcRocpLibPtr& rocp_module) {
telemetry_modules_.push_back(smi_lib);
if (ras_module) {
telemetry_modules_.push_back(ras_module);
}
if (rocp_module) {
telemetry_modules_.push_back(rocp_module);
}
auto ite = telemetry_modules_.begin();
for (; ite != telemetry_modules_.end(); ite++) {
@@ -166,4 +170,3 @@ rdc_status_t RdcTelemetryModule::rdc_telemetry_fields_value_get(
} // namespace rdc
} // namespace amd
+54
Просмотреть файл
@@ -0,0 +1,54 @@
# Build script for librdc_rocp.so, the RDC telemetry module backed by
# ROCmTools. The target is only created when BUILD_ROCPTEST is ON; in that case
# it is also appended to RDC_LIB_MODULES in the parent scope.
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Cmake RDC Lib-ROCP ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")

set(SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(INC_DIR "${PROJECT_SOURCE_DIR}/include/rdc_modules/rdc_rocp")
set(RDC_ROCP_LIB_COMPONENT "lib${RDC_ROCP_LIB}")

set(RDC_ROCP_LIB_SRC_LIST
    "${BOOTSTRAP_LIB_SRC_DIR}/RdcLogger.cc"
    "${SRC_DIR}/RdcTelemetryLib.cc"
    "${SRC_DIR}/RdcRocpBase.cc")

# Headers are listed so they show up with the target. This module implements
# the telemetry interface (RdcTelemetryLib.cc includes
# rdc_lib/RdcTelemetryLibInterface.h), so that is the interface header listed.
set(RDC_ROCP_LIB_INC_LIST
    "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h"
    "${RDC_LIB_INC_DIR}/RdcTelemetryLibInterface.h"
    "${RDC_LIB_INC_DIR}/rdc_common.h"
    "${RDC_LIB_INC_DIR}/RdcLogger.h"
    "${INC_DIR}/RdcTelemetryLib.h"
    "${INC_DIR}/RdcRocpBase.h")

if(BUILD_ROCPTEST)
    message("Build librdc_rocp.so is enabled, make sure ROCmTools is installed.")
    message("RDC_ROCP_LIB_INC_LIST=${RDC_ROCP_LIB_INC_LIST}")

    set(ROCMTOOLS_LIB rocmtools::rocmtools)
    # below provides rocmtools::rocmtools package
    include(Findrocmtools)

    set(HSA_LIB "hsa-runtime64")

    # Register this module with the parent so it is installed with the others.
    set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_ROCP_LIB} PARENT_SCOPE)

    add_library(${RDC_ROCP_LIB} SHARED ${RDC_ROCP_LIB_SRC_LIST} ${RDC_ROCP_LIB_INC_LIST})
    target_link_libraries(${RDC_ROCP_LIB} ${RDC_LIB} ${BOOTSTRAP_LIB} ${HSA_LIB} ${ROCMTOOLS_LIB} pthread dl)
    target_include_directories(${RDC_ROCP_LIB} PRIVATE
        "${PROJECT_SOURCE_DIR}"
        "${PROJECT_SOURCE_DIR}/include"
        "${COMMON_DIR}"
        "${RSMI_INC_DIR}"
        "${ROCM_DIR}/include"
        "${ROCM_DIR}/include/hsa")

    # Set the VERSION and SOVERSION values
    set_property(TARGET ${RDC_ROCP_LIB} PROPERTY
        SOVERSION "${VERSION_MAJOR}")
    set_property(TARGET ${RDC_ROCP_LIB} PROPERTY
        VERSION "${SO_VERSION_STRING}")

    # If the library is a release, strip the target library.
    # $<TARGET_FILE:...> resolves to the actual built artifact, so the command
    # does not depend on the working directory or on a hand-built file name.
    if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
        add_custom_command(
            TARGET ${RDC_ROCP_LIB}
            POST_BUILD
            COMMAND ${CMAKE_STRIP} $<TARGET_FILE:${RDC_ROCP_LIB}>
            VERBATIM)
    endif()
endif()
+166
Просмотреть файл
@@ -0,0 +1,166 @@
/*
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <rocmtools.h>
#include <cassert>
#include <chrono>
#include <cstring>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
namespace amd {
namespace rdc {
// Initializes ROCmTools and logs the status code it returns. The status is
// only logged; it is not otherwise acted upon here.
RdcRocpBase::RdcRocpBase() {
  const auto init_result = rocmtools_initialize();
  RDC_LOG(RDC_INFO, "rocmtools_initialize status: " << init_result);
}
// Destroys every tracked profiling session and then finalizes ROCmTools.
//
// destroy_session() erases entries from `sessions`, so the keys are copied
// first; iterating the container directly while it is being erased from would
// invalidate the loop iterator.
RdcRocpBase::~RdcRocpBase() {
  std::vector<pair_gpu_field_t> pending;
  pending.reserve(sessions.size());
  for (const auto& session : sessions) {
    pending.push_back(session.first);
  }

  for (const auto& gpu_field : pending) {
    const rdc_status_t destroy_status = destroy_session(gpu_field);
    if (destroy_status != RDC_ST_OK) {
      // Keep tearing down the remaining sessions; just report the failure.
      RDC_LOG(
          RDC_ERROR, "Failed to destroy session for field ("
                         << gpu_field.second << ") on GPU ["
                         << gpu_field.first << "], status: "
                         << destroy_status);
    }
  }
  // Drop whatever could not be destroyed cleanly.
  sessions.clear();

  const auto status = rocmtools_finalize();
  RDC_LOG(RDC_INFO, "rocmtools_finalize status: " << status);
}
// Polls the profiling session that belongs to gpu_field (a GPU index / field
// id pair) and stores the resulting metric in *value.
//
// Returns:
//   RDC_ST_NOT_FOUND      - no session exists for gpu_field
//   RDC_ST_BAD_PARAMETER  - value is null
//   translated ROCmTools error - polling the session failed
//   RDC_ST_OK             - *value was written
rdc_status_t RdcRocpBase::rocp_lookup(
    pair_gpu_field_t gpu_field,
    double* value) {
  // find() instead of at(): an unwatched field must produce a status code,
  // not an std::out_of_range exception.
  const auto session_it = sessions.find(gpu_field);
  if (session_it == sessions.end()) {
    return RDC_ST_NOT_FOUND;
  }

  if (value == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  rocmtools_device_profile_metric_t counter;
  const rocmtools_status_t status =
      rocmtools_device_profiling_session_poll(session_it->second.id, &counter);
  const auto stop_time = std::chrono::high_resolution_clock::now();

  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    return Rocp2RdcError(status);
  }

  // Nanoseconds elapsed since the session's recorded start time.
  const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
                           stop_time - session_it->second.start_time)
                           .count();

  // some metrics are derived from others and depend on time passed
  switch (gpu_field.second) {
    case RDC_FI_PROF_GFLOPS_16:
    case RDC_FI_PROF_GFLOPS_32:
    case RDC_FI_PROF_GFLOPS_64:
    case RDC_FI_PROF_MEMR_BW_KBPNS:
    case RDC_FI_PROF_MEMW_BW_KBPNS:
      // Guard against a zero (or negative) interval so the rate never
      // becomes inf/NaN.
      *value = (elapsed > 0)
                   ? counter.value.value / static_cast<double>(elapsed)
                   : 0.0;
      break;
    default:
      *value = counter.value.value;
      break;
  }

  return RDC_ST_OK;
}
// Creates and starts a ROCmTools device profiling session for gpu_field (a GPU
// index / field id pair) and records it in `sessions`.
//
// Returns:
//   RDC_ST_ALREADY_EXIST  - a session for gpu_field is already tracked
//   RDC_ST_NOT_SUPPORTED  - the field has no entry in counter_map_k
//   translated ROCmTools error - creating or starting the session failed
//   RDC_ST_OK             - the session is running and tracked
rdc_status_t RdcRocpBase::create_session(pair_gpu_field_t gpu_field) {
  if (sessions.count(gpu_field) != 0) {
    RDC_LOG(
        RDC_DEBUG, "Session for field (" << gpu_field.second << ") on GPU ["
                                         << gpu_field.first
                                         << "] already exists!");
    return RDC_ST_ALREADY_EXIST;
  }

  // find() instead of at(): an unmapped field must produce a status code,
  // not an std::out_of_range exception.
  const auto counter_it = counter_map_k.find(gpu_field.second);
  if (counter_it == counter_map_k.end()) {
    RDC_LOG(
        RDC_ERROR, "Field (" << gpu_field.second
                             << ") has no ROCmTools counter mapping.");
    return RDC_ST_NOT_SUPPORTED;
  }

  session_info_t session = {};
  std::vector<const char*> rocmtools_fields = {counter_it->second};

  // create session
  rocmtools_status_t status = rocmtools_device_profiling_session_create(
      rocmtools_fields.data(), rocmtools_fields.size(), &session.id, 0,
      gpu_field.first);
  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    return Rocp2RdcError(status);
  }

  // start session
  status = rocmtools_device_profiling_session_start(session.id);
  if (status != ROCMTOOLS_STATUS_SUCCESS) {
    // Do not track (or leak) a session that never started.
    rocmtools_device_profiling_session_destroy(session.id);
    return Rocp2RdcError(status);
  }

  // Only a running session is recorded, together with its start time.
  session.start_time = std::chrono::high_resolution_clock::now();
  sessions.emplace(gpu_field, session);

  return RDC_ST_OK;
}
// Destroys the profiling session tracked for gpu_field, if there is one.
//
// A missing session is not an error: the call logs a debug message and
// returns RDC_ST_OK. The entry is removed from `sessions` only when ROCmTools
// reports that the session was destroyed; otherwise the translated ROCmTools
// status is returned and the entry is kept.
rdc_status_t RdcRocpBase::destroy_session(pair_gpu_field_t gpu_field) {
  if (sessions.empty()) {
    RDC_LOG(RDC_DEBUG, "Cannot destroy empty session...");
    return RDC_ST_OK;
  }

  // no session with field
  const auto entry = sessions.find(gpu_field);
  if (entry == sessions.end()) {
    RDC_LOG(
        RDC_DEBUG, "Cannot destroy session with field ("
                       << gpu_field.second << ") on GPU ["
                       << gpu_field.first
                       << "] because it doesn't exist...");
    return RDC_ST_OK;
  }

  const rocmtools_session_id_t doomed_id = entry->second.id;
  const rocmtools_status_t rocm_status =
      rocmtools_device_profiling_session_destroy(doomed_id);

  if (rocm_status != ROCMTOOLS_STATUS_SUCCESS) {
    return Rocp2RdcError(rocm_status);
  }

  const auto erased_count = sessions.erase(gpu_field);
  RDC_LOG(RDC_DEBUG, "destroyed (" << erased_count << ") sessions");
  return Rocp2RdcError(rocm_status);
}
// Translates a ROCmTools status code into the closest RDC status code.
// Any code without an explicit mapping becomes RDC_ST_UNKNOWN_ERROR.
rdc_status_t RdcRocpBase::Rocp2RdcError(rocmtools_status_t rocm_status) {
  if (rocm_status == ROCMTOOLS_STATUS_SUCCESS) {
    return RDC_ST_OK;
  }

  if (rocm_status == ROCMTOOLS_STATUS_ERROR_HAS_ACTIVE_SESSION) {
    return RDC_ST_ALREADY_EXIST;
  }

  // Session/filter lookup problems are reported as bad parameters.
  const bool is_session_error =
      rocm_status == ROCMTOOLS_STATUS_ERROR_SESSION_FILTER_DATA_MISMATCH ||
      rocm_status == ROCMTOOLS_STATUS_ERROR_SESSION_MISSING_FILTER ||
      rocm_status == ROCMTOOLS_STATUS_ERROR_SESSION_NOT_FOUND;
  if (is_session_error) {
    return RDC_ST_BAD_PARAMETER;
  }

  return RDC_ST_UNKNOWN_ERROR;
}
} // namespace rdc
} // namespace amd
+143
Просмотреть файл
@@ -0,0 +1,143 @@
/*
Copyright (c) 2022 - present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <sys/time.h>
#include <cstring>
#include <map>
#include <memory>
#include <stdexcept>
#include <vector>
#include "rdc/rdc.h"
#include "rdc_lib/RdcLogger.h"
#include "rdc_lib/RdcTelemetryLibInterface.h"
#include "rdc_lib/rdc_common.h"
#include "rdc_modules/rdc_rocp/RdcRocpBase.h"
// File-scope RdcRocpBase instance shared by the rdc_telemetry_* functions in
// this file (used for session creation, destruction and value lookup).
amd::rdc::RdcRocpBase rocp;
// get supported field ids
// TODO: Query fields with rocprofiler
//
// Writes every key of amd::rdc::counter_map_k into field_ids and stores the
// number of ids written in *field_count. At most MAX_NUM_FIELDS ids are
// written so the caller's array is never overrun.
//
// Returns RDC_ST_BAD_PARAMETER when either pointer is null, RDC_ST_OK
// otherwise.
rdc_status_t rdc_telemetry_fields_query(
    uint32_t field_ids[MAX_NUM_FIELDS],
    uint32_t* field_count) {
  if (field_ids == nullptr || field_count == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  uint32_t written = 0;
  // Iterate by reference; only the key of each entry is needed.
  for (const auto& entry : amd::rdc::counter_map_k) {
    if (written >= MAX_NUM_FIELDS) {
      break;
    }
    field_ids[written] = entry.first;
    written++;
  }
  *field_count = written;

  return RDC_ST_OK;
}
// Fetch
//
// Looks up each requested field through the file-scope `rocp` instance and
// hands the results to `callback` in batches of up to BULK_FIELDS_MAX values.
// All values of one call carry the same millisecond timestamp, taken once
// from gettimeofday() at entry.
//
// A failed lookup does not abort the call: its status is stored in that
// entry's field_value.status and the value is reported as 0.
//
// Returns RDC_ST_BAD_PARAMETER when fields or callback is null, the first
// non-OK status returned by callback, or RDC_ST_OK.
rdc_status_t rdc_telemetry_fields_value_get(
    rdc_gpu_field_t* fields,
    uint32_t fields_count,
    rdc_field_value_f callback,
    void* user_data) {
  if (fields == nullptr || callback == nullptr) {
    return RDC_ST_BAD_PARAMETER;
  }

  struct timeval tv {};
  gettimeofday(&tv, nullptr);
  const uint64_t curTime =
      static_cast<uint64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;

  const uint32_t BULK_FIELDS_MAX = 16;
  rdc_gpu_field_value_t values[BULK_FIELDS_MAX];
  uint32_t bulk_count = 0;

  for (uint32_t i = 0; i < fields_count; i++) {
    // Flush a full batch before adding the next value.
    if (bulk_count >= BULK_FIELDS_MAX) {
      const rdc_status_t flush_status =
          callback(values, bulk_count, user_data);
      // When the callback returns errors, stop processing and return.
      if (flush_status != RDC_ST_OK) {
        return flush_status;
      }
      bulk_count = 0;
    }

    RDC_LOG(RDC_DEBUG, "ID: " << fields[i].field_id);

    // Reset per field so a failed lookup never reports the previous value.
    double value = 0;
    const rdc_status_t lookup_status = rocp.rocp_lookup(
        std::make_pair(fields[i].gpu_index, fields[i].field_id), &value);

    // get value
    values[bulk_count].gpu_index = fields[i].gpu_index;
    values[bulk_count].field_value.type = DOUBLE;
    values[bulk_count].field_value.status = lookup_status;
    values[bulk_count].field_value.ts = curTime;
    values[bulk_count].field_value.value.dbl = value;
    values[bulk_count].field_value.field_id = fields[i].field_id;
    RDC_LOG(RDC_DEBUG, "VALUE: " << value);
    bulk_count++;
  }

  // Deliver the final, partially filled batch.
  if (bulk_count != 0) {
    const rdc_status_t flush_status = callback(values, bulk_count, user_data);
    if (flush_status != RDC_ST_OK) {
      return flush_status;
    }
  }

  return RDC_ST_OK;
}
// Creates one profiling session per requested field through the file-scope
// `rocp` instance.
//
// Every field is attempted even if an earlier one fails. Returns
// RDC_ST_BAD_PARAMETER when fields is null while fields_count is non-zero;
// otherwise the last non-OK status from create_session(), or RDC_ST_OK when
// all succeeded.
rdc_status_t rdc_telemetry_fields_watch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  if (fields == nullptr && fields_count != 0) {
    return RDC_ST_BAD_PARAMETER;
  }

  rdc_status_t status = RDC_ST_OK;
  for (uint32_t i = 0; i < fields_count; i++) {
    RDC_LOG(RDC_DEBUG, "WATCH: " << fields[i].field_id);
    const rdc_status_t temp_status = rocp.create_session(
        std::make_pair(fields[i].gpu_index, fields[i].field_id));
    // return last non-ok status
    if (temp_status != RDC_ST_OK) {
      status = temp_status;
    }
  }
  return status;
}
// Destroys the profiling session of each requested field through the
// file-scope `rocp` instance.
//
// Every field is attempted even if an earlier one fails. Returns
// RDC_ST_BAD_PARAMETER when fields is null while fields_count is non-zero;
// otherwise the last non-OK status from destroy_session(), or RDC_ST_OK when
// all succeeded.
rdc_status_t rdc_telemetry_fields_unwatch(
    rdc_gpu_field_t* fields,
    uint32_t fields_count) {
  if (fields == nullptr && fields_count != 0) {
    return RDC_ST_BAD_PARAMETER;
  }

  rdc_status_t status = RDC_ST_OK;
  for (uint32_t i = 0; i < fields_count; i++) {
    RDC_LOG(RDC_DEBUG, "UNWATCH: " << fields[i].field_id);
    const rdc_status_t temp_status = rocp.destroy_session(
        std::make_pair(fields[i].gpu_index, fields[i].field_id));
    // return last non-ok status
    if (temp_status != RDC_ST_OK) {
      status = temp_status;
    }
  }
  return status;
}
+4 -4
Просмотреть файл
@@ -36,15 +36,15 @@ if(BUILD_ROCRTEST)
message("RDC_ROCR_LIB_INC_LIST=${RDC_ROCR_LIB_INC_LIST}")
set(HSA_LIB "hsa-runtime64")
set(RDC_LIBS_MODULES ${RDC_LIBS_MODULES} ${RDC_ROCR_LIB} PARENT_SCOPE)
set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_ROCR_LIB} PARENT_SCOPE)
add_library(${RDC_ROCR_LIB} SHARED ${RDC_ROCR_LIB_SRC_LIST} ${RDC_ROCR_LIB_INC_LIST})
target_link_libraries(${RDC_ROCR_LIB} ${RDC_LIB} ${BOOTSTRAP_LIB} ${HSA_LIB} pthread dl)
target_include_directories(${RDC_ROCR_LIB} PRIVATE
"${RSMI_INC_DIR}"
"${ROCM_DIR}/include"
"${PROJECT_SOURCE_DIR}"
"${PROJECT_SOURCE_DIR}/include"
"${COMMON_DIR}")
"${COMMON_DIR}"
"${RSMI_INC_DIR}"
"${ROCM_DIR}/include")
# Set the VERSION and SOVERSION values
set_property(TARGET ${RDC_ROCR_LIB} PROPERTY
+22 -4
Просмотреть файл
@@ -1,9 +1,18 @@
#!/bin/bash
set -e
# This will return 0 if an id is created and non-zero if
# it already exists
# https://www.debian.org/doc/debian-policy/ch-opersys.html#users-and-groups
do_create_rdc_user() {
useradd -r -s /bin/nologin rdc
adduser \
--system \
--quiet \
--home /nonexistent \
--no-create-home \
--disabled-password \
rdc
if [ $(getent group render) ]; then
usermod -a -G render rdc
else
@@ -13,16 +22,25 @@ do_create_rdc_user() {
return 0
}
# Install the RDC systemd unit by symlinking the packaged rdc.service into
# /lib/systemd/system. Nothing is done when /run/systemd/system is not a
# directory (the check this script uses to decide whether systemd is present).
create_rdc_service() {
# Symlink the RDC service; -f replaces an existing link, -r makes it relative.
if [ -d /run/systemd/system ]; then
ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service /lib/systemd/system/rdc.service
fi
}
reload_systemd() {
systemctl daemon-reload
if [ -d /run/systemd/system ]; then
systemctl daemon-reload
fi
return 0
}
case "$1" in
configure)
do_create_rdc_user
#Symlink RDC Service
ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service /lib/systemd/system/rdc.service
create_rdc_service
reload_systemd
exit 0
;;
+15 -4
Просмотреть файл
@@ -3,13 +3,24 @@
set -e
stop_rdc() {
#stop RDC if running
systemctl stop rdc
if [ -d /run/systemd/system ]; then
#stop RDC if running
systemctl stop rdc
fi
return 0
}
# Remove the rdc.service unit symlink from /lib/systemd/system.
#
# The decision is based on the link itself rather than on whether systemd is
# currently running: the link is removed even when /run/systemd/system is
# absent at removal time, and a missing link no longer makes unlink fail
# (which would abort this script, since it runs under `set -e`).
rm_rdc_service() {
    # -L also catches a dangling symlink, which -e alone would report as absent.
    if [ -L /lib/systemd/system/rdc.service ] || [ -e /lib/systemd/system/rdc.service ]; then
        unlink /lib/systemd/system/rdc.service
    fi
    return 0
}
reload_systemd() {
systemctl daemon-reload
if [ -d /run/systemd/system ]; then
systemctl daemon-reload
fi
return 0
}
@@ -21,7 +32,7 @@ rm_pyc() {
case "$1" in
remove | upgrade )
stop_rdc
unlink /lib/systemd/system/rdc.service
rm_rdc_service
reload_systemd
rm_pyc
;;
+16 -3
Просмотреть файл
@@ -1,17 +1,30 @@
#!/bin/bash
stop_rdc() {
#stop RDC if running
systemctl stop rdc
if [ -d /run/systemd/system ]; then
systemctl stop rdc
fi
return 0
}
# Remove the rdc.service unit symlink from @DISTRO_ROOT@.
#
# The decision is based on the link itself rather than on whether systemd is
# currently running: the link is removed even when /run/systemd/system is
# absent at removal time, and unlink is never run against a missing link.
rm_rdc_service() {
    # -L also catches a dangling symlink, which -e alone would report as absent.
    if [ -L @DISTRO_ROOT@/rdc.service ] || [ -e @DISTRO_ROOT@/rdc.service ]; then
        unlink @DISTRO_ROOT@/rdc.service
    fi
    return 0
}
reload_systemd() {
systemctl daemon-reload
if [ -d /run/systemd/system ]; then
systemctl daemon-reload
fi
return 0
}
if [ $1 -le 1 ]; then
# perform the below actions for rpm remove($1=0) or upgrade($1=1) operations
stop_rdc
unlink @DISTRO_ROOT@/rdc.service
rm_rdc_service
reload_systemd
fi
+13 -3
Просмотреть файл
@@ -1,7 +1,8 @@
#!/bin/bash
# https://fedoraproject.org/wiki/Packaging%3aUsersAndGroups
do_create_rdc_user() {
useradd -r -s /bin/nologin rdc
useradd -r -s /sbin/nologin rdc
if [ $(getent group render) ]; then
usermod -a -G render rdc
else
@@ -12,14 +13,23 @@ do_create_rdc_user() {
return 0
}
# Install the RDC systemd unit by symlinking the packaged rdc.service into
# @DISTRO_ROOT@. Nothing is done when /run/systemd/system is not a directory
# (the check this script uses to decide whether systemd is present).
create_rdc_service() {
if [ -d /run/systemd/system ]; then
# -f replaces an existing link, -r makes the link relative.
ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service @DISTRO_ROOT@/rdc.service
fi
}
reload_systemd() {
systemctl daemon-reload
if [ -d /run/systemd/system ]; then
systemctl daemon-reload
fi
return 0
}
do_create_rdc_user
#Symlink RDC Service
ln -s -f -r /@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rdc/rdc.service @DISTRO_ROOT@/rdc.service
create_rdc_service
#Request systemctl to reload file since RDC is adding new file/service
reload_systemd