From d734ec5aa61b23b57f55ac6992ae07ce0b968877 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Tue, 31 Oct 2023 18:00:37 -0500 Subject: [PATCH 1/3] Add linting via pre-commit and docker Please see .pre-commit-config.yaml for details - Add clang-format - Add cpplint - Add config for clang-tidy but don't enforce with pre-commit Change-Id: Ica447c78e6fde94b43bfdc00f5b4efc338363e24 Signed-off-by: Galantsev, Dmitrii [ROCm/rocm_smi_lib commit: 142fbac7ac886921e0d7087538f7293c7b434d89] --- projects/rocm-smi-lib/.clang-format | 4 ++ projects/rocm-smi-lib/.clang-tidy | 25 +++++++++++++ projects/rocm-smi-lib/.clangd | 37 +++++++++++++++++++ projects/rocm-smi-lib/.editorconfig | 1 + projects/rocm-smi-lib/.gitignore | 5 +++ projects/rocm-smi-lib/.pre-commit-config.yaml | 30 +++++++++++++++ projects/rocm-smi-lib/.update-clang-tidy.sh | 36 ++++++++++++++++++ projects/rocm-smi-lib/CPPLINT.cfg | 3 ++ 8 files changed, 141 insertions(+) create mode 100644 projects/rocm-smi-lib/.clang-format create mode 100644 projects/rocm-smi-lib/.clang-tidy create mode 100644 projects/rocm-smi-lib/.clangd create mode 100644 projects/rocm-smi-lib/.pre-commit-config.yaml create mode 100755 projects/rocm-smi-lib/.update-clang-tidy.sh create mode 100644 projects/rocm-smi-lib/CPPLINT.cfg diff --git a/projects/rocm-smi-lib/.clang-format b/projects/rocm-smi-lib/.clang-format new file mode 100644 index 0000000000..0bca7eb76f --- /dev/null +++ b/projects/rocm-smi-lib/.clang-format @@ -0,0 +1,4 @@ +--- +Language: Cpp +BasedOnStyle: Google +ColumnLimit: 100 diff --git a/projects/rocm-smi-lib/.clang-tidy b/projects/rocm-smi-lib/.clang-tidy new file mode 100644 index 0000000000..11402fd8da --- /dev/null +++ b/projects/rocm-smi-lib/.clang-tidy @@ -0,0 +1,25 @@ +# THIS FILE IS GENERATED FROM .clangd! +# Run .update-clang-tidy.sh to regenerate. 
+Checks: + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + -abseil*, + -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, + -clang-analyzer-security.insecureAPI.strcpy, + -cppcoreguidelines*, + -cppcoreguidelines-pro*, + -misc-non-copyable-objects, + -misc-use-anonymous-namespace, + -modernize-avoid-c-arrays, + -modernize-redundant-void-arg, + -modernize-use-auto, + -modernize-use-nodiscard, + -modernize-use-noexcept, + -modernize-use-trailing-return-type, + -modernize-use-using, + -performance*, + -readability*, diff --git a/projects/rocm-smi-lib/.clangd b/projects/rocm-smi-lib/.clangd new file mode 100644 index 0000000000..74ae437a6f --- /dev/null +++ b/projects/rocm-smi-lib/.clangd @@ -0,0 +1,37 @@ +CompileFlags: + Remove: -W* + Add: [-Wall, -pedantic, -I/opt/rocm/include, -I/opt/rocm/include/hsa, -I/opt/rocm/include/rocprofiler] + Compiler: clang++ + +# list here: https://clang.llvm.org/extra/clang-tidy/checks/list.html +Diagnostics: + UnusedIncludes: Strict + # rules below are copied into .clang-tidy using ./.update-clang-tidy.sh + # please keep the rules sorted alphabetically + ClangTidy: + Add: [ + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + ] + Remove: [ + abseil*, + bugprone-easily-swappable-parameters, + bugprone-reserved-identifier, + clang-analyzer-security.insecureAPI.strcpy, + cppcoreguidelines*, + cppcoreguidelines-pro*, + misc-non-copyable-objects, + misc-use-anonymous-namespace, + modernize-avoid-c-arrays, + modernize-redundant-void-arg, + modernize-use-auto, + modernize-use-nodiscard, + modernize-use-noexcept, + modernize-use-trailing-return-type, + modernize-use-using, + performance*, + readability*, + ] diff --git a/projects/rocm-smi-lib/.editorconfig b/projects/rocm-smi-lib/.editorconfig index 86ca34a181..a97ea56020 100644 --- a/projects/rocm-smi-lib/.editorconfig +++ b/projects/rocm-smi-lib/.editorconfig @@ -10,3 +10,4 @@ root = true charset = utf-8 indent_style = space indent_size = 
2 +max_line_length = 100 diff --git a/projects/rocm-smi-lib/.gitignore b/projects/rocm-smi-lib/.gitignore index 71facfe540..3e67867c25 100644 --- a/projects/rocm-smi-lib/.gitignore +++ b/projects/rocm-smi-lib/.gitignore @@ -23,3 +23,8 @@ device/ __pycache__ README README.html + +# do NOT ignore these files +!.clang-format +!.clang-tidy +!.clangd diff --git a/projects/rocm-smi-lib/.pre-commit-config.yaml b/projects/rocm-smi-lib/.pre-commit-config.yaml new file mode 100644 index 0000000000..d84939508a --- /dev/null +++ b/projects/rocm-smi-lib/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +# - How to use: +# python3 -m pip install pre-commit +# pre-commit install --install-hooks +# Upon a new commit - the hooks should automagically run +# +# - How to skip: +# git commit --no-verify +# or +# SKIP=clang-format-docker git commit +# SKIP=cpplint-docker git commit + +fail_fast: false +repos: + # For portability I decided to use Docker containers + - repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint + rev: 0.0.3 + hooks: + - id: clang-format-docker + - id: cpplint-docker + # Below is a local way of running formatters and linters + # NOTE: clang-tidy is not used in the above tests + # - repo: https://github.com/pocc/pre-commit-hooks + # rev: v1.3.5 + # hooks: + # - id: clang-format + # args: [--no-diff, -i] + # - id: clang-tidy + # args: [-p=build, --quiet] + # - id: cpplint + # args: [--verbose=5] diff --git a/projects/rocm-smi-lib/.update-clang-tidy.sh b/projects/rocm-smi-lib/.update-clang-tidy.sh new file mode 100755 index 0000000000..9607b35714 --- /dev/null +++ b/projects/rocm-smi-lib/.update-clang-tidy.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +set -x # trace +set -e # exit immediately if command fails +set -u # exit if an undefined variable is found + +awk ' +BEGIN { + print "# THIS FILE IS GENERATED FROM .clangd!" + print "# Run ./.update-clang-tidy.sh to regenerate." 
+ print "Checks:" +} +/Add: \[$/{ +a=1 + next +} +/]/{ + a=0 +} +a{ +gsub(/^\s+/," ") + print +} + +/Remove: \[$/{ +r=1 + next +} +/]/{ + r=0 +} +r{ + gsub(/^\s+/," -") + print +} +' .clangd | tee .clang-tidy diff --git a/projects/rocm-smi-lib/CPPLINT.cfg b/projects/rocm-smi-lib/CPPLINT.cfg new file mode 100644 index 0000000000..b63692c6df --- /dev/null +++ b/projects/rocm-smi-lib/CPPLINT.cfg @@ -0,0 +1,3 @@ +set noparent +linelength=100 +filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard From 7fc67c88ce9a3548d36bfd75b06dce6f9df23265 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 29 Nov 2023 02:36:50 -0600 Subject: [PATCH 2/3] Fix ASAN for tests and log metrics better Change-Id: Ib495cfc28c48a4d291a89673a3b6fc13313845c7 Signed-off-by: Galantsev, Dmitrii [ROCm/rocm_smi_lib commit: a128867497d569837d76ea6ca09775bef3a172ef] --- projects/rocm-smi-lib/CMakeLists.txt | 2 ++ projects/rocm-smi-lib/src/rocm_smi.cc | 22 ++++++++++--------- .../rocm-smi-lib/src/rocm_smi_gpu_metrics.cc | 4 ++-- .../tests/rocm_smi_test/CMakeLists.txt | 8 ------- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index dafb40f0b9..3092e237dc 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -102,8 +102,10 @@ if (${ADDRESS_SANITIZER}) if (BUILD_SHARED_LIBS}) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -shared-libsan" ) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -shared-libsan" ) else () set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libsan" ) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libsan" ) endif () else () ## Security breach mitigation flags diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index ba66636ae3..d78ac0d077 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ 
b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -6640,19 +6640,21 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0); amd::smi::GPUMetricCurrDClkTbl_t tmp_curr_dclk0_tbl; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl); + const auto max_num_elems = + static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); + ostrstream << __PRETTY_FUNCTION__ + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_dclk0_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); std::copy_n(std::begin(tmp_curr_dclk0_tbl), max_num_elems, *current_dclk_value); } - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_dclk0_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); return status_code; CATCH diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index 11d2871ca9..48d86837ee 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -2607,7 +2607,7 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met << " | Returning = " << getRSMIStatusString(status_code) << " |"; - LOG_ERROR(ostrstream); + LOG_TRACE(ostrstream); return 
status_code; } } @@ -2717,7 +2717,7 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit << " | Returning = " << getRSMIStatusString(status_code) << " |"; - LOG_ERROR(ostrstream); + LOG_TRACE(ostrstream); return status_code; } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt index bd7c827ce4..2253327813 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt @@ -21,14 +21,6 @@ message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) message("") -## Compiler flags -set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17") -if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64") - set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -m64 -msse -msse2") -endif() - set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(RSMITST "rsmitst") From e2a833f3478d4e6c488feaecf5a0a12d24cdf8d9 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Thu, 30 Nov 2023 15:03:51 -0600 Subject: [PATCH 3/3] rocm_smi_lib: Fix GPU Metrics Max Elements Read Exceeded Code changes related to the following: * Check smallest copy size for multi-valued metrics * Unit tests: gpu_metric_read * ROCMSMI examples Build changes related to the following: * CMakeLists.txt Change-Id: Ieb2363020fa21c93fbacd0edcc1d394eed183051 Signed-off-by: Oliveira, Daniel [ROCm/rocm_smi_lib commit: 8e0d3d5a398f3eee6195950c3382ce9043829f60] --- projects/rocm-smi-lib/CMakeLists.txt | 37 +- .../rocm_smi/example/rocm_smi_example.cc | 4 +- projects/rocm-smi-lib/src/rocm_smi.cc | 195 +++++---- .../functional/gpu_metrics_read.cc | 413 ++++++++++++------ .../tests/rocm_smi_test/test_base.cc | 1 + 5 files changed, 413 insertions(+), 237 deletions(-) diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 3092e237dc..b3eff5d9fb 100755 --- 
a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -95,17 +95,31 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--build-id=sha1") endif() +# Use this instead of above for 32 bit +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") + +if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") +else () + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") +endif () + ## Address Sanitize Flag if (${ADDRESS_SANITIZER}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -g") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -g -fno-omit-frame-pointer") set(CMAKE_EXE_LINKER_FLAGS -fsanitize=address) - if (BUILD_SHARED_LIBS}) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -shared-libsan" ) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -shared-libsan" ) + if (BUILD_SHARED_LIBS) + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -shared-libasan") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -shared-libasan") + endif() else () - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libsan" ) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libsan" ) + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libsan") + else() + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libasan") + endif() endif () else () ## Security breach mitigation flags @@ -118,21 +132,12 @@ else () "${CMAKE_CXX_FLAGS} -Wtrampolines -Wl,-z,now") endif () -# Use this instead of above for 32 bit -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") - -if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -else () - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") -endif () - set(COMMON_SRC_DIR "${PROJECT_SOURCE_DIR}/src") set(COMMON_INC_DIR 
"${PROJECT_SOURCE_DIR}/include/rocm_smi") set(SHR_MUTEX_DIR "${PROJECT_SOURCE_DIR}/third_party/shared_mutex") include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/third_party/shared_mutex) + ${CMAKE_CURRENT_SOURCE_DIR}/third_party/shared_mutex) set(CMN_SRC_LIST "${COMMON_SRC_DIR}/rocm_smi_device.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_main.cc") diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index 546d72398c..0260104b7b 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -745,8 +745,8 @@ auto print_error_or_value(rsmi_status_t status_code, const T& metric) { return str_values; } else if constexpr ((std::is_same_v) || - (std::is_same_v) || - (std::is_same_v)) { + (std::is_same_v) || + (std::is_same_v)) { return std::to_string(metric); } } diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index d78ac0d077..702d6d6ae8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -6375,19 +6375,24 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm); amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); - std::copy_n(std::begin(tmp_hbl_tbl), max_num_elems, *temp_hbm_value); - } + const auto max_num_elems = + static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_hbl_tbl.size()) ? 
max_num_elems : tmp_hbl_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_hbl_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_hbl_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(temp_hbm_value, 0, sizeof(temp_hbm_value)); + std::copy_n(std::begin(tmp_hbl_tbl), copy_size, *temp_hbm_value); + } return status_code; CATCH @@ -6409,19 +6414,24 @@ rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_a const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); - std::copy_n(std::begin(tmp_vcn_tbl), max_num_elems, *vcn_activity_value); - } + const auto max_num_elems = + static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_vcn_tbl.size()) ? 
max_num_elems : tmp_vcn_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_vcn_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_vcn_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(vcn_activity_value, 0, sizeof(vcn_activity_value)); + std::copy_n(std::begin(tmp_vcn_tbl), copy_size, *vcn_activity_value); + } return status_code; CATCH @@ -6443,19 +6453,24 @@ rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator); amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_read_data_acc_value); - } + const auto max_num_elems = + static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_xgmi_acc_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(xgmi_read_data_acc_value, 0, sizeof(xgmi_read_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_read_data_acc_value); + } return status_code; CATCH @@ -6477,19 +6492,24 @@ rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_ const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator); amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_write_data_acc_value); - } + const auto max_num_elems = + static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_xgmi_acc_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(xgmi_write_data_acc_value, 0, sizeof(xgmi_write_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_write_data_acc_value); + } return status_code; CATCH @@ -6511,19 +6531,24 @@ rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock); amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value)); - std::copy_n(std::begin(tmp_curr_gfxclk_tbl), max_num_elems, *current_gfxclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_gfxclk_tbl.size()) ? 
max_num_elems : tmp_curr_gfxclk_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_gfxclk_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_gfxclk_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_gfxclk_value, 0, sizeof(current_gfxclk_value)); + std::copy_n(std::begin(tmp_curr_gfxclk_tbl), copy_size, *current_gfxclk_value); + } return status_code; CATCH @@ -6545,19 +6570,23 @@ rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock); amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); - std::copy_n(std::begin(tmp_curr_socclk_tbl), max_num_elems, *current_socclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_socclk_tbl.size()) ? 
max_num_elems : tmp_curr_socclk_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_socclk_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_socclk_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_socclk_value, 0, sizeof(current_socclk_value)); + std::copy_n(std::begin(tmp_curr_socclk_tbl), copy_size, *current_socclk_value); + } return status_code; CATCH @@ -6579,19 +6608,24 @@ rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_v const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0); amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); - std::copy_n(std::begin(tmp_curr_vclk0_tbl), max_num_elems, *current_vclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_vclk0_tbl.size()) ? 
max_num_elems : tmp_curr_vclk0_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_vclk0_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_vclk0_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_vclk_value, 0, sizeof(current_vclk_value)); + std::copy_n(std::begin(tmp_curr_vclk0_tbl), copy_size, *current_vclk_value); + } return status_code; CATCH @@ -6642,6 +6676,8 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl); const auto max_num_elems = static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_dclk0_tbl.size()) ? 
max_num_elems : tmp_curr_dclk0_tbl.size()); ostrstream << __PRETTY_FUNCTION__ << "\n | ======= end ======= " << "\n | End Result " @@ -6649,11 +6685,12 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d << "\n | Metric Type: " << static_cast(gpu_metric_unit) << "\n | Metric Size: " << tmp_curr_dclk0_tbl.size() << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::copy_n(std::begin(tmp_curr_dclk0_tbl), max_num_elems, *current_dclk_value); + std::memset(current_dclk_value, 0, sizeof(current_dclk_value)); + std::copy_n(std::begin(tmp_curr_dclk0_tbl), copy_size, *current_dclk_value); } return status_code; diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc index 70a6028d18..30c6d1fdaa 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -52,6 +52,7 @@ #include "gtest/gtest.h" #include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi_test/functional/gpu_metrics_read.h" #include "rocm_smi_test/test_common.h" @@ -87,6 +88,38 @@ void TestGpuMetricsRead::Close() { } +using GPUMetricResults_t = std::map; +GPUMetricResults_t MetricResults{}; + +template +auto print_error_or_value(std::string title, std::string func_name, const T& metric) { + auto str_values = title; + const auto status_code = MetricResults.at(func_name); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + if constexpr (std::is_array_v) { + auto idx = uint16_t(0); + + const auto num_elems = static_cast(std::end(metric) - std::begin(metric)); + str_values += ("\n\t\t num of values: " + std::to_string(num_elems) + 
"\n"); + for (const auto& el : metric) { + str_values += "\t\t [" + std::to_string(idx) + "]: " + std::to_string(el) + "\n"; + ++idx; + } + return str_values; + } + else if constexpr ((std::is_same_v) || + (std::is_same_v) || + (std::is_same_v)) { + + return str_values += std::to_string(metric); + } + } + else { + return str_values += ("\n\t\tStatus: [" + std::to_string(status_code) + "] " + "-> " + amd::smi::getRSMIStatusString(status_code)); + } +}; + + void TestGpuMetricsRead::Run(void) { rsmi_status_t err; @@ -115,74 +148,89 @@ void TestGpuMetricsRead::Run(void) { } else { CHK_ERR_ASRT(err); IF_VERB(STANDARD) { - std::cout << std::dec << "system_clock_counter=" - << smu.system_clock_counter << '\n'; - std::cout << std::dec << "temperature_edge=" - << smu.temperature_edge << '\n'; - std::cout << std::dec << "temperature_hotspot=" - << smu.temperature_hotspot << '\n'; - std::cout << std::dec << "temperature_mem=" - << smu.temperature_mem << '\n'; - std::cout << std::dec << "temperature_vrgfx=" - << smu.temperature_vrgfx << '\n'; - std::cout << std::dec << "temperature_vrsoc=" - << smu.temperature_vrsoc << '\n'; - std::cout << std::dec << "temperature_vrmem=" - << smu.temperature_vrmem << '\n'; - std::cout << std::dec << "average_gfx_activity=" - << smu.average_gfx_activity << '\n'; - std::cout << std::dec << "average_umc_activity=" - << smu.average_umc_activity << '\n'; - std::cout << std::dec << "average_mm_activity=" - << smu.average_mm_activity << '\n'; - std::cout << std::dec << "average_socket_power=" - << smu.average_socket_power << '\n'; - std::cout << std::dec << "energy_accumulator=" - << smu.energy_accumulator << '\n'; - std::cout << std::dec << "average_gfxclk_frequency=" - << smu.average_gfxclk_frequency << '\n'; - std::cout << std::dec << "average_gfxclk_frequency=" - << smu.average_gfxclk_frequency << '\n'; - std::cout << std::dec << "average_uclk_frequency=" - << smu.average_uclk_frequency << '\n'; - std::cout << std::dec << 
"average_vclk0_frequency=" - << smu.average_vclk0_frequency << '\n'; - std::cout << std::dec << "average_dclk0_frequency=" - << smu.average_dclk0_frequency << '\n'; - std::cout << std::dec << "average_vclk1_frequency=" - << smu.average_vclk1_frequency << '\n'; - std::cout << std::dec << "average_dclk1_frequency=" - << smu.average_dclk1_frequency << '\n'; - std::cout << std::dec << "current_gfxclk=" - << smu.current_gfxclk << '\n'; - std::cout << std::dec << "current_socclk=" - << smu.current_socclk << '\n'; - std::cout << std::dec << "current_uclk=" - << smu.current_uclk << '\n'; - std::cout << std::dec << "current_vclk0=" - << smu.current_vclk0 << '\n'; - std::cout << std::dec << "current_dclk0=" - << smu.current_dclk0 << '\n'; - std::cout << std::dec << "current_vclk1=" - << smu.current_vclk1 << '\n'; - std::cout << std::dec << "current_dclk1=" - << smu.current_dclk1 << '\n'; - std::cout << std::dec << "throttle_status=" - << smu.throttle_status << '\n'; - std::cout << std::dec << "current_fan_speed=" - << smu.current_fan_speed << '\n'; - std::cout << "pcie_link_width=" - << std::to_string(smu.pcie_link_width) << '\n'; - std::cout << "pcie_link_width=" - << std::to_string(smu.pcie_link_speed) << '\n'; - std::cout << "gfx_activity_acc=" - << std::dec << smu.gfx_activity_acc << '\n'; - std::cout << "mem_activity_acc=" - << std::dec << smu.mem_activity_acc << '\n'; + std::cout << std::dec << "\tsystem_clock_counter=" << smu.system_clock_counter << '\n'; + std::cout << std::dec << "\ttemperature_edge=" << smu.temperature_edge << '\n'; + std::cout << std::dec << "\ttemperature_hotspot=" << smu.temperature_hotspot << '\n'; + std::cout << std::dec << "\ttemperature_mem=" << smu.temperature_mem << '\n'; + std::cout << std::dec << "\ttemperature_vrgfx=" << smu.temperature_vrgfx << '\n'; + std::cout << std::dec << "\ttemperature_vrsoc=" << smu.temperature_vrsoc << '\n'; + std::cout << std::dec << "\ttemperature_vrmem=" << smu.temperature_vrmem << '\n'; + std::cout << 
std::dec << "\taverage_gfx_activity=" << smu.average_gfx_activity << '\n'; + std::cout << std::dec << "\taverage_umc_activity=" << smu.average_umc_activity << '\n'; + std::cout << std::dec << "\taverage_mm_activity=" << smu.average_mm_activity << '\n'; + std::cout << std::dec << "\taverage_socket_power=" << smu.average_socket_power << '\n'; + std::cout << std::dec << "\tenergy_accumulator=" << smu.energy_accumulator << '\n'; + std::cout << std::dec << "\taverage_gfxclk_frequency=" << smu.average_gfxclk_frequency << '\n'; + std::cout << std::dec << "\taverage_uclk_frequency=" << smu.average_uclk_frequency << '\n'; + std::cout << std::dec << "\taverage_vclk0_frequency=" << smu.average_vclk0_frequency << '\n'; + std::cout << std::dec << "\taverage_dclk0_frequency=" << smu.average_dclk0_frequency << '\n'; + std::cout << std::dec << "\taverage_vclk1_frequency=" << smu.average_vclk1_frequency << '\n'; + std::cout << std::dec << "\taverage_dclk1_frequency=" << smu.average_dclk1_frequency << '\n'; + std::cout << std::dec << "\tcurrent_gfxclk=" << smu.current_gfxclk << '\n'; + std::cout << std::dec << "\tcurrent_socclk=" << smu.current_socclk << '\n'; + std::cout << std::dec << "\tcurrent_uclk=" << smu.current_uclk << '\n'; + std::cout << std::dec << "\tcurrent_vclk0=" << smu.current_vclk0 << '\n'; + std::cout << std::dec << "\tcurrent_dclk0=" << smu.current_dclk0 << '\n'; + std::cout << std::dec << "\tcurrent_vclk1=" << smu.current_vclk1 << '\n'; + std::cout << std::dec << "\tcurrent_dclk1=" << smu.current_dclk1 << '\n'; + std::cout << std::dec << "\tthrottle_status=" << smu.throttle_status << '\n'; + std::cout << std::dec << "\tcurrent_fan_speed=" << smu.current_fan_speed << '\n'; + std::cout << std::dec << "\tpcie_link_width=" << smu.pcie_link_width << '\n'; + std::cout << std::dec << "\tpcie_link_speed=" << smu.pcie_link_speed << '\n'; + std::cout << std::dec << "\tgfx_activity_acc=" << std::dec << smu.gfx_activity_acc << '\n'; + std::cout << std::dec << 
"\tmem_activity_acc=" << std::dec << smu.mem_activity_acc << '\n'; for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) { - std::cout << "temperature_hbm[" << i << "]=" << std::dec << - smu.temperature_hbm[i] << '\n'; + std::cout << "\ttemperature_hbm[" << i << "]=" << std::dec << smu.temperature_hbm[i] << '\n'; + } + std::cout << "\n"; + std::cout << "\tfirmware_timestamp=" << std::dec << smu.firmware_timestamp << '\n'; + std::cout << "\tvoltage_soc=" << std::dec << smu.voltage_soc << '\n'; + std::cout << "\tvoltage_gfx=" << std::dec << smu.voltage_gfx << '\n'; + std::cout << "\tvoltage_mem=" << std::dec << smu.voltage_mem << '\n'; + std::cout << "\tindep_throttle_status=" << std::dec << smu.indep_throttle_status << '\n'; + std::cout << "\tcurrent_socket_power=" << std::dec << smu.current_socket_power << '\n'; + + for (int i = 0; i < RSMI_MAX_NUM_VCN; ++i) { + std::cout << "\tvcn_activity[" << i << "]=" << std::dec << smu.vcn_activity[i] << '\n'; + } + + std::cout << "\n"; + std::cout << "\tgfxclk_lock_status=" << std::dec << smu.gfxclk_lock_status << '\n'; + std::cout << "\txgmi_link_width=" << std::dec << smu.xgmi_link_width << '\n'; + std::cout << "\txgmi_link_speed=" << std::dec << smu.xgmi_link_speed << '\n'; + std::cout << "\tpcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << '\n'; + std::cout << "\tpcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << '\n'; + std::cout << "\tpcie_l0_to_recov_count_acc=" << std::dec << smu.pcie_l0_to_recov_count_acc << '\n'; + std::cout << "\tpcie_replay_count_acc=" << std::dec << smu.pcie_replay_count_acc << '\n'; + std::cout << "\tpcie_replay_rover_count_acc=" << std::dec << smu.pcie_replay_rover_count_acc << '\n'; + for (int i = 0; i < RSMI_MAX_NUM_XGMI_LINKS; ++i) { + std::cout << "\txgmi_read_data_acc[" << i << "]=" << std::dec << smu.xgmi_read_data_acc[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_XGMI_LINKS; ++i) { + std::cout << "\txgmi_write_data_acc[" << i << "]=" << 
std::dec << smu.xgmi_write_data_acc[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_GFX_CLKS; ++i) { + std::cout << "\tcurrent_gfxclks[" << i << "]=" << std::dec << smu.current_gfxclks[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) { + std::cout << "\tcurrent_socclks[" << i << "]=" << std::dec << smu.current_socclks[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) { + std::cout << "\tcurrent_vclk0s[" << i << "]=" << std::dec << smu.current_vclk0s[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) { + std::cout << "\tcurrent_dclk0s[" << i << "]=" << std::dec << smu.current_dclk0s[i] << '\n'; } } } @@ -198,6 +246,8 @@ void TestGpuMetricsRead::Run(void) { auto val_ui32 = uint32_t(0); auto val_ui64 = uint64_t(0); auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + + std::cout << "\n\t**GPU METRICS: Using direct APIs (newer):\n"; for (uint32_t i = 0; i < num_monitor_devs(); ++i) { PrintDeviceHeader(i); @@ -206,363 +256,446 @@ void TestGpuMetricsRead::Run(void) { if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_temp_edge_get", status_code); auto temp_hotspot_value = val_ui16; status_code = rsmi_dev_metrics_temp_hotspot_get(i, &temp_hotspot_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_hotspot_get", status_code); auto temp_mem_value = val_ui16; status_code = rsmi_dev_metrics_temp_mem_get(i, &temp_mem_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_mem_get", status_code); auto temp_vrgfx_value = val_ui16; status_code = rsmi_dev_metrics_temp_vrgfx_get(i, &temp_vrgfx_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { 
CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_temp_vrgfx_get", status_code); auto temp_vrsoc_value = val_ui16; status_code = rsmi_dev_metrics_temp_vrsoc_get(i, &temp_vrsoc_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_vrsoc_get", status_code); auto temp_vrmem_value = val_ui16; status_code = rsmi_dev_metrics_temp_vrmem_get(i, &temp_vrmem_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_temp_vrmem_get", status_code); - GPUMetricTempHbm_t temp_hbm_values; + GPUMetricTempHbm_t temp_hbm_values{}; status_code = rsmi_dev_metrics_temp_hbm_get(i, &temp_hbm_values); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_hbm_get", status_code); auto temp_curr_socket_power_value = val_ui16; status_code = rsmi_dev_metrics_curr_socket_power_get(i, &temp_curr_socket_power_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_socket_power_get", status_code); auto temp_energy_accum_value = val_ui64; status_code = rsmi_dev_metrics_energy_acc_get(i, &temp_energy_accum_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_energy_acc_get", status_code); auto temp_avg_socket_power_value = val_ui16; status_code = rsmi_dev_metrics_avg_socket_power_get(i, &temp_avg_socket_power_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_socket_power_get", status_code); auto temp_avg_gfx_activity_value = val_ui16; status_code = rsmi_dev_metrics_avg_gfx_activity_get(i, &temp_avg_gfx_activity_value); - 
CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_avg_gfx_activity_get", status_code); auto temp_avg_umc_activity_value = val_ui16; status_code = rsmi_dev_metrics_avg_umc_activity_get(i, &temp_avg_umc_activity_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_avg_umc_activity_get", status_code); auto temp_avg_mm_activity_value = val_ui16; status_code = rsmi_dev_metrics_avg_mm_activity_get(i, &temp_avg_mm_activity_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_mm_activity_get", status_code); - GPUMetricVcnActivity_t temp_vcn_values; + GPUMetricVcnActivity_t temp_vcn_values{}; status_code = rsmi_dev_metrics_vcn_activity_get(i, &temp_vcn_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_vcn_activity_get", status_code); auto temp_mem_activity_accum_value = val_ui32; status_code = rsmi_dev_metrics_mem_activity_acc_get(i, &temp_mem_activity_accum_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_mem_activity_acc_get", status_code); auto temp_gfx_activity_accum_value = val_ui32; status_code = rsmi_dev_metrics_gfx_activity_acc_get(i, &temp_gfx_activity_accum_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_gfx_activity_acc_get", status_code); auto temp_avg_gfx_clock_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_gfx_clock_frequency_get(i, &temp_avg_gfx_clock_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + 
MetricResults.emplace("rsmi_dev_metrics_avg_gfx_clock_frequency_get", status_code); auto temp_avg_soc_clock_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_soc_clock_frequency_get(i, &temp_avg_soc_clock_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_soc_clock_frequency_get", status_code); auto temp_avg_uclock_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_uclock_frequency_get(i, &temp_avg_uclock_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_uclock_frequency_get", status_code); auto temp_avg_vclock0_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_vclock0_frequency_get(i, &temp_avg_vclock0_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_vclock0_frequency_get", status_code); auto temp_avg_dclock0_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_dclock0_frequency_get(i, &temp_avg_dclock0_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_dclock0_frequency_get", status_code); auto temp_avg_vclock1_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_vclock1_frequency_get(i, &temp_avg_vclock1_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_vclock1_frequency_get", status_code); auto temp_avg_dclock1_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_dclock1_frequency_get(i, &temp_avg_dclock1_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_dclock1_frequency_get", status_code); auto temp_curr_vclk1_value = val_ui16; status_code = rsmi_dev_metrics_curr_vclk1_get(i, &temp_curr_vclk1_value); if (status_code 
!= RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_vclk1_get", status_code); auto temp_curr_dclk1_value = val_ui16; status_code = rsmi_dev_metrics_curr_dclk1_get(i, &temp_curr_dclk1_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_dclk1_get", status_code); auto temp_curr_uclk_value = val_ui16; status_code = rsmi_dev_metrics_curr_uclk_get(i, &temp_curr_uclk_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_curr_uclk_get", status_code); - GPUMetricCurrDClk0_t temp_curr_dclk0_values; + GPUMetricCurrDClk0_t temp_curr_dclk0_values{}; status_code = rsmi_dev_metrics_curr_dclk0_get(i, &temp_curr_dclk0_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_dclk0_get", status_code); - GPUMetricCurrGfxClk_t temp_curr_gfxclk_values; + GPUMetricCurrGfxClk_t temp_curr_gfxclk_values{}; status_code = rsmi_dev_metrics_curr_gfxclk_get(i, &temp_curr_gfxclk_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_gfxclk_get", status_code); - GPUMetricCurrSocClk_t temp_curr_socclk_values; + GPUMetricCurrSocClk_t temp_curr_socclk_values{}; status_code = rsmi_dev_metrics_curr_socclk_get(i, &temp_curr_socclk_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_socclk_get", status_code); - GPUMetricCurrVClk0_t temp_curr_vclk0_values; + GPUMetricCurrVClk0_t temp_curr_vclk0_values{}; status_code = rsmi_dev_metrics_curr_vclk0_get(i, &temp_curr_vclk0_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_vclk0_get", status_code); auto 
temp_indep_throttle_status_value = val_ui64; status_code = rsmi_dev_metrics_indep_throttle_status_get(i, &temp_indep_throttle_status_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_indep_throttle_status_get", status_code); auto temp_throttle_status_value = val_ui32; status_code = rsmi_dev_metrics_throttle_status_get(i, &temp_throttle_status_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_throttle_status_get", status_code); auto temp_gfxclk_lock_status_value = val_ui32; status_code = rsmi_dev_metrics_gfxclk_lock_status_get(i, &temp_gfxclk_lock_status_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_gfxclk_lock_status_get", status_code); + auto temp_curr_fan_speed_value = val_ui16; status_code = rsmi_dev_metrics_curr_fan_speed_get(i, &temp_curr_fan_speed_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_fan_speed_get", status_code); auto temp_pcie_link_width_value = val_ui16; status_code = rsmi_dev_metrics_pcie_link_width_get(i, &temp_pcie_link_width_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_pcie_link_width_get", status_code); auto temp_pcie_link_speed_value = val_ui16; status_code = rsmi_dev_metrics_pcie_link_speed_get(i, &temp_pcie_link_speed_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_pcie_link_speed_get", status_code); auto temp_pcie_bandwidth_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_bandwidth_acc_get(i, &temp_pcie_bandwidth_accum_value); if (status_code != 
RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_bandwidth_acc_get", status_code); auto temp_pcie_bandwidth_inst_value = val_ui64; status_code = rsmi_dev_metrics_pcie_bandwidth_inst_get(i, &temp_pcie_bandwidth_inst_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_bandwidth_inst_get", status_code); auto temp_pcie_l0_recov_count_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_l0_recov_count_acc_get(i, &temp_pcie_l0_recov_count_accum_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_l0_recov_count_acc_get", status_code); auto temp_pcie_replay_count_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_replay_count_acc_get(i, &temp_pcie_replay_count_accum_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_replay_count_acc_get", status_code); auto temp_pcie_replay_rover_count_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_replay_rover_count_acc_get(i, &temp_pcie_replay_rover_count_accum_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_replay_rover_count_acc_get", status_code); auto temp_xgmi_link_width_value = val_ui16; status_code = rsmi_dev_metrics_xgmi_link_width_get(i, &temp_xgmi_link_width_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_link_width_get", status_code); auto temp_xgmi_link_speed_value = val_ui16; status_code = rsmi_dev_metrics_xgmi_link_speed_get(i, &temp_xgmi_link_speed_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_link_speed_get", status_code); - GPUMetricXgmiReadDataAcc_t 
temp_xgmi_read_values; + GPUMetricXgmiReadDataAcc_t temp_xgmi_read_values{}; status_code = rsmi_dev_metrics_xgmi_read_data_get(i, &temp_xgmi_read_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_read_data_get", status_code); - GPUMetricXgmiWriteDataAcc_t temp_xgmi_write_values; + GPUMetricXgmiWriteDataAcc_t temp_xgmi_write_values{}; status_code = rsmi_dev_metrics_xgmi_write_data_get(i, &temp_xgmi_write_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_write_data_get", status_code); auto temp_voltage_soc_value = val_ui16; status_code = rsmi_dev_metrics_volt_soc_get(i, &temp_voltage_soc_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_volt_soc_get", status_code); auto temp_voltage_gfx_value = val_ui16; status_code = rsmi_dev_metrics_volt_gfx_get(i, &temp_voltage_gfx_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_volt_gfx_get", status_code); auto temp_voltage_mem_value = val_ui16; status_code = rsmi_dev_metrics_volt_mem_get(i, &temp_voltage_mem_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_volt_mem_get", status_code); auto temp_system_clock_counter_value = val_ui64; status_code = rsmi_dev_metrics_system_clock_counter_get(i, &temp_system_clock_counter_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_system_clock_counter_get", status_code); auto temp_firmware_timestamp_value = val_ui64; status_code = rsmi_dev_metrics_firmware_timestamp_get(i, &temp_firmware_timestamp_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + 
CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_firmware_timestamp_get", status_code); auto temp_xcd_counter_value = val_ui16; status_code = rsmi_dev_metrics_xcd_counter_get(i, &temp_xcd_counter_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xcd_counter_get", status_code); IF_VERB(STANDARD) { std::cout << "\n"; std::cout << "\t[Temperature]" << "\n"; - std::cout << "\t -> temp_edge(): " << temp_edge_value << "\n"; - std::cout << "\t -> temp_hotspot(): " << temp_hotspot_value << "\n"; - std::cout << "\t -> temp_mem(): " << temp_mem_value << "\n"; - std::cout << "\t -> temp_vrgfx(): " << temp_vrgfx_value << "\n"; - std::cout << "\t -> temp_vrsoc(): " << temp_vrsoc_value << "\n"; - std::cout << "\t -> temp_vrmem(): " << temp_vrmem_value << "\n"; - std::cout << "\t -> temp_hbm(): " << temp_hbm_values << "\n"; + std::cout << print_error_or_value("\t -> temp_edge(): ", "rsmi_dev_metrics_temp_edge_get", temp_edge_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_hotspot(): ", "rsmi_dev_metrics_temp_hotspot_get", temp_hotspot_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_mem(): ", "rsmi_dev_metrics_temp_mem_get", temp_mem_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_vrgfx(): ", "rsmi_dev_metrics_temp_vrgfx_get", temp_vrgfx_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_vrsoc(): ", "rsmi_dev_metrics_temp_vrsoc_get", temp_vrsoc_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_vrmem(): ", "rsmi_dev_metrics_temp_vrmem_get", temp_vrmem_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_hbm[]: ", "rsmi_dev_metrics_temp_hbm_get", temp_hbm_values) << "\n"; std::cout << "\n"; std::cout << "\t[Power/Energy]" << "\n"; - std::cout << "\t -> current_socket_power(): " << temp_curr_socket_power_value << "\n"; - std::cout << "\t -> energy_accum(): " << temp_energy_accum_value << "\n"; - 
std::cout << "\t -> average_socket_power(): " << temp_avg_socket_power_value << "\n"; + std::cout << print_error_or_value("\t -> current_socket_power(): ", "rsmi_dev_metrics_curr_socket_power_get", temp_curr_socket_power_value) << "\n"; + std::cout << print_error_or_value("\t -> energy_accum(): ", "rsmi_dev_metrics_energy_acc_get", temp_energy_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> average_socket_power(): ", "rsmi_dev_metrics_avg_socket_power_get", temp_avg_socket_power_value) << "\n"; std::cout << "\n"; std::cout << "\t[Utilization]" << "\n"; - std::cout << "\t -> average_gfx_activity(): " << temp_avg_gfx_activity_value << "\n"; - std::cout << "\t -> average_umc_activity(): " << temp_avg_umc_activity_value << "\n"; - std::cout << "\t -> average_mm_activity(): " << temp_avg_mm_activity_value << "\n"; - std::cout << "\t -> vcn_activity(): " << temp_vcn_values << "\n"; - std::cout << "\t -> mem_activity_accum(): " << temp_mem_activity_accum_value << "\n"; - std::cout << "\t -> gfx_activity_accum(): " << temp_gfx_activity_accum_value << "\n"; + std::cout << print_error_or_value("\t -> average_gfx_activity(): ", "rsmi_dev_metrics_avg_gfx_activity_get", temp_avg_gfx_activity_value) << "\n"; + std::cout << print_error_or_value("\t -> average_umc_activity(): ", "rsmi_dev_metrics_avg_umc_activity_get", temp_avg_umc_activity_value) << "\n"; + std::cout << print_error_or_value("\t -> average_mm_activity(): ", "rsmi_dev_metrics_avg_mm_activity_get", temp_avg_mm_activity_value) << "\n"; + std::cout << print_error_or_value("\t -> vcn_activity[]: ", "rsmi_dev_metrics_vcn_activity_get", temp_vcn_values) << "\n"; + std::cout << "\n"; + std::cout << print_error_or_value("\t -> mem_activity_accum(): ", "rsmi_dev_metrics_mem_activity_acc_get", temp_mem_activity_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> gfx_activity_accum(): ", "rsmi_dev_metrics_gfx_activity_acc_get", temp_gfx_activity_accum_value) << "\n"; std::cout << "\n"; std::cout 
<< "\t[Average Clock]" << "\n"; - std::cout << "\t -> average_gfx_clock_frequency(): " << temp_avg_gfx_clock_freq_value << "\n"; - std::cout << "\t -> average_soc_clock_frequency(): " << temp_avg_soc_clock_freq_value << "\n"; - std::cout << "\t -> average_uclock_frequency(): " << temp_avg_uclock_freq_value << "\n"; - std::cout << "\t -> average_vclock0_frequency(): " << temp_avg_vclock0_freq_value << "\n"; - std::cout << "\t -> average_dclock0_frequency(): " << temp_avg_dclock0_freq_value << "\n"; - std::cout << "\t -> average_vclock1_frequency(): " << temp_avg_vclock1_freq_value << "\n"; - std::cout << "\t -> average_dclock1_frequency(): " << temp_avg_dclock1_freq_value << "\n"; + std::cout << print_error_or_value("\t -> average_gfx_clock_frequency(): ", "rsmi_dev_metrics_avg_gfx_clock_frequency_get", temp_avg_gfx_clock_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_soc_clock_frequency(): ", "rsmi_dev_metrics_avg_soc_clock_frequency_get", temp_avg_soc_clock_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_uclock_frequency(): ", "rsmi_dev_metrics_avg_uclock_frequency_get", temp_avg_uclock_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_vclock0_frequency(): ", "rsmi_dev_metrics_avg_vclock0_frequency_get", temp_avg_vclock0_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_dclock0_frequency(): ", "rsmi_dev_metrics_avg_dclock0_frequency_get", temp_avg_dclock0_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_vclock1_frequency(): ", "rsmi_dev_metrics_avg_vclock1_frequency_get", temp_avg_vclock1_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_dclock1_frequency(): ", "rsmi_dev_metrics_avg_dclock1_frequency_get", temp_avg_dclock1_freq_value) << "\n"; std::cout << "\n"; std::cout << "\t[Current Clock]" << "\n"; - std::cout << "\t -> current_vclock1(): " << temp_curr_vclk1_value << "\n"; - std::cout << "\t -> current_dclock1(): " << 
temp_curr_dclk1_value << "\n"; - std::cout << "\t -> current_uclock(): " << temp_curr_uclk_value << "\n"; - std::cout << "\t -> current_dclk0(): " << temp_curr_dclk0_values << "\n"; - std::cout << "\t -> current_gfxclk(): " << temp_curr_gfxclk_values << "\n"; - std::cout << "\t -> current_soc_clock(): " << temp_curr_socclk_values << "\n"; - std::cout << "\t -> current_vclk0(): " << temp_curr_vclk0_values << "\n"; + std::cout << print_error_or_value("\t -> current_vclock1(): ", "rsmi_dev_metrics_curr_vclk1_get", temp_curr_vclk1_value) << "\n"; + std::cout << print_error_or_value("\t -> current_dclock1(): ", "rsmi_dev_metrics_curr_dclk1_get", temp_curr_dclk1_value) << "\n"; + std::cout << print_error_or_value("\t -> current_uclock(): ", "rsmi_dev_metrics_curr_uclk_get", temp_curr_uclk_value) << "\n"; + std::cout << print_error_or_value("\t -> current_dclk0[]: ", "rsmi_dev_metrics_curr_dclk0_get", temp_curr_dclk0_values) << "\n"; + std::cout << print_error_or_value("\t -> current_gfxclk[]: ", "rsmi_dev_metrics_curr_gfxclk_get", temp_curr_gfxclk_values) << "\n"; + std::cout << print_error_or_value("\t -> current_soc_clock[]: ", "rsmi_dev_metrics_curr_socclk_get", temp_curr_socclk_values) << "\n"; + std::cout << print_error_or_value("\t -> current_vclk0[]: ", "rsmi_dev_metrics_curr_vclk0_get", temp_curr_vclk0_values) << "\n"; std::cout << "\n"; std::cout << "\t[Throttle]" << "\n"; - std::cout << "\t -> indep_throttle_status(): " << temp_indep_throttle_status_value << "\n"; - std::cout << "\t -> throttle_status(): " << temp_throttle_status_value << "\n"; + std::cout << print_error_or_value("\t -> indep_throttle_status(): ", "rsmi_dev_metrics_indep_throttle_status_get", temp_indep_throttle_status_value) << "\n"; + std::cout << print_error_or_value("\t -> throttle_status(): ", "rsmi_dev_metrics_throttle_status_get", temp_throttle_status_value) << "\n"; std::cout << "\n"; std::cout << "\t[Gfx Clock Lock]" << "\n"; - std::cout << "\t -> gfxclk_lock_status(): " << 
temp_gfxclk_lock_status_value << "\n"; + std::cout << print_error_or_value("\t -> gfxclk_lock_status(): ", "rsmi_dev_metrics_gfxclk_lock_status_get", temp_gfxclk_lock_status_value) << "\n"; std::cout << "\n"; std::cout << "\t[Current Fan Speed]" << "\n"; - std::cout << "\t -> current_fan_speed(): " << temp_curr_fan_speed_value << "\n"; + std::cout << print_error_or_value("\t -> current_fan_speed(): ", "rsmi_dev_metrics_curr_fan_speed_get", temp_curr_fan_speed_value) << "\n"; std::cout << "\n"; std::cout << "\t[Link/Bandwidth/Speed]" << "\n"; - std::cout << "\t -> pcie_link_width(): " << temp_pcie_link_width_value << "\n"; - std::cout << "\t -> pcie_link_speed(): " << temp_pcie_link_speed_value << "\n"; - std::cout << "\t -> pcie_bandwidth_accum(): " << temp_pcie_bandwidth_accum_value << "\n"; - std::cout << "\t -> pcie_bandwidth_inst(): " << temp_pcie_bandwidth_inst_value << "\n"; - std::cout << "\t -> pcie_l0_recov_count_accum(): " << temp_pcie_l0_recov_count_accum_value << "\n"; - std::cout << "\t -> pcie_replay_count_accum(): " << temp_pcie_replay_count_accum_value << "\n"; - std::cout << "\t -> pcie_replay_rollover_count_accum(): " << temp_pcie_replay_rover_count_accum_value << "\n"; - std::cout << "\t -> xgmi_link_width(): " << temp_xgmi_link_width_value << "\n"; - std::cout << "\t -> xgmi_link_speed(): " << temp_xgmi_link_speed_value << "\n"; - std::cout << "\t -> xgmi_read_data(): " << temp_xgmi_read_values << "\n"; - std::cout << "\t -> xgmi_write_data(): " << temp_xgmi_write_values << "\n"; + std::cout << print_error_or_value("\t -> pcie_link_width(): ", "rsmi_dev_metrics_pcie_link_width_get", temp_pcie_link_width_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_link_speed(): ", "rsmi_dev_metrics_pcie_link_speed_get", temp_pcie_link_speed_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_bandwidth_accum(): ", "rsmi_dev_metrics_pcie_bandwidth_acc_get", temp_pcie_bandwidth_accum_value) << "\n"; + std::cout << 
print_error_or_value("\t -> pcie_bandwidth_inst(): ", "rsmi_dev_metrics_pcie_bandwidth_inst_get", temp_pcie_bandwidth_inst_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_l0_recov_count_accum(): ", "rsmi_dev_metrics_pcie_l0_recov_count_acc_get", temp_pcie_l0_recov_count_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_replay_count_accum(): ", "rsmi_dev_metrics_pcie_replay_count_acc_get", temp_pcie_replay_count_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_replay_rollover_count_accum(): ", "rsmi_dev_metrics_pcie_replay_rover_count_acc_get", temp_pcie_replay_rover_count_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_link_width(): ", "rsmi_dev_metrics_xgmi_link_width_get", temp_xgmi_link_width_value) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_link_speed(): ", "rsmi_dev_metrics_xgmi_link_speed_get", temp_xgmi_link_speed_value) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_read_data[]: ", "rsmi_dev_metrics_xgmi_read_data_get", temp_xgmi_read_values) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_write_data[]: ", "rsmi_dev_metrics_xgmi_write_data_get", temp_xgmi_write_values) << "\n"; std::cout << "\n"; std::cout << "\t[Voltage]" << "\n"; - std::cout << "\t -> voltage_soc(): " << temp_voltage_soc_value << "\n"; - std::cout << "\t -> voltage_gfx(): " << temp_voltage_gfx_value << "\n"; - std::cout << "\t -> voltage_mem(): " << temp_voltage_mem_value << "\n"; + std::cout << print_error_or_value("\t -> voltage_soc(): ", "rsmi_dev_metrics_volt_soc_get", temp_voltage_soc_value) << "\n"; + std::cout << print_error_or_value("\t -> voltage_gfx(): ", "rsmi_dev_metrics_volt_gfx_get", temp_voltage_gfx_value) << "\n"; + std::cout << print_error_or_value("\t -> voltage_mem(): ", "rsmi_dev_metrics_volt_mem_get", temp_voltage_mem_value) << "\n"; std::cout << "\n"; std::cout << "\t[Timestamp]" << "\n"; - std::cout << "\t -> system_clock_counter(): " << 
temp_system_clock_counter_value << "\n"; - std::cout << "\t -> firmware_timestamp(): " << temp_firmware_timestamp_value << "\n"; + std::cout << print_error_or_value("\t -> system_clock_counter(): ", "rsmi_dev_metrics_system_clock_counter_get", temp_system_clock_counter_value) << "\n"; + std::cout << print_error_or_value("\t -> firmware_timestamp(): ", "rsmi_dev_metrics_firmware_timestamp_get", temp_firmware_timestamp_value) << "\n"; std::cout << "\n"; std::cout << "\t[XCD CounterVoltage]" << "\n"; - std::cout << "\t -> xcd_counter(): " << temp_xcd_counter_value << "\n"; + std::cout << print_error_or_value("\t -> xcd_counter(): ", "rsmi_dev_metrics_xcd_counter_get", temp_xcd_counter_value) << "\n"; std::cout << "\n\n"; } } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc index 0acce3a150..6a205faf56 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc @@ -122,6 +122,7 @@ void TestBase::PrintDeviceHeader(uint32_t dv_ind) { uint16_t val_ui16; IF_VERB(STANDARD) { + std::cout << "\n"; std::cout << "\t**Device index: " << dv_ind << std::endl; } err = rsmi_dev_id_get(dv_ind, &val_ui16);