diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 3092e237dc..b3eff5d9fb 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -95,17 +95,31 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--build-id=sha1") endif() +# Use this instead of above for 32 bit +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") + +if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") +else () + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") +endif () + ## Address Sanitize Flag if (${ADDRESS_SANITIZER}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -g") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -g -fno-omit-frame-pointer") set(CMAKE_EXE_LINKER_FLAGS -fsanitize=address) - if (BUILD_SHARED_LIBS}) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -shared-libsan" ) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -shared-libsan" ) + if (BUILD_SHARED_LIBS) + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -shared-libasan") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -shared-libasan") + endif() else () - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libsan" ) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libsan" ) + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libsan") + else() + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libasan") + endif() endif () else () ## Security breach mitigation flags @@ -118,21 +132,12 @@ else () "${CMAKE_CXX_FLAGS} -Wtrampolines -Wl,-z,now") endif () -# Use this instead of above for 32 bit -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") - -if ("${CMAKE_BUILD_TYPE}" STREQUAL Release) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") -else () - 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0 -DDEBUG") -endif () - set(COMMON_SRC_DIR "${PROJECT_SOURCE_DIR}/src") set(COMMON_INC_DIR "${PROJECT_SOURCE_DIR}/include/rocm_smi") set(SHR_MUTEX_DIR "${PROJECT_SOURCE_DIR}/third_party/shared_mutex") include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/third_party/shared_mutex) + ${CMAKE_CURRENT_SOURCE_DIR}/third_party/shared_mutex) set(CMN_SRC_LIST "${COMMON_SRC_DIR}/rocm_smi_device.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_main.cc") diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index 546d72398c..0260104b7b 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -745,8 +745,8 @@ auto print_error_or_value(rsmi_status_t status_code, const T& metric) { return str_values; } else if constexpr ((std::is_same_v) || - (std::is_same_v) || - (std::is_same_v)) { + (std::is_same_v) || + (std::is_same_v)) { return std::to_string(metric); } } diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index d78ac0d077..702d6d6ae8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -6375,19 +6375,24 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm); amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); - std::copy_n(std::begin(tmp_hbl_tbl), max_num_elems, *temp_hbm_value); - } + const auto max_num_elems = + static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); + const auto 
copy_size = + static_cast((max_num_elems < tmp_hbl_tbl.size()) ? max_num_elems : tmp_hbl_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_hbl_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_hbl_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(temp_hbm_value, 0, sizeof(*temp_hbm_value)); + std::copy_n(std::begin(tmp_hbl_tbl), copy_size, *temp_hbm_value); + } return status_code; CATCH @@ -6409,19 +6414,24 @@ rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_a const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); - std::copy_n(std::begin(tmp_vcn_tbl), max_num_elems, *vcn_activity_value); - } + const auto max_num_elems = + static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_vcn_tbl.size()) ? 
max_num_elems : tmp_vcn_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_vcn_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_vcn_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(vcn_activity_value, 0, sizeof(*vcn_activity_value)); + std::copy_n(std::begin(tmp_vcn_tbl), copy_size, *vcn_activity_value); + } return status_code; CATCH @@ -6443,19 +6453,24 @@ rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator); amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_read_data_acc_value); - } + const auto max_num_elems = + static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_xgmi_acc_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(xgmi_read_data_acc_value, 0, sizeof(*xgmi_read_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_read_data_acc_value); + } return status_code; CATCH @@ -6477,19 +6492,24 @@ rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_ const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator); amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_write_data_acc_value); - } + const auto max_num_elems = + static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_xgmi_acc_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(xgmi_write_data_acc_value, 0, sizeof(*xgmi_write_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_write_data_acc_value); + } return status_code; CATCH @@ -6511,19 +6531,24 @@ rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock); amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value)); - std::copy_n(std::begin(tmp_curr_gfxclk_tbl), max_num_elems, *current_gfxclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_gfxclk_tbl.size()) ? 
max_num_elems : tmp_curr_gfxclk_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_gfxclk_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_gfxclk_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_gfxclk_value, 0, sizeof(*current_gfxclk_value)); + std::copy_n(std::begin(tmp_curr_gfxclk_tbl), copy_size, *current_gfxclk_value); + } return status_code; CATCH @@ -6545,19 +6570,23 @@ rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock); amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); - std::copy_n(std::begin(tmp_curr_socclk_tbl), max_num_elems, *current_socclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_socclk_tbl.size()) ? 
max_num_elems : tmp_curr_socclk_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_socclk_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_socclk_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_socclk_value, 0, sizeof(*current_socclk_value)); + std::copy_n(std::begin(tmp_curr_socclk_tbl), copy_size, *current_socclk_value); + } return status_code; CATCH @@ -6579,19 +6608,24 @@ rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_v const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0); amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); - std::copy_n(std::begin(tmp_curr_vclk0_tbl), max_num_elems, *current_vclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_vclk0_tbl.size()) ? 
max_num_elems : tmp_curr_vclk0_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_vclk0_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_vclk0_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_vclk_value, 0, sizeof(*current_vclk_value)); + std::copy_n(std::begin(tmp_curr_vclk0_tbl), copy_size, *current_vclk_value); + } return status_code; CATCH @@ -6642,6 +6676,8 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl); const auto max_num_elems = static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_dclk0_tbl.size()) ? 
max_num_elems : tmp_curr_dclk0_tbl.size()); ostrstream << __PRETTY_FUNCTION__ << "\n | ======= end ======= " << "\n | End Result " @@ -6649,11 +6685,12 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d << "\n | Metric Type: " << static_cast(gpu_metric_unit) << "\n | Metric Size: " << tmp_curr_dclk0_tbl.size() << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::copy_n(std::begin(tmp_curr_dclk0_tbl), max_num_elems, *current_dclk_value); + std::memset(current_dclk_value, 0, sizeof(*current_dclk_value)); + std::copy_n(std::begin(tmp_curr_dclk0_tbl), copy_size, *current_dclk_value); } return status_code; diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc index 70a6028d18..30c6d1fdaa 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/gpu_metrics_read.cc @@ -52,6 +52,7 @@ #include "gtest/gtest.h" #include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi_test/functional/gpu_metrics_read.h" #include "rocm_smi_test/test_common.h" @@ -87,6 +88,38 @@ void TestGpuMetricsRead::Close() { } +using GPUMetricResults_t = std::map; +GPUMetricResults_t MetricResults{}; + +template +auto print_error_or_value(std::string title, std::string func_name, const T& metric) { + auto str_values = title; + const auto status_code = MetricResults.at(func_name); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + if constexpr (std::is_array_v) { + auto idx = uint16_t(0); + + const auto num_elems = static_cast(std::end(metric) - std::begin(metric)); + str_values += ("\n\t\t num of values: " + std::to_string(num_elems) + 
"\n"); + for (const auto& el : metric) { + str_values += "\t\t [" + std::to_string(idx) + "]: " + std::to_string(el) + "\n"; + ++idx; + } + return str_values; + } + else if constexpr ((std::is_same_v) || + (std::is_same_v) || + (std::is_same_v)) { + + return str_values += std::to_string(metric); + } + } + else { + return str_values += ("\n\t\tStatus: [" + std::to_string(status_code) + "] " + "-> " + amd::smi::getRSMIStatusString(status_code)); + } +}; + + void TestGpuMetricsRead::Run(void) { rsmi_status_t err; @@ -115,74 +148,89 @@ void TestGpuMetricsRead::Run(void) { } else { CHK_ERR_ASRT(err); IF_VERB(STANDARD) { - std::cout << std::dec << "system_clock_counter=" - << smu.system_clock_counter << '\n'; - std::cout << std::dec << "temperature_edge=" - << smu.temperature_edge << '\n'; - std::cout << std::dec << "temperature_hotspot=" - << smu.temperature_hotspot << '\n'; - std::cout << std::dec << "temperature_mem=" - << smu.temperature_mem << '\n'; - std::cout << std::dec << "temperature_vrgfx=" - << smu.temperature_vrgfx << '\n'; - std::cout << std::dec << "temperature_vrsoc=" - << smu.temperature_vrsoc << '\n'; - std::cout << std::dec << "temperature_vrmem=" - << smu.temperature_vrmem << '\n'; - std::cout << std::dec << "average_gfx_activity=" - << smu.average_gfx_activity << '\n'; - std::cout << std::dec << "average_umc_activity=" - << smu.average_umc_activity << '\n'; - std::cout << std::dec << "average_mm_activity=" - << smu.average_mm_activity << '\n'; - std::cout << std::dec << "average_socket_power=" - << smu.average_socket_power << '\n'; - std::cout << std::dec << "energy_accumulator=" - << smu.energy_accumulator << '\n'; - std::cout << std::dec << "average_gfxclk_frequency=" - << smu.average_gfxclk_frequency << '\n'; - std::cout << std::dec << "average_gfxclk_frequency=" - << smu.average_gfxclk_frequency << '\n'; - std::cout << std::dec << "average_uclk_frequency=" - << smu.average_uclk_frequency << '\n'; - std::cout << std::dec << 
"average_vclk0_frequency=" - << smu.average_vclk0_frequency << '\n'; - std::cout << std::dec << "average_dclk0_frequency=" - << smu.average_dclk0_frequency << '\n'; - std::cout << std::dec << "average_vclk1_frequency=" - << smu.average_vclk1_frequency << '\n'; - std::cout << std::dec << "average_dclk1_frequency=" - << smu.average_dclk1_frequency << '\n'; - std::cout << std::dec << "current_gfxclk=" - << smu.current_gfxclk << '\n'; - std::cout << std::dec << "current_socclk=" - << smu.current_socclk << '\n'; - std::cout << std::dec << "current_uclk=" - << smu.current_uclk << '\n'; - std::cout << std::dec << "current_vclk0=" - << smu.current_vclk0 << '\n'; - std::cout << std::dec << "current_dclk0=" - << smu.current_dclk0 << '\n'; - std::cout << std::dec << "current_vclk1=" - << smu.current_vclk1 << '\n'; - std::cout << std::dec << "current_dclk1=" - << smu.current_dclk1 << '\n'; - std::cout << std::dec << "throttle_status=" - << smu.throttle_status << '\n'; - std::cout << std::dec << "current_fan_speed=" - << smu.current_fan_speed << '\n'; - std::cout << "pcie_link_width=" - << std::to_string(smu.pcie_link_width) << '\n'; - std::cout << "pcie_link_width=" - << std::to_string(smu.pcie_link_speed) << '\n'; - std::cout << "gfx_activity_acc=" - << std::dec << smu.gfx_activity_acc << '\n'; - std::cout << "mem_activity_acc=" - << std::dec << smu.mem_activity_acc << '\n'; + std::cout << std::dec << "\tsystem_clock_counter=" << smu.system_clock_counter << '\n'; + std::cout << std::dec << "\ttemperature_edge=" << smu.temperature_edge << '\n'; + std::cout << std::dec << "\ttemperature_hotspot=" << smu.temperature_hotspot << '\n'; + std::cout << std::dec << "\ttemperature_mem=" << smu.temperature_mem << '\n'; + std::cout << std::dec << "\ttemperature_vrgfx=" << smu.temperature_vrgfx << '\n'; + std::cout << std::dec << "\ttemperature_vrsoc=" << smu.temperature_vrsoc << '\n'; + std::cout << std::dec << "\ttemperature_vrmem=" << smu.temperature_vrmem << '\n'; + std::cout << 
std::dec << "\taverage_gfx_activity=" << smu.average_gfx_activity << '\n'; + std::cout << std::dec << "\taverage_umc_activity=" << smu.average_umc_activity << '\n'; + std::cout << std::dec << "\taverage_mm_activity=" << smu.average_mm_activity << '\n'; + std::cout << std::dec << "\taverage_socket_power=" << smu.average_socket_power << '\n'; + std::cout << std::dec << "\tenergy_accumulator=" << smu.energy_accumulator << '\n'; + std::cout << std::dec << "\taverage_gfxclk_frequency=" << smu.average_gfxclk_frequency << '\n'; + std::cout << std::dec << "\taverage_uclk_frequency=" << smu.average_uclk_frequency << '\n'; + std::cout << std::dec << "\taverage_vclk0_frequency=" << smu.average_vclk0_frequency << '\n'; + std::cout << std::dec << "\taverage_dclk0_frequency=" << smu.average_dclk0_frequency << '\n'; + std::cout << std::dec << "\taverage_vclk1_frequency=" << smu.average_vclk1_frequency << '\n'; + std::cout << std::dec << "\taverage_dclk1_frequency=" << smu.average_dclk1_frequency << '\n'; + std::cout << std::dec << "\tcurrent_gfxclk=" << smu.current_gfxclk << '\n'; + std::cout << std::dec << "\tcurrent_socclk=" << smu.current_socclk << '\n'; + std::cout << std::dec << "\tcurrent_uclk=" << smu.current_uclk << '\n'; + std::cout << std::dec << "\tcurrent_vclk0=" << smu.current_vclk0 << '\n'; + std::cout << std::dec << "\tcurrent_dclk0=" << smu.current_dclk0 << '\n'; + std::cout << std::dec << "\tcurrent_vclk1=" << smu.current_vclk1 << '\n'; + std::cout << std::dec << "\tcurrent_dclk1=" << smu.current_dclk1 << '\n'; + std::cout << std::dec << "\tthrottle_status=" << smu.throttle_status << '\n'; + std::cout << std::dec << "\tcurrent_fan_speed=" << smu.current_fan_speed << '\n'; + std::cout << std::dec << "\tpcie_link_width=" << smu.pcie_link_width << '\n'; + std::cout << std::dec << "\tpcie_link_speed=" << smu.pcie_link_speed << '\n'; + std::cout << std::dec << "\tgfx_activity_acc=" << std::dec << smu.gfx_activity_acc << '\n'; + std::cout << std::dec << 
"\tmem_activity_acc=" << std::dec << smu.mem_activity_acc << '\n'; for (int i = 0; i < RSMI_NUM_HBM_INSTANCES; ++i) { - std::cout << "temperature_hbm[" << i << "]=" << std::dec << - smu.temperature_hbm[i] << '\n'; + std::cout << "\ttemperature_hbm[" << i << "]=" << std::dec << smu.temperature_hbm[i] << '\n'; + } + std::cout << "\n"; + std::cout << "\tfirmware_timestamp=" << std::dec << smu.firmware_timestamp << '\n'; + std::cout << "\tvoltage_soc=" << std::dec << smu.voltage_soc << '\n'; + std::cout << "\tvoltage_gfx=" << std::dec << smu.voltage_gfx << '\n'; + std::cout << "\tvoltage_mem=" << std::dec << smu.voltage_mem << '\n'; + std::cout << "\tindep_throttle_status=" << std::dec << smu.indep_throttle_status << '\n'; + std::cout << "\tcurrent_socket_power=" << std::dec << smu.current_socket_power << '\n'; + + for (int i = 0; i < RSMI_MAX_NUM_VCN; ++i) { + std::cout << "\tvcn_activity[" << i << "]=" << std::dec << smu.vcn_activity[i] << '\n'; + } + + std::cout << "\n"; + std::cout << "\tgfxclk_lock_status=" << std::dec << smu.gfxclk_lock_status << '\n'; + std::cout << "\txgmi_link_width=" << std::dec << smu.xgmi_link_width << '\n'; + std::cout << "\txgmi_link_speed=" << std::dec << smu.xgmi_link_speed << '\n'; + std::cout << "\tpcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << '\n'; + std::cout << "\tpcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << '\n'; + std::cout << "\tpcie_l0_to_recov_count_acc=" << std::dec << smu.pcie_l0_to_recov_count_acc << '\n'; + std::cout << "\tpcie_replay_count_acc=" << std::dec << smu.pcie_replay_count_acc << '\n'; + std::cout << "\tpcie_replay_rover_count_acc=" << std::dec << smu.pcie_replay_rover_count_acc << '\n'; + for (int i = 0; i < RSMI_MAX_NUM_XGMI_LINKS; ++i) { + std::cout << "\txgmi_read_data_acc[" << i << "]=" << std::dec << smu.xgmi_read_data_acc[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_XGMI_LINKS; ++i) { + std::cout << "\txgmi_write_data_acc[" << i << "]=" << 
std::dec << smu.xgmi_write_data_acc[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_GFX_CLKS; ++i) { + std::cout << "\tcurrent_gfxclks[" << i << "]=" << std::dec << smu.current_gfxclks[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) { + std::cout << "\tcurrent_socclks[" << i << "]=" << std::dec << smu.current_socclks[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) { + std::cout << "\tcurrent_vclk0s[" << i << "]=" << std::dec << smu.current_vclk0s[i] << '\n'; + } + + std::cout << "\n"; + for (int i = 0; i < RSMI_MAX_NUM_CLKS; ++i) { + std::cout << "\tcurrent_dclk0s[" << i << "]=" << std::dec << smu.current_dclk0s[i] << '\n'; } } } @@ -198,6 +246,8 @@ void TestGpuMetricsRead::Run(void) { auto val_ui32 = uint32_t(0); auto val_ui64 = uint64_t(0); auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + + std::cout << "\n\t**GPU METRICS: Using direct APIs (newer):\n"; for (uint32_t i = 0; i < num_monitor_devs(); ++i) { PrintDeviceHeader(i); @@ -206,363 +256,446 @@ void TestGpuMetricsRead::Run(void) { if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_temp_edge_get", status_code); auto temp_hotspot_value = val_ui16; status_code = rsmi_dev_metrics_temp_hotspot_get(i, &temp_hotspot_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_hotspot_get", status_code); auto temp_mem_value = val_ui16; status_code = rsmi_dev_metrics_temp_mem_get(i, &temp_mem_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_mem_get", status_code); auto temp_vrgfx_value = val_ui16; status_code = rsmi_dev_metrics_temp_vrgfx_get(i, &temp_vrgfx_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { 
CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_temp_vrgfx_get", status_code); auto temp_vrsoc_value = val_ui16; status_code = rsmi_dev_metrics_temp_vrsoc_get(i, &temp_vrsoc_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_vrsoc_get", status_code); auto temp_vrmem_value = val_ui16; status_code = rsmi_dev_metrics_temp_vrmem_get(i, &temp_vrmem_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_temp_vrmem_get", status_code); - GPUMetricTempHbm_t temp_hbm_values; + GPUMetricTempHbm_t temp_hbm_values{}; status_code = rsmi_dev_metrics_temp_hbm_get(i, &temp_hbm_values); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_temp_hbm_get", status_code); auto temp_curr_socket_power_value = val_ui16; status_code = rsmi_dev_metrics_curr_socket_power_get(i, &temp_curr_socket_power_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_socket_power_get", status_code); auto temp_energy_accum_value = val_ui64; status_code = rsmi_dev_metrics_energy_acc_get(i, &temp_energy_accum_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_energy_acc_get", status_code); auto temp_avg_socket_power_value = val_ui16; status_code = rsmi_dev_metrics_avg_socket_power_get(i, &temp_avg_socket_power_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_socket_power_get", status_code); auto temp_avg_gfx_activity_value = val_ui16; status_code = rsmi_dev_metrics_avg_gfx_activity_get(i, &temp_avg_gfx_activity_value); - 
CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_avg_gfx_activity_get", status_code); auto temp_avg_umc_activity_value = val_ui16; status_code = rsmi_dev_metrics_avg_umc_activity_get(i, &temp_avg_umc_activity_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_avg_umc_activity_get", status_code); auto temp_avg_mm_activity_value = val_ui16; status_code = rsmi_dev_metrics_avg_mm_activity_get(i, &temp_avg_mm_activity_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_mm_activity_get", status_code); - GPUMetricVcnActivity_t temp_vcn_values; + GPUMetricVcnActivity_t temp_vcn_values{}; status_code = rsmi_dev_metrics_vcn_activity_get(i, &temp_vcn_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_vcn_activity_get", status_code); auto temp_mem_activity_accum_value = val_ui32; status_code = rsmi_dev_metrics_mem_activity_acc_get(i, &temp_mem_activity_accum_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_mem_activity_acc_get", status_code); auto temp_gfx_activity_accum_value = val_ui32; status_code = rsmi_dev_metrics_gfx_activity_acc_get(i, &temp_gfx_activity_accum_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_gfx_activity_acc_get", status_code); auto temp_avg_gfx_clock_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_gfx_clock_frequency_get(i, &temp_avg_gfx_clock_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + 
MetricResults.emplace("rsmi_dev_metrics_avg_gfx_clock_frequency_get", status_code); auto temp_avg_soc_clock_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_soc_clock_frequency_get(i, &temp_avg_soc_clock_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_soc_clock_frequency_get", status_code); auto temp_avg_uclock_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_uclock_frequency_get(i, &temp_avg_uclock_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_uclock_frequency_get", status_code); auto temp_avg_vclock0_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_vclock0_frequency_get(i, &temp_avg_vclock0_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_vclock0_frequency_get", status_code); auto temp_avg_dclock0_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_dclock0_frequency_get(i, &temp_avg_dclock0_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_dclock0_frequency_get", status_code); auto temp_avg_vclock1_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_vclock1_frequency_get(i, &temp_avg_vclock1_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_vclock1_frequency_get", status_code); auto temp_avg_dclock1_freq_value = val_ui16; status_code = rsmi_dev_metrics_avg_dclock1_frequency_get(i, &temp_avg_dclock1_freq_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_avg_dclock1_frequency_get", status_code); auto temp_curr_vclk1_value = val_ui16; status_code = rsmi_dev_metrics_curr_vclk1_get(i, &temp_curr_vclk1_value); if (status_code 
!= RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_vclk1_get", status_code); auto temp_curr_dclk1_value = val_ui16; status_code = rsmi_dev_metrics_curr_dclk1_get(i, &temp_curr_dclk1_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_dclk1_get", status_code); auto temp_curr_uclk_value = val_ui16; status_code = rsmi_dev_metrics_curr_uclk_get(i, &temp_curr_uclk_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_curr_uclk_get", status_code); - GPUMetricCurrDClk0_t temp_curr_dclk0_values; + GPUMetricCurrDClk0_t temp_curr_dclk0_values{}; status_code = rsmi_dev_metrics_curr_dclk0_get(i, &temp_curr_dclk0_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_dclk0_get", status_code); - GPUMetricCurrGfxClk_t temp_curr_gfxclk_values; + GPUMetricCurrGfxClk_t temp_curr_gfxclk_values{}; status_code = rsmi_dev_metrics_curr_gfxclk_get(i, &temp_curr_gfxclk_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_gfxclk_get", status_code); - GPUMetricCurrSocClk_t temp_curr_socclk_values; + GPUMetricCurrSocClk_t temp_curr_socclk_values{}; status_code = rsmi_dev_metrics_curr_socclk_get(i, &temp_curr_socclk_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_socclk_get", status_code); - GPUMetricCurrVClk0_t temp_curr_vclk0_values; + GPUMetricCurrVClk0_t temp_curr_vclk0_values{}; status_code = rsmi_dev_metrics_curr_vclk0_get(i, &temp_curr_vclk0_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_vclk0_get", status_code); auto 
temp_indep_throttle_status_value = val_ui64; status_code = rsmi_dev_metrics_indep_throttle_status_get(i, &temp_indep_throttle_status_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_indep_throttle_status_get", status_code); auto temp_throttle_status_value = val_ui32; status_code = rsmi_dev_metrics_throttle_status_get(i, &temp_throttle_status_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_throttle_status_get", status_code); auto temp_gfxclk_lock_status_value = val_ui32; status_code = rsmi_dev_metrics_gfxclk_lock_status_get(i, &temp_gfxclk_lock_status_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_gfxclk_lock_status_get", status_code); + auto temp_curr_fan_speed_value = val_ui16; status_code = rsmi_dev_metrics_curr_fan_speed_get(i, &temp_curr_fan_speed_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_curr_fan_speed_get", status_code); auto temp_pcie_link_width_value = val_ui16; status_code = rsmi_dev_metrics_pcie_link_width_get(i, &temp_pcie_link_width_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_pcie_link_width_get", status_code); auto temp_pcie_link_speed_value = val_ui16; status_code = rsmi_dev_metrics_pcie_link_speed_get(i, &temp_pcie_link_speed_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_pcie_link_speed_get", status_code); auto temp_pcie_bandwidth_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_bandwidth_acc_get(i, &temp_pcie_bandwidth_accum_value); if (status_code != 
RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_bandwidth_acc_get", status_code); auto temp_pcie_bandwidth_inst_value = val_ui64; status_code = rsmi_dev_metrics_pcie_bandwidth_inst_get(i, &temp_pcie_bandwidth_inst_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_bandwidth_inst_get", status_code); auto temp_pcie_l0_recov_count_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_l0_recov_count_acc_get(i, &temp_pcie_l0_recov_count_accum_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_l0_recov_count_acc_get", status_code); auto temp_pcie_replay_count_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_replay_count_acc_get(i, &temp_pcie_replay_count_accum_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_replay_count_acc_get", status_code); auto temp_pcie_replay_rover_count_accum_value = val_ui64; status_code = rsmi_dev_metrics_pcie_replay_rover_count_acc_get(i, &temp_pcie_replay_rover_count_accum_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_pcie_replay_rover_count_acc_get", status_code); auto temp_xgmi_link_width_value = val_ui16; status_code = rsmi_dev_metrics_xgmi_link_width_get(i, &temp_xgmi_link_width_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_link_width_get", status_code); auto temp_xgmi_link_speed_value = val_ui16; status_code = rsmi_dev_metrics_xgmi_link_speed_get(i, &temp_xgmi_link_speed_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_link_speed_get", status_code); - GPUMetricXgmiReadDataAcc_t 
temp_xgmi_read_values; + GPUMetricXgmiReadDataAcc_t temp_xgmi_read_values{}; status_code = rsmi_dev_metrics_xgmi_read_data_get(i, &temp_xgmi_read_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_read_data_get", status_code); - GPUMetricXgmiWriteDataAcc_t temp_xgmi_write_values; + GPUMetricXgmiWriteDataAcc_t temp_xgmi_write_values{}; status_code = rsmi_dev_metrics_xgmi_write_data_get(i, &temp_xgmi_write_values); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xgmi_write_data_get", status_code); auto temp_voltage_soc_value = val_ui16; status_code = rsmi_dev_metrics_volt_soc_get(i, &temp_voltage_soc_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_volt_soc_get", status_code); auto temp_voltage_gfx_value = val_ui16; status_code = rsmi_dev_metrics_volt_gfx_get(i, &temp_voltage_gfx_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_volt_gfx_get", status_code); auto temp_voltage_mem_value = val_ui16; status_code = rsmi_dev_metrics_volt_mem_get(i, &temp_voltage_mem_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_volt_mem_get", status_code); auto temp_system_clock_counter_value = val_ui64; status_code = rsmi_dev_metrics_system_clock_counter_get(i, &temp_system_clock_counter_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_system_clock_counter_get", status_code); auto temp_firmware_timestamp_value = val_ui64; status_code = rsmi_dev_metrics_firmware_timestamp_get(i, &temp_firmware_timestamp_value); - CHK_ERR_ASRT(status_code); + if (status_code != RSMI_STATUS_NOT_SUPPORTED) { + 
CHK_ERR_ASRT(status_code); + } + MetricResults.emplace("rsmi_dev_metrics_firmware_timestamp_get", status_code); auto temp_xcd_counter_value = val_ui16; status_code = rsmi_dev_metrics_xcd_counter_get(i, &temp_xcd_counter_value); if (status_code != RSMI_STATUS_NOT_SUPPORTED) { CHK_ERR_ASRT(status_code); } + MetricResults.emplace("rsmi_dev_metrics_xcd_counter_get", status_code); IF_VERB(STANDARD) { std::cout << "\n"; std::cout << "\t[Temperature]" << "\n"; - std::cout << "\t -> temp_edge(): " << temp_edge_value << "\n"; - std::cout << "\t -> temp_hotspot(): " << temp_hotspot_value << "\n"; - std::cout << "\t -> temp_mem(): " << temp_mem_value << "\n"; - std::cout << "\t -> temp_vrgfx(): " << temp_vrgfx_value << "\n"; - std::cout << "\t -> temp_vrsoc(): " << temp_vrsoc_value << "\n"; - std::cout << "\t -> temp_vrmem(): " << temp_vrmem_value << "\n"; - std::cout << "\t -> temp_hbm(): " << temp_hbm_values << "\n"; + std::cout << print_error_or_value("\t -> temp_edge(): ", "rsmi_dev_metrics_temp_edge_get", temp_edge_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_hotspot(): ", "rsmi_dev_metrics_temp_hotspot_get", temp_hotspot_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_mem(): ", "rsmi_dev_metrics_temp_mem_get", temp_mem_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_vrgfx(): ", "rsmi_dev_metrics_temp_vrgfx_get", temp_vrgfx_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_vrsoc(): ", "rsmi_dev_metrics_temp_vrsoc_get", temp_vrsoc_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_vrmem(): ", "rsmi_dev_metrics_temp_vrmem_get", temp_vrmem_value) << "\n"; + std::cout << print_error_or_value("\t -> temp_hbm[]: ", "rsmi_dev_metrics_temp_hbm_get", temp_hbm_values) << "\n"; std::cout << "\n"; std::cout << "\t[Power/Energy]" << "\n"; - std::cout << "\t -> current_socket_power(): " << temp_curr_socket_power_value << "\n"; - std::cout << "\t -> energy_accum(): " << temp_energy_accum_value << "\n"; - 
std::cout << "\t -> average_socket_power(): " << temp_avg_socket_power_value << "\n"; + std::cout << print_error_or_value("\t -> current_socket_power(): ", "rsmi_dev_metrics_curr_socket_power_get", temp_curr_socket_power_value) << "\n"; + std::cout << print_error_or_value("\t -> energy_accum(): ", "rsmi_dev_metrics_energy_acc_get", temp_energy_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> average_socket_power(): ", "rsmi_dev_metrics_avg_socket_power_get", temp_avg_socket_power_value) << "\n"; std::cout << "\n"; std::cout << "\t[Utilization]" << "\n"; - std::cout << "\t -> average_gfx_activity(): " << temp_avg_gfx_activity_value << "\n"; - std::cout << "\t -> average_umc_activity(): " << temp_avg_umc_activity_value << "\n"; - std::cout << "\t -> average_mm_activity(): " << temp_avg_mm_activity_value << "\n"; - std::cout << "\t -> vcn_activity(): " << temp_vcn_values << "\n"; - std::cout << "\t -> mem_activity_accum(): " << temp_mem_activity_accum_value << "\n"; - std::cout << "\t -> gfx_activity_accum(): " << temp_gfx_activity_accum_value << "\n"; + std::cout << print_error_or_value("\t -> average_gfx_activity(): ", "rsmi_dev_metrics_avg_gfx_activity_get", temp_avg_gfx_activity_value) << "\n"; + std::cout << print_error_or_value("\t -> average_umc_activity(): ", "rsmi_dev_metrics_avg_umc_activity_get", temp_avg_umc_activity_value) << "\n"; + std::cout << print_error_or_value("\t -> average_mm_activity(): ", "rsmi_dev_metrics_avg_mm_activity_get", temp_avg_mm_activity_value) << "\n"; + std::cout << print_error_or_value("\t -> vcn_activity[]: ", "rsmi_dev_metrics_vcn_activity_get", temp_vcn_values) << "\n"; + std::cout << "\n"; + std::cout << print_error_or_value("\t -> mem_activity_accum(): ", "rsmi_dev_metrics_mem_activity_acc_get", temp_mem_activity_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> gfx_activity_accum(): ", "rsmi_dev_metrics_gfx_activity_acc_get", temp_gfx_activity_accum_value) << "\n"; std::cout << "\n"; std::cout 
<< "\t[Average Clock]" << "\n"; - std::cout << "\t -> average_gfx_clock_frequency(): " << temp_avg_gfx_clock_freq_value << "\n"; - std::cout << "\t -> average_soc_clock_frequency(): " << temp_avg_soc_clock_freq_value << "\n"; - std::cout << "\t -> average_uclock_frequency(): " << temp_avg_uclock_freq_value << "\n"; - std::cout << "\t -> average_vclock0_frequency(): " << temp_avg_vclock0_freq_value << "\n"; - std::cout << "\t -> average_dclock0_frequency(): " << temp_avg_dclock0_freq_value << "\n"; - std::cout << "\t -> average_vclock1_frequency(): " << temp_avg_vclock1_freq_value << "\n"; - std::cout << "\t -> average_dclock1_frequency(): " << temp_avg_dclock1_freq_value << "\n"; + std::cout << print_error_or_value("\t -> average_gfx_clock_frequency(): ", "rsmi_dev_metrics_avg_gfx_clock_frequency_get", temp_avg_gfx_clock_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_soc_clock_frequency(): ", "rsmi_dev_metrics_avg_soc_clock_frequency_get", temp_avg_soc_clock_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_uclock_frequency(): ", "rsmi_dev_metrics_avg_uclock_frequency_get", temp_avg_uclock_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_vclock0_frequency(): ", "rsmi_dev_metrics_avg_vclock0_frequency_get", temp_avg_vclock0_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_dclock0_frequency(): ", "rsmi_dev_metrics_avg_dclock0_frequency_get", temp_avg_dclock0_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_vclock1_frequency(): ", "rsmi_dev_metrics_avg_vclock1_frequency_get", temp_avg_vclock1_freq_value) << "\n"; + std::cout << print_error_or_value("\t -> average_dclock1_frequency(): ", "rsmi_dev_metrics_avg_dclock1_frequency_get", temp_avg_dclock1_freq_value) << "\n"; std::cout << "\n"; std::cout << "\t[Current Clock]" << "\n"; - std::cout << "\t -> current_vclock1(): " << temp_curr_vclk1_value << "\n"; - std::cout << "\t -> current_dclock1(): " << 
temp_curr_dclk1_value << "\n"; - std::cout << "\t -> current_uclock(): " << temp_curr_uclk_value << "\n"; - std::cout << "\t -> current_dclk0(): " << temp_curr_dclk0_values << "\n"; - std::cout << "\t -> current_gfxclk(): " << temp_curr_gfxclk_values << "\n"; - std::cout << "\t -> current_soc_clock(): " << temp_curr_socclk_values << "\n"; - std::cout << "\t -> current_vclk0(): " << temp_curr_vclk0_values << "\n"; + std::cout << print_error_or_value("\t -> current_vclock1(): ", "rsmi_dev_metrics_curr_vclk1_get", temp_curr_vclk1_value) << "\n"; + std::cout << print_error_or_value("\t -> current_dclock1(): ", "rsmi_dev_metrics_curr_dclk1_get", temp_curr_dclk1_value) << "\n"; + std::cout << print_error_or_value("\t -> current_uclock(): ", "rsmi_dev_metrics_curr_uclk_get", temp_curr_uclk_value) << "\n"; + std::cout << print_error_or_value("\t -> current_dclk0[]: ", "rsmi_dev_metrics_curr_dclk0_get", temp_curr_dclk0_values) << "\n"; + std::cout << print_error_or_value("\t -> current_gfxclk[]: ", "rsmi_dev_metrics_curr_gfxclk_get", temp_curr_gfxclk_values) << "\n"; + std::cout << print_error_or_value("\t -> current_soc_clock[]: ", "rsmi_dev_metrics_curr_socclk_get", temp_curr_socclk_values) << "\n"; + std::cout << print_error_or_value("\t -> current_vclk0[]: ", "rsmi_dev_metrics_curr_vclk0_get", temp_curr_vclk0_values) << "\n"; std::cout << "\n"; std::cout << "\t[Throttle]" << "\n"; - std::cout << "\t -> indep_throttle_status(): " << temp_indep_throttle_status_value << "\n"; - std::cout << "\t -> throttle_status(): " << temp_throttle_status_value << "\n"; + std::cout << print_error_or_value("\t -> indep_throttle_status(): ", "rsmi_dev_metrics_indep_throttle_status_get", temp_indep_throttle_status_value) << "\n"; + std::cout << print_error_or_value("\t -> throttle_status(): ", "rsmi_dev_metrics_throttle_status_get", temp_throttle_status_value) << "\n"; std::cout << "\n"; std::cout << "\t[Gfx Clock Lock]" << "\n"; - std::cout << "\t -> gfxclk_lock_status(): " << 
temp_gfxclk_lock_status_value << "\n"; + std::cout << print_error_or_value("\t -> gfxclk_lock_status(): ", "rsmi_dev_metrics_gfxclk_lock_status_get", temp_gfxclk_lock_status_value) << "\n"; std::cout << "\n"; std::cout << "\t[Current Fan Speed]" << "\n"; - std::cout << "\t -> current_fan_speed(): " << temp_curr_fan_speed_value << "\n"; + std::cout << print_error_or_value("\t -> current_fan_speed(): ", "rsmi_dev_metrics_curr_fan_speed_get", temp_curr_fan_speed_value) << "\n"; std::cout << "\n"; std::cout << "\t[Link/Bandwidth/Speed]" << "\n"; - std::cout << "\t -> pcie_link_width(): " << temp_pcie_link_width_value << "\n"; - std::cout << "\t -> pcie_link_speed(): " << temp_pcie_link_speed_value << "\n"; - std::cout << "\t -> pcie_bandwidth_accum(): " << temp_pcie_bandwidth_accum_value << "\n"; - std::cout << "\t -> pcie_bandwidth_inst(): " << temp_pcie_bandwidth_inst_value << "\n"; - std::cout << "\t -> pcie_l0_recov_count_accum(): " << temp_pcie_l0_recov_count_accum_value << "\n"; - std::cout << "\t -> pcie_replay_count_accum(): " << temp_pcie_replay_count_accum_value << "\n"; - std::cout << "\t -> pcie_replay_rollover_count_accum(): " << temp_pcie_replay_rover_count_accum_value << "\n"; - std::cout << "\t -> xgmi_link_width(): " << temp_xgmi_link_width_value << "\n"; - std::cout << "\t -> xgmi_link_speed(): " << temp_xgmi_link_speed_value << "\n"; - std::cout << "\t -> xgmi_read_data(): " << temp_xgmi_read_values << "\n"; - std::cout << "\t -> xgmi_write_data(): " << temp_xgmi_write_values << "\n"; + std::cout << print_error_or_value("\t -> pcie_link_width(): ", "rsmi_dev_metrics_pcie_link_width_get", temp_pcie_link_width_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_link_speed(): ", "rsmi_dev_metrics_pcie_link_speed_get", temp_pcie_link_speed_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_bandwidth_accum(): ", "rsmi_dev_metrics_pcie_bandwidth_acc_get", temp_pcie_bandwidth_accum_value) << "\n"; + std::cout << 
print_error_or_value("\t -> pcie_bandwidth_inst(): ", "rsmi_dev_metrics_pcie_bandwidth_inst_get", temp_pcie_bandwidth_inst_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_l0_recov_count_accum(): ", "rsmi_dev_metrics_pcie_l0_recov_count_acc_get", temp_pcie_l0_recov_count_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_replay_count_accum(): ", "rsmi_dev_metrics_pcie_replay_count_acc_get", temp_pcie_replay_count_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> pcie_replay_rollover_count_accum(): ", "rsmi_dev_metrics_pcie_replay_rover_count_acc_get", temp_pcie_replay_rover_count_accum_value) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_link_width(): ", "rsmi_dev_metrics_xgmi_link_width_get", temp_xgmi_link_width_value) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_link_speed(): ", "rsmi_dev_metrics_xgmi_link_speed_get", temp_xgmi_link_speed_value) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_read_data[]: ", "rsmi_dev_metrics_xgmi_read_data_get", temp_xgmi_read_values) << "\n"; + std::cout << print_error_or_value("\t -> xgmi_write_data[]: ", "rsmi_dev_metrics_xgmi_write_data_get", temp_xgmi_write_values) << "\n"; std::cout << "\n"; std::cout << "\t[Voltage]" << "\n"; - std::cout << "\t -> voltage_soc(): " << temp_voltage_soc_value << "\n"; - std::cout << "\t -> voltage_gfx(): " << temp_voltage_gfx_value << "\n"; - std::cout << "\t -> voltage_mem(): " << temp_voltage_mem_value << "\n"; + std::cout << print_error_or_value("\t -> voltage_soc(): ", "rsmi_dev_metrics_volt_soc_get", temp_voltage_soc_value) << "\n"; + std::cout << print_error_or_value("\t -> voltage_gfx(): ", "rsmi_dev_metrics_volt_gfx_get", temp_voltage_gfx_value) << "\n"; + std::cout << print_error_or_value("\t -> voltage_mem(): ", "rsmi_dev_metrics_volt_mem_get", temp_voltage_mem_value) << "\n"; std::cout << "\n"; std::cout << "\t[Timestamp]" << "\n"; - std::cout << "\t -> system_clock_counter(): " << 
temp_system_clock_counter_value << "\n"; - std::cout << "\t -> firmware_timestamp(): " << temp_firmware_timestamp_value << "\n"; + std::cout << print_error_or_value("\t -> system_clock_counter(): ", "rsmi_dev_metrics_system_clock_counter_get", temp_system_clock_counter_value) << "\n"; + std::cout << print_error_or_value("\t -> firmware_timestamp(): ", "rsmi_dev_metrics_firmware_timestamp_get", temp_firmware_timestamp_value) << "\n"; std::cout << "\n"; std::cout << "\t[XCD CounterVoltage]" << "\n"; - std::cout << "\t -> xcd_counter(): " << temp_xcd_counter_value << "\n"; + std::cout << print_error_or_value("\t -> xcd_counter(): ", "rsmi_dev_metrics_xcd_counter_get", temp_xcd_counter_value) << "\n"; std::cout << "\n\n"; } } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc index 0acce3a150..6a205faf56 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc @@ -122,6 +122,7 @@ void TestBase::PrintDeviceHeader(uint32_t dv_ind) { uint16_t val_ui16; IF_VERB(STANDARD) { + std::cout << "\n"; std::cout << "\t**Device index: " << dv_ind << std::endl; } err = rsmi_dev_id_get(dv_ind, &val_ui16);