diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..0bca7eb76f --- /dev/null +++ b/.clang-format @@ -0,0 +1,4 @@ +--- +Language: Cpp +BasedOnStyle: Google +ColumnLimit: 100 diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000000..11402fd8da --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,25 @@ +# THIS FILE IS GENERATED FROM .clangd! +# Run .update-clang-tidy.sh to regenerate. +Checks: + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + -abseil*, + -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, + -clang-analyzer-security.insecureAPI.strcpy, + -cppcoreguidelines*, + -cppcoreguidelines-pro*, + -misc-non-copyable-objects, + -misc-use-anonymous-namespace, + -modernize-avoid-c-arrays, + -modernize-redundant-void-arg, + -modernize-use-auto, + -modernize-use-nodiscard, + -modernize-use-noexcept, + -modernize-use-trailing-return-type, + -modernize-use-using, + -performance*, + -readability*, diff --git a/.clangd b/.clangd new file mode 100644 index 0000000000..74ae437a6f --- /dev/null +++ b/.clangd @@ -0,0 +1,37 @@ +CompileFlags: + Remove: -W* + Add: [-Wall, -pedantic, -I/opt/rocm/include, -I/opt/rocm/include/hsa, -I/opt/rocm/include/rocprofiler] + Compiler: clang++ + +# list here: https://clang.llvm.org/extra/clang-tidy/checks/list.html +Diagnostics: + UnusedIncludes: Strict + # rules below are copied into .clang-tidy using ./.update-clang-tidy.sh + # please keep the rules sorted alphabetically + ClangTidy: + Add: [ + bugprone*, + clang-analyzer*, + google*, + misc*, + modernize*, + ] + Remove: [ + abseil*, + bugprone-easily-swappable-parameters, + bugprone-reserved-identifier, + clang-analyzer-security.insecureAPI.strcpy, + cppcoreguidelines*, + cppcoreguidelines-pro*, + misc-non-copyable-objects, + misc-use-anonymous-namespace, + modernize-avoid-c-arrays, + modernize-redundant-void-arg, + modernize-use-auto, + modernize-use-nodiscard, + modernize-use-noexcept, + 
modernize-use-trailing-return-type, + modernize-use-using, + performance*, + readability*, + ] diff --git a/.editorconfig b/.editorconfig index bb852a105e..64cc5d6985 100644 --- a/.editorconfig +++ b/.editorconfig @@ -13,3 +13,4 @@ indent_style = space charset = utf-8 indent_style = space indent_size = 2 +max_line_length = 100 diff --git a/.gitignore b/.gitignore index 9dae753e46..d126460b84 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,8 @@ device/ # misc esmi_ib_library/ + +# do NOT ignore these files +!.clang-format +!.clang-tidy +!.clangd diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..d84939508a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +# - How to use: +# python3 -m pip install pre-commit +# pre-commit install --install-hooks +# Upon a new commit - the hooks should automagically run +# +# - How to skip: +# git commit --no-verify +# or +# SKIP=clang-format-docker git commit +# SKIP=cpplint-docker git commit + +fail_fast: false +repos: + # For portability I decided to use Docker containers + - repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint + rev: 0.0.3 + hooks: + - id: clang-format-docker + - id: cpplint-docker + # Below is a local way of running formatters and linters + # NOTE: clang-tidy is not used in the above tests + # - repo: https://github.com/pocc/pre-commit-hooks + # rev: v1.3.5 + # hooks: + # - id: clang-format + # args: [--no-diff, -i] + # - id: clang-tidy + # args: [-p=build, --quiet] + # - id: cpplint + # args: [--verbose=5] diff --git a/.update-clang-tidy.sh b/.update-clang-tidy.sh new file mode 100755 index 0000000000..9607b35714 --- /dev/null +++ b/.update-clang-tidy.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +set -x # trace +set -e # exit immediately if command fails +set -u # exit if an undefined variable is found + +awk ' +BEGIN { + print "# THIS FILE IS GENERATED FROM .clangd!" + print "# Run ./.update-clang-tidy.sh to regenerate." 
+ print "Checks:" +} +/Add: \[$/{ +a=1 + next +} +/]/{ + a=0 +} +a{ +gsub(/^\s+/," ") + print +} + +/Remove: \[$/{ +r=1 + next +} +/]/{ + r=0 +} +r{ + gsub(/^\s+/," -") + print +} +' .clangd | tee .clang-tidy diff --git a/CHANGELOG.md b/CHANGELOG.md index 381efe9b17..91d5f8dd9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,21 +1,25 @@ # Change Log for ROCm SMI Library -Full documentation for rocm_smi_lib is available at [https://docs.amd.com/](https://docs.amd.com/category/SMI%20API%20Guides). +Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/). ## rocm_smi_lib for ROCm 5.5.0 ### Optimizations + - Add new test to measure api execution time. - Remove the shared mutex if no process is using it. ### Added + - ROCm SMI CLI: Add --showtempgraph Feature. ### Changed + - Relying on vendor ID to detect AMDGPU. - Change pragma message to warning for backward compatibility. ### Fixed + - Fix --showproductname when device's SKU cannot be parsed out of the VBIOS string. - Fix compile error: ‘memcpy’ was not declared. - Fix order of CE and UE reporting in ROCm SMI CLI. 
diff --git a/CPPLINT.cfg b/CPPLINT.cfg new file mode 100644 index 0000000000..b63692c6df --- /dev/null +++ b/CPPLINT.cfg @@ -0,0 +1,3 @@ +set noparent +linelength=100 +filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 235c78ede6..26ef7b9f72 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -919,16 +919,6 @@ amdsmi_process_handle_t = ctypes.c_uint32 class struct_amdsmi_proc_info_t(Structure): pass -class struct_engine_usage_(Structure): - pass - -struct_engine_usage_._pack_ = 1 # source:False -struct_engine_usage_._fields_ = [ - ('gfx', ctypes.c_uint64), - ('enc', ctypes.c_uint64), - ('reserved', ctypes.c_uint32 * 12), -] - class struct_memory_usage_(Structure): pass @@ -940,6 +930,16 @@ struct_memory_usage_._fields_ = [ ('reserved', ctypes.c_uint32 * 10), ] +class struct_engine_usage_(Structure): + pass + +struct_engine_usage_._pack_ = 1 # source:False +struct_engine_usage_._fields_ = [ + ('gfx', ctypes.c_uint64), + ('enc', ctypes.c_uint64), + ('reserved', ctypes.c_uint32 * 12), +] + struct_amdsmi_proc_info_t._pack_ = 1 # source:False struct_amdsmi_proc_info_t._fields_ = [ ('name', ctypes.c_char * 32), diff --git a/rocm_smi/CMakeLists.txt b/rocm_smi/CMakeLists.txt index 43b723a7d0..6ee12b967f 100755 --- a/rocm_smi/CMakeLists.txt +++ b/rocm_smi/CMakeLists.txt @@ -40,11 +40,12 @@ if(${ROCM_PATCH_VERSION}) set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") else() set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}") -endif() -set(${ROCM_SMI}_VERSION_MAJOR "${VERSION_MAJOR}") -set(${ROCM_SMI}_VERSION_MINOR "${VERSION_MINOR}") -set(${ROCM_SMI}_VERSION_PATCH "0") +endif () +set(${ROCM_SMI}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") +set(${ROCM_SMI}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") +set(${ROCM_SMI}_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}") 
set(${ROCM_SMI}_VERSION_BUILD "0") +set(${ROCM_SMI}_VERSION_HASH "${PKG_VERSION_HASH}") message("SOVERSION: ${SO_VERSION_STRING}") # Create a configure file to get version info from within library diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 1f2983fc72..fa54728b5e 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -745,8 +745,8 @@ auto print_error_or_value(rsmi_status_t status_code, const T& metric) { return str_values; } else if constexpr ((std::is_same_v) || - (std::is_same_v) || - (std::is_same_v)) { + (std::is_same_v) || + (std::is_same_v)) { return std::to_string(metric); } } diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 0236803e1e..8d29293085 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -80,6 +80,7 @@ extern "C" { //! The number of points that make up a voltage-frequency curve definition #define RSMI_NUM_VOLTAGE_CURVE_POINTS 3 + /** * @brief Error codes retured by rocm_smi_lib functions */ @@ -353,7 +354,7 @@ typedef struct { * Clock types */ typedef enum { - RSMI_CLK_TYPE_SYS = 0x0, //!< System clock + RSMI_CLK_TYPE_SYS = 0x0, //!< System clock RSMI_CLK_TYPE_FIRST = RSMI_CLK_TYPE_SYS, RSMI_CLK_TYPE_DF, //!< Data Fabric clock (for ASICs //!< running on a separate clock) @@ -970,6 +971,9 @@ struct metrics_table_header_t { uint8_t content_revision; /// \endcond }; +/// \cond Ignore in docs. +typedef struct metrics_table_header_t metrics_table_header_t; +/// \endcond /** * @brief The following structure holds the gpu metrics values for a device. 
@@ -986,9 +990,14 @@ struct metrics_table_header_t { #define RSMI_NUM_HBM_INSTANCES 4 /** - * @brief This should match kRSMI_MAX_NUM_VCN + * @brief This should match kRSMI_MAX_NUM_VCNS */ -#define RSMI_MAX_NUM_VCN 4 +#define RSMI_MAX_NUM_VCNS 4 + +/** + * @brief This should match kRSMI_MAX_JPEG_ENGINES + */ +#define RSMI_MAX_NUM_JPEG_ENGS 32 /** * @brief This should match kRSMI_MAX_NUM_CLKS @@ -1109,7 +1118,7 @@ typedef struct { uint16_t current_socket_power; // Utilization (%) - uint16_t vcn_activity[RSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode) + uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode) // Clock Lock Status. Each bit corresponds to clock instance uint32_t gfxclk_lock_status; @@ -1143,6 +1152,19 @@ typedef struct { uint16_t current_vclk0s[RSMI_MAX_NUM_CLKS]; uint16_t current_dclk0s[RSMI_MAX_NUM_CLKS]; + /* + * v1.5 additions + */ + // JPEG activity percent (encode/decode) + uint16_t jpeg_activity[RSMI_MAX_NUM_JPEG_ENGS]; + + // PCIE NAK sent accumulated count + uint32_t pcie_nak_sent_count_acc; + + // PCIE NAK received accumulated count + uint32_t pcie_nak_rcvd_count_acc; + + /// \endcond } rsmi_gpu_metrics_t; @@ -1358,7 +1380,7 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision); * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * */ -rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku); +rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku); /** * @brief Get the device vendor id associated with the device with provided @@ -1733,7 +1755,6 @@ rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id); */ rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id); - /** * @brief Get the XGMI physical id associated with the device * @@ -4097,7 +4118,7 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, /** @} */ // end of HWTopo 
/*****************************************************************************/ -/** @defgroup compute_partition Compute Partition Functions +/** @defgroup ComputePartition Compute Partition Functions * These functions are used to configure and query the device's * compute parition setting. * @{ @@ -4182,10 +4203,10 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); -/** @} */ // end of compute_partition +/** @} */ // end of ComputePartition /*****************************************************************************/ -/** @defgroup memory_partition Memory Partition Functions +/** @defgroup memory_partition The Memory Partition Functions * These functions are used to query and set the device's current memory * partition. * @{ @@ -4627,7 +4648,8 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind); * Metric multi-valued counter types */ typedef uint16_t GPUMetricTempHbm_t[RSMI_NUM_HBM_INSTANCES]; -typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCN]; +typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCNS]; +typedef uint16_t GPUMetricJpegActivity_t[RSMI_MAX_NUM_JPEG_ENGS]; typedef uint64_t GPUMetricXgmiReadDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; typedef uint64_t GPUMetricXgmiWriteDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; typedef uint16_t GPUMetricCurrGfxClk_t[RSMI_MAX_NUM_GFX_CLKS]; @@ -5113,7 +5135,7 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu * * @param[inout] vcn_activity_value a pointer to uint16_t to which the device gpu * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCN) + * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCNS) * element array (GPUMetricVcnActivity_t) * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index d3323fc25e..e699eec3d4 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -255,6 +255,7 @@ class Device { rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics); AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics(); + private: std::shared_ptr monitor_; std::shared_ptr power_monitor_; @@ -277,7 +278,6 @@ class Device { bool returnWriteErr = false); rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query); - uint64_t bdfid_; uint64_t kfd_gpu_id_; std::unordered_set; using GpuMetricTypePtr_t = std::shared_ptr; @@ -780,6 +872,40 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t }; +class GpuMetricsBase_v15_t final : public GpuMetricsBase_t +{ + public: + ~GpuMetricsBase_v15_t() = default; + + size_t sizeof_metric_table() override { + return sizeof(AMDGpuMetrics_v15_t); + } + + GpuMetricTypePtr_t get_metrics_table() override + { + if (!m_gpu_metric_ptr) { + m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v15_t*){}); + } + assert(m_gpu_metric_ptr != nullptr); + return m_gpu_metric_ptr; + } + + void dump_internal_metrics_table() override; + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override + { + return AMDGpuMetricVersionFlags_t::kGpuMetricV15; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; + + + private: + AMDGpuMetrics_v15_t m_gpu_metrics_tbl; + std::shared_ptr m_gpu_metric_ptr; + +}; template rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value); diff --git a/rocm_smi/python_smi_tools/rocm_smi.py b/rocm_smi/python_smi_tools/rocm_smi.py index 6c1de2d7a7..9ebf07167b 100755 --- a/rocm_smi/python_smi_tools/rocm_smi.py +++ 
b/rocm_smi/python_smi_tools/rocm_smi.py @@ -29,10 +29,12 @@ from rsmiBindings import * # Major version - Increment when backwards-compatibility breaks # Minor version - Increment when adding a new feature, set to 0 when major is incremented # Patch version - Increment when adding a fix, set to 0 when minor is incremented -SMI_MAJ = 1 -SMI_MIN = 5 +# Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi +SMI_MAJ = 2 +SMI_MIN = 0 SMI_PAT = 0 -__version__ = '%s.%s.%s' % (SMI_MAJ, SMI_MIN, SMI_PAT) +# SMI_HASH is provided by rsmiBindings +__version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH) # Set to 1 if an error occurs RETCODE = 0 @@ -828,23 +830,20 @@ def printTableRow(space, displayString, v_delim=" "): def checkIfSecondaryDie(device): """ Checks if GCD(die) is the secondary die in a MCM. + MI200 device specific feature check. + The secondary dies lack power management features. - Secondary dies lack power management features. - TODO: switch to more robust way to check for primary/secondary die, when implemented in Kernel and rocm_smi_lib. @param device: The device to check """ - power_cap = c_uint64() - # secondary die can currently be determined by checking if all power1_* (power cap) values are equal to zero. 
- ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap)) - if not (rsmi_ret_ok(ret, None, 'get_power_cap', False) and power_cap.value == 0): - return False - ret = rocmsmi.rsmi_dev_power_cap_default_get(device, byref(power_cap)) - if not (rsmi_ret_ok(ret, None, 'get_power_cap_default', False) and power_cap.value == 0): - return False - ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power_cap)) - if not (rsmi_ret_ok(ret, None, 'get_power_avg', False) and power_cap.value == 0): - return False - return True + energy_count = c_uint64() + counter_resoution = c_float() + timestamp = c_uint64() + + # secondary die can be determined by checking if energy counter == 0 + ret = rocmsmi.rsmi_dev_energy_count_get(device, byref(energy_count), byref(counter_resoution), byref(timestamp)) + if (rsmi_ret_ok(ret, None, 'energy_count_secondary_die_check', silent=False)) and (energy_count.value == 0): + return True + return False def resetClocks(deviceList): diff --git a/rocm_smi/python_smi_tools/rsmiBindings.py b/rocm_smi/python_smi_tools/rsmiBindings.py index ee0ec76124..884793468f 100644 --- a/rocm_smi/python_smi_tools/rsmiBindings.py +++ b/rocm_smi/python_smi_tools/rsmiBindings.py @@ -55,6 +55,8 @@ dv_id = c_uint64() # GPU ID gpu_id = c_uint32(0) +SMI_HASH = '@PKG_VERSION_HASH@' + # Policy enums RSMI_MAX_NUM_FREQUENCIES = 33 diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 88fda5dd74..f9845a9dfc 100755 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -594,7 +594,7 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, if (ret != RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", returning get_dev_value_line() response = " - << getRSMIStatusString(ret); + << amd::smi::getRSMIStatusString(ret); LOG_ERROR(ss); return ret; } @@ -613,7 +613,7 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", returning strtoul() response = " - << 
getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(errno)); + << amd::smi::getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(errno)); LOG_TRACE(ss); return amd::smi::ErrnoToRsmiStatus(errno); @@ -667,7 +667,7 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, if (ret != RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", returning rsmi_dev_ecc_enabled_get() response = " - << getRSMIStatusString(ret); + << amd::smi::getRSMIStatusString(ret); LOG_ERROR(ss); return ret; } @@ -728,7 +728,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, default: ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", default case -> reporting " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED); + << amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED); LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } @@ -748,7 +748,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, if (ret != RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS" - << " -> reporting " << getRSMIStatusString(ret); + << " -> reporting " << amd::smi::getRSMIStatusString(ret); LOG_ERROR(ss); return ret; } @@ -767,7 +767,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, fs2 >> ec->correctable_err; ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << getRSMIStatusString(ret);; + << ", reporting " << amd::smi::getRSMIStatusString(ret);; LOG_TRACE(ss); return ret; CATCH @@ -935,7 +935,7 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { ret = get_id(dv_ind, amd::smi::kDevDevID, id); ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << getRSMIStatusString(ret); + << ", reporting " << amd::smi::getRSMIStatusString(ret); LOG_TRACE(ss); return ret; } @@ -950,7 +950,7 @@ rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id) { ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id); ss << 
__PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << getRSMIStatusString(ret); + << ", reporting " << amd::smi::getRSMIStatusString(ret); LOG_TRACE(ss); return ret; } @@ -965,7 +965,7 @@ rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) { ret = get_id(dv_ind, amd::smi::kDevDevRevID, revision); outss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << getRSMIStatusString(ret); + << ", reporting " << amd::smi::getRSMIStatusString(ret); LOG_TRACE(outss); return ret; } @@ -980,7 +980,7 @@ rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) { CHK_SUPPORT_NAME_ONLY(id) ret = get_id(dv_ind, amd::smi::kDevDevProdNum, id); ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << getRSMIStatusString(ret); + << ", reporting " << amd::smi::getRSMIStatusString(ret); LOG_TRACE(ss); return ret; CATCH @@ -4045,6 +4045,7 @@ rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind, if (ret != RSMI_STATUS_SUCCESS) { return ret; } + uint32_t ln = static_cast(val_str.copy(serial_num, len)); serial_num[std::min(len - 1, ln)] = '\0'; @@ -5125,15 +5126,11 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) << " | Cause: device board name does not support this action" << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } - std::string newMemoryPartition - = mapRSMIToStringMemoryPartitionTypes.at(memory_partition); - std::string currentMemoryPartition; - switch (memory_partition) { case RSMI_MEMORY_PARTITION_NPS1: case RSMI_MEMORY_PARTITION_NPS2: @@ -5154,6 +5151,9 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, LOG_ERROR(ss); return RSMI_STATUS_INVALID_ARGS; } + std::string newMemoryPartition + = mapRSMIToStringMemoryPartitionTypes.at(memory_partition); + std::string currentMemoryPartition; // do nothing if memory_partition is the 
current mode rsmi_status_t ret_get = get_memory_partition(dv_ind, currentMemoryPartition); @@ -5196,13 +5196,16 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, if (amd::smi::ErrnoToRsmiStatus(ret) != RSMI_STATUS_SUCCESS) { rsmi_status_t err = amd::smi::ErrnoToRsmiStatus(ret); + if (ret == EACCES) { + err = RSMI_STATUS_NOT_SUPPORTED; // already verified permissions + } ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Fail " << " | Device #: " << dv_ind << " | Type: " << devInfoTypesStrings.at(amd::smi::kDevMemoryPartition) - << " | Cause: issue writing requested setting of " + newMemoryPartition + << " | Cause: issue writing reqested setting of " + newMemoryPartition << " | Returning = " << getRSMIStatusString(err) << " |"; LOG_ERROR(ss); @@ -6027,7 +6030,6 @@ rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_v CATCH } - rsmi_status_t rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) { @@ -6541,7 +6543,6 @@ rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwid CATCH } - rsmi_status_t rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) { @@ -6666,19 +6667,24 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm); amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); - std::copy_n(std::begin(tmp_hbl_tbl), max_num_elems, *temp_hbm_value); - } + const auto max_num_elems = + static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_hbl_tbl.size()) ? 
max_num_elems : tmp_hbl_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_hbl_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_hbl_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(temp_hbm_value, 0, sizeof(*temp_hbm_value)); + std::copy_n(std::begin(tmp_hbl_tbl), copy_size, *temp_hbm_value); + } return status_code; CATCH @@ -6700,19 +6706,24 @@ rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_a const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); - std::copy_n(std::begin(tmp_vcn_tbl), max_num_elems, *vcn_activity_value); - } + const auto max_num_elems = + static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_vcn_tbl.size()) ? 
max_num_elems : tmp_vcn_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_vcn_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_vcn_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(vcn_activity_value, 0, sizeof(*vcn_activity_value)); + std::copy_n(std::begin(tmp_vcn_tbl), copy_size, *vcn_activity_value); + } return status_code; CATCH @@ -6734,19 +6745,24 @@ rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator); amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_read_data_acc_value); - } + const auto max_num_elems = + static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_xgmi_acc_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(xgmi_read_data_acc_value, 0, sizeof(*xgmi_read_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_read_data_acc_value); + } return status_code; CATCH @@ -6768,19 +6784,24 @@ rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_ const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator); amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_write_data_acc_value); - } + const auto max_num_elems = + static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_xgmi_acc_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(xgmi_write_data_acc_value, 0, sizeof(*xgmi_write_data_acc_value)); + std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_write_data_acc_value); + } return status_code; CATCH @@ -6800,26 +6821,28 @@ rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current } const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock); - rsmi_gpu_metrics_t gpu = {}; - auto status = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu); - if (status == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::copy_n(std::begin(gpu.current_gfxclks), - static_cast( - sizeof(gpu.current_gfxclks)/sizeof(gpu.current_gfxclks[0])), - *current_gfxclk_value); - } + amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{}; + auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl); + const auto max_num_elems = + static_cast(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_gfxclk_tbl.size()) ? 
max_num_elems : tmp_curr_gfxclk_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << static_cast( - sizeof(gpu.current_gfxclks)/sizeof(gpu.current_gfxclks[0])) - << " | Returning = " << status << " " - << getRSMIStatusString(status) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_gfxclk_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_gfxclk_value, 0, sizeof(*current_gfxclk_value)); + std::copy_n(std::begin(tmp_curr_gfxclk_tbl), copy_size, *current_gfxclk_value); + } - return status; + return status_code; CATCH } @@ -6839,19 +6862,23 @@ rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock); amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); - std::copy_n(std::begin(tmp_curr_socclk_tbl), max_num_elems, *current_socclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_socclk_tbl.size()) ? 
max_num_elems : tmp_curr_socclk_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_socclk_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_socclk_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_socclk_value, 0, sizeof(*current_socclk_value)); + std::copy_n(std::begin(tmp_curr_socclk_tbl), copy_size, *current_socclk_value); + } return status_code; CATCH @@ -6873,19 +6900,24 @@ rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_v const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0); amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{}; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); - std::copy_n(std::begin(tmp_curr_vclk0_tbl), max_num_elems, *current_vclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_vclk0_tbl.size()) ? 
max_num_elems : tmp_curr_vclk0_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_vclk0_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_vclk0_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_vclk_value, 0, sizeof(*current_vclk_value)); + std::copy_n(std::begin(tmp_curr_vclk0_tbl), copy_size, *current_vclk_value); + } return status_code; CATCH @@ -6934,19 +6966,24 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0); amd::smi::GPUMetricCurrDClkTbl_t tmp_curr_dclk0_tbl; auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - const auto max_num_elems = - static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); - std::copy_n(std::begin(tmp_curr_dclk0_tbl), max_num_elems, *current_dclk_value); - } + const auto max_num_elems = + static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); + const auto copy_size = + static_cast((max_num_elems < tmp_curr_dclk0_tbl.size()) ? 
max_num_elems : tmp_curr_dclk0_tbl.size()); ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Metric Size: " << tmp_curr_dclk0_tbl.size() - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; + << "\n | ======= end ======= " + << "\n | End Result " + << "\n | Device #: " << dv_ind + << "\n | Metric Type: " << static_cast(gpu_metric_unit) + << "\n | Metric Size: " << tmp_curr_dclk0_tbl.size() + << "\n | Max num of elements: " << max_num_elems + << "\n | Copy size: " << copy_size + << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; LOG_INFO(ostrstream); + if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::memset(current_dclk_value, 0, sizeof(*current_dclk_value)); + std::copy_n(std::begin(tmp_curr_dclk0_tbl), copy_size, *current_dclk_value); + } return status_code; CATCH @@ -7277,6 +7314,7 @@ rsmi_dev_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t* header CATCH } + rsmi_status_t rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value) { @@ -7335,6 +7373,7 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind) return status_code; CATCH } + // // End of: new GPU Metrics related work. 
// diff --git a/rocm_smi/src/rocm_smi64Config.in b/rocm_smi/src/rocm_smi64Config.in index bde279ced1..a3b26311d5 100755 --- a/rocm_smi/src/rocm_smi64Config.in +++ b/rocm_smi/src/rocm_smi64Config.in @@ -52,5 +52,6 @@ #define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@ #define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@ #define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@" +#define rocm_smi_VERSION_HASH "@rocm_smi_VERSION_HASH@" -#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_ \ No newline at end of file +#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_ diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 469d7cbffb..6e79c29b3e 100755 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -738,7 +738,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { << " | " << (fs.fail() ? "[ERROR] Failed read - format error" : "[GOOD] No fail - Successful read operation") << " | " << (fs.eof() ? "[ERROR] Failed read - EOF error" : - "[GOOD] No eof error - Successful read operation") + "[GOOD] No eof - Successful read operation") << " | " << (fs.good() ? "[GOOD] read good - Successful read operation" : "[ERROR] Failed read - good error"); LOG_INFO(ss); @@ -800,7 +800,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, << " | " << (fs.fail() ? "[ERROR] Failed write - format error" : "[GOOD] No fail - Successful write operation") << " | " << (fs.eof() ? "[ERROR] Failed write - EOF error" : - "[GOOD] No eof error - Successful write operation") + "[GOOD] No eof - Successful write operation") << " | " << (fs.good() ? 
"[GOOD] Write good - Successful write operation" : "[ERROR] Failed write - good error"); diff --git a/rocm_smi/src/rocm_smi_gpu_metrics.cc b/rocm_smi/src/rocm_smi_gpu_metrics.cc index b92b8d542b..1618e328a9 100755 --- a/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -163,6 +163,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl {join_metrics_version(1, 2), AMDGpuMetricVersionFlags_t::kGpuMetricV12}, {join_metrics_version(1, 3), AMDGpuMetricVersionFlags_t::kGpuMetricV13}, {join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14}, + {join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15}, }; /** @@ -201,7 +202,8 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation {AMDGpuMetricsUnitType_t::kMetricAvgMmActivity, "AvgMmActivity"}, {AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, "GfxActivityAcc"}, {AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, "MemActivityAcc"}, - {AMDGpuMetricsUnitType_t::kMetricVcnActivity, "VcnActivity"}, + {AMDGpuMetricsUnitType_t::kMetricVcnActivity, "VcnActivity"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricJpegActivity, "JpegActivity"}, /* v1.5 */ // kGpuMetricAverageClock counters {AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency, "AvgGfxClockFrequency"}, @@ -213,11 +215,11 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation {AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency, "AvgDClock1Frequency"}, // kGpuMetricCurrentClock counters - {AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, "CurrGfxClock"}, - {AMDGpuMetricsUnitType_t::kMetricCurrSocClock, "CurrSocClock"}, + {AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, "CurrGfxClock"}, /* v1.4: Changed to array */ + {AMDGpuMetricsUnitType_t::kMetricCurrSocClock, "CurrSocClock"}, /* v1.4: Changed to array */ {AMDGpuMetricsUnitType_t::kMetricCurrUClock, "CurrUClock"}, - {AMDGpuMetricsUnitType_t::kMetricCurrVClock0, 
"CurrVClock0"}, - {AMDGpuMetricsUnitType_t::kMetricCurrDClock0, "CurrDClock0"}, + {AMDGpuMetricsUnitType_t::kMetricCurrVClock0, "CurrVClock0"}, /* v1.4: Changed to array */ + {AMDGpuMetricsUnitType_t::kMetricCurrDClock0, "CurrDClock0"}, /* v1.4: Changed to array */ {AMDGpuMetricsUnitType_t::kMetricCurrVClock1, "CurrVClock1"}, {AMDGpuMetricsUnitType_t::kMetricCurrDClock1, "CurrDClock1"}, @@ -226,7 +228,7 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation {AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus, "IndepThrottleStatus"}, // kGpuMetricGfxClkLockStatus counters - {AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, "GfxClkLockStatus"}, + {AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, "GfxClkLockStatus"}, /* v1.4 */ // kGpuMetricCurrentFanSpeed counters {AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed, "CurrFanSpeed"}, @@ -234,19 +236,21 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation // kGpuMetricLinkWidthSpeed counters {AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, "PcieLinkWidth"}, {AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, "PcieLinkSpeed"}, - {AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, "PcieBandwidthAcc"}, - {AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, "PcieBandwidthInst"}, - {AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, "XgmiLinkWidth"}, - {AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"}, - {AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"}, - {AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, "XgmiWriteDataAcc"}, - {AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, "PcieL0RecovCountAcc"}, - {AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, "PcieReplayCountAcc"}, - {AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, "PcieReplayRollOverCountAcc"}, + {AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, "PcieBandwidthAcc"}, /* v1.4 */ + 
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, "PcieBandwidthInst"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, "XgmiLinkWidth"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, "XgmiWriteDataAcc"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, "PcieL0RecovCountAcc"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, "PcieReplayCountAcc"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, "PcieReplayRollOverCountAcc"}, /* v1.4 */ + {AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, "PcieNakSentCountAcc"}, /* v1.5 */ + {AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, "PcieNakRcvdCountAcc"}, /* v1.5 */ // kGpuMetricPowerEnergy counters {AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, "AvgSocketPower"}, - {AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, "CurrSocketPower"}, + {AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, "CurrSocketPower"}, /* v1.4 */ {AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, "EnergyAcc"}, // kGpuMetricVoltage counters @@ -343,6 +347,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table {AMDGpuMetricVersionFlags_t::kGpuMetricV12, std::make_shared(GpuMetricsBase_v12_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV13, std::make_shared(GpuMetricsBase_v13_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared(GpuMetricsBase_v14_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared(GpuMetricsBase_v15_t{})}, }; GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) @@ -462,6 +467,341 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str return multi_values; } +void GpuMetricsBase_v15_t::dump_internal_metrics_table() +{ + 
std::ostringstream ostrstream; + std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; + ostrstream << __PRETTY_FUNCTION__ + << " | ======= DEBUG ======= " + << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) + << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) + << " |" + << "\n"; + ostrstream << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" + << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" + << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" + + << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" + + << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" + << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; + + ostrstream << " vcn_activity: " << "\n"; + auto idx = uint64_t(0); + for (const auto& temp : m_gpu_metrics_tbl.m_vcn_activity) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " jpeg_activity: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_jpeg_activity) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" + << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" + + << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n" + + << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" + << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n" + + << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" + + << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" + << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" + + << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" + << " 
xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" + + << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" + << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" + + << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" + << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" + << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" + << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" + << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" + << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" + << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n"; + + ostrstream << " xgmi_read_data_acc: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " xgmi_write_data_acc: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n"; + + ostrstream << " current_gfxclk: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " current_socclk: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " current_vclk0: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " current_dclk0: " << "\n"; + idx = 0; + for (const auto& temp : 
m_gpu_metrics_tbl.m_current_dclk0) { + ostrstream << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ostrstream << " padding: " << m_gpu_metrics_tbl.m_padding << "\n"; + LOG_DEBUG(ostrstream); +} + +rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. + // + auto run_metric_adjustments_v15 = [&]() { + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ostrstream << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " << stringfy_metric_header_version( + disjoin_metrics_version(gpu_metrics_version)) + << " |"; + LOG_TRACE(ostrstream); + + // firmware_timestamp is at 10ns resolution + ostrstream << __PRETTY_FUNCTION__ + << " | ======= Changes ======= " + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + LOG_DEBUG(ostrstream); + }; + + + // Adjustments/Changes specific to this version + run_metric_adjustments_v15(); + + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem")) + ); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc")) + ); + + // Power/Energy Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_current_socket_power, + "curr_socket_power")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc")) + ); + + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnActivity, + format_metric_row(m_gpu_metrics_tbl.m_vcn_activity, + "[average_vcn_activity]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegActivity, + format_metric_row(m_gpu_metrics_tbl.m_jpeg_activity, + "[average_jpeg_activity]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc")) + ); + + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, + "firmware_timestamp")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter")) + ); + + // Throttle Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricThrottleStatus, + format_metric_row(m_gpu_metrics_tbl.m_throttle_status, + "throttle_status")) + ); + + // GfxLock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, + format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status, + "gfxclk_lock_status")) + ); + + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width, + "xgmi_link_width")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + 
.insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed, + "xgmi_link_speed")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, + "pcie_bandwidth_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, + "pcie_bandwidth_inst")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, + "pcie_l0_recov_count_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc, + "pcie_replay_count_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, + "pcie_replay_rollover_count_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc, + "pcie_nak_sent_count_acc")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc, + "pcie_nak_rcvd_count_acc")) + ); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, + "[xgmi_read_data_acc]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc, + "[xgmi_write_data_acc]")) + ); + + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "[current_gfxclk]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "[current_socclk]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + "[current_vclk0]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "[current_dclk0]")) + ); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk")) + ); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return status_code; +} + + void GpuMetricsBase_v14_t::dump_internal_metrics_table() { std::ostringstream ostrstream; @@ -827,6 +1167,10 @@ rsmi_status_t 
init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m std::end(rsmi_gpu_metrics.vcn_activity), init_max_uint_types()); + std::fill(std::begin(rsmi_gpu_metrics.jpeg_activity), + std::end(rsmi_gpu_metrics.jpeg_activity), + init_max_uint_types()); + rsmi_gpu_metrics.gfxclk_lock_status = init_max_uint_types(); rsmi_gpu_metrics.xgmi_link_width = init_max_uint_types(); rsmi_gpu_metrics.xgmi_link_speed = init_max_uint_types(); @@ -836,35 +1180,33 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m rsmi_gpu_metrics.pcie_replay_count_acc = init_max_uint_types(); rsmi_gpu_metrics.pcie_replay_rover_count_acc = init_max_uint_types(); - std::fill_n(&rsmi_gpu_metrics.xgmi_read_data_acc[0], - (sizeof(rsmi_gpu_metrics.xgmi_read_data_acc) / - sizeof(rsmi_gpu_metrics.xgmi_read_data_acc[0])), - std::numeric_limits::max()); + std::fill(std::begin(rsmi_gpu_metrics.xgmi_read_data_acc), + std::end(rsmi_gpu_metrics.xgmi_read_data_acc), + init_max_uint_types()); - std::fill_n(&rsmi_gpu_metrics.xgmi_write_data_acc[0], - (sizeof(rsmi_gpu_metrics.xgmi_write_data_acc) / - sizeof(rsmi_gpu_metrics.xgmi_write_data_acc[0])), - std::numeric_limits::max()); + std::fill(std::begin(rsmi_gpu_metrics.xgmi_write_data_acc), + std::end(rsmi_gpu_metrics.xgmi_write_data_acc), + init_max_uint_types()); - std::fill_n(&rsmi_gpu_metrics.current_gfxclks[0], - (sizeof(rsmi_gpu_metrics.current_gfxclks) / - sizeof(rsmi_gpu_metrics.current_gfxclks[0])), - std::numeric_limits::max()); + std::fill(std::begin(rsmi_gpu_metrics.current_gfxclks), + std::end(rsmi_gpu_metrics.current_gfxclks), + init_max_uint_types()); - std::fill_n(&rsmi_gpu_metrics.current_socclks[0], - (sizeof(rsmi_gpu_metrics.current_socclks) / - sizeof(rsmi_gpu_metrics.current_socclks[0])), - std::numeric_limits::max()); + std::fill(std::begin(rsmi_gpu_metrics.current_socclks), + std::end(rsmi_gpu_metrics.current_socclks), + init_max_uint_types()); - std::fill_n(&rsmi_gpu_metrics.current_vclk0s[0], - 
(sizeof(rsmi_gpu_metrics.current_vclk0s) / - sizeof(rsmi_gpu_metrics.current_vclk0s[0])), - std::numeric_limits::max()); + std::fill(std::begin(rsmi_gpu_metrics.current_vclk0s), + std::end(rsmi_gpu_metrics.current_vclk0s), + init_max_uint_types()); + + std::fill(std::begin(rsmi_gpu_metrics.current_dclk0s), + std::end(rsmi_gpu_metrics.current_dclk0s), + init_max_uint_types()); + + rsmi_gpu_metrics.pcie_nak_sent_count_acc = init_max_uint_types(); + rsmi_gpu_metrics.pcie_nak_rcvd_count_acc = init_max_uint_types(); - std::fill_n(&rsmi_gpu_metrics.current_dclk0s[0], - (sizeof(rsmi_gpu_metrics.current_dclk0s) / - sizeof(rsmi_gpu_metrics.current_dclk0s[0])), - std::numeric_limits::max()); ostrstream << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -876,6 +1218,195 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m return status_code; } +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v15_t::copy_internal_to_external_metrics() +{ + std::ostringstream ostrstream; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ostrstream << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ostrstream); + + auto copy_data_from_internal_metrics_tbl = [&]() { + AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. 
+ init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + + // Temperature + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + + // Power + metrics_public_init.current_socket_power = m_gpu_metrics_tbl.m_current_socket_power; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + + // vcn_activity + const auto vcn_activity_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_vcn_activity) - + std::begin(m_gpu_metrics_tbl.m_vcn_activity)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_vcn_activity), + vcn_activity_num_elems, + metrics_public_init.vcn_activity); + + // jpeg_activity + const auto jpeg_activity_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_jpeg_activity) - + std::begin(m_gpu_metrics_tbl.m_jpeg_activity)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_jpeg_activity), + jpeg_activity_num_elems, + metrics_public_init.jpeg_activity); + + // Power/Energy + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Throttle status + metrics_public_init.throttle_status = m_gpu_metrics_tbl.m_throttle_status; + + // Clock Lock Status. 
Each bit corresponds to clock instance + metrics_public_init.gfxclk_lock_status = m_gpu_metrics_tbl.m_gfxclk_lock_status; + + // Link width (number of lanes) and speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + // XGMI bus width and bitrate + metrics_public_init.xgmi_link_width = m_gpu_metrics_tbl.m_xgmi_link_width; + metrics_public_init.xgmi_link_speed = m_gpu_metrics_tbl.m_xgmi_link_speed; + + // Utilization Accumulated + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // PCIE accumulated bandwidth + metrics_public_init.pcie_bandwidth_acc = m_gpu_metrics_tbl.m_pcie_bandwidth_acc; + + // PCIE instantaneous bandwidth + metrics_public_init.pcie_bandwidth_inst = m_gpu_metrics_tbl.m_pcie_bandwidth_inst; + + // PCIE L0 to recovery state transition accumulated count + metrics_public_init.pcie_l0_to_recov_count_acc = m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc; + + // PCIE replay accumulated count + metrics_public_init.pcie_replay_count_acc = m_gpu_metrics_tbl.m_pcie_replay_count_acc; + + // PCIE replay rollover accumulated count + metrics_public_init.pcie_replay_rover_count_acc = m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc; + + // PCIE NAK sent accumulated count + metrics_public_init.pcie_nak_sent_count_acc = m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc; + + // PCIE NAK received accumulated count + metrics_public_init.pcie_nak_rcvd_count_acc = m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc; + + // XGMI accumulated data transfer size + // xgmi_read_data + const auto xgmi_read_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_read_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc), + xgmi_read_data_num_elems, + 
metrics_public_init.xgmi_read_data_acc); + // xgmi_write_data + const auto xgmi_write_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_write_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc), + xgmi_write_data_num_elems, + metrics_public_init.xgmi_write_data_acc); + + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + + // Current clocks + // current_gfxclk + const auto curr_gfxclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_gfxclk) - + std::begin(m_gpu_metrics_tbl.m_current_gfxclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_gfxclk), + curr_gfxclk_num_elems, + metrics_public_init.current_gfxclks); + + // current_socclk + const auto curr_socclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_socclk) - + std::begin(m_gpu_metrics_tbl.m_current_socclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_socclk), + curr_socclk_num_elems, + metrics_public_init.current_socclks); + + // current_vclk0 + const auto curr_vclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_vclk0) - + std::begin(m_gpu_metrics_tbl.m_current_vclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_vclk0), + curr_vclk0_num_elems, + metrics_public_init.current_vclk0s); + + // current_dclk0 + const auto curr_dclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_dclk0) - + std::begin(m_gpu_metrics_tbl.m_current_dclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_dclk0), + curr_dclk0_num_elems, + metrics_public_init.current_dclk0s); + + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.3) + metrics_public_init.current_gfxclk = metrics_public_init.current_gfxclks[0]; + // 
metrics_public_init.average_gfxclk_frequency = metrics_public_init.current_gfxclks[0]; + + metrics_public_init.current_socclk = metrics_public_init.current_socclks[0]; + // metrics_public_init.average_socclk_frequency = metrics_public_init.current_socclks[0]; + + metrics_public_init.current_vclk0 = metrics_public_init.current_vclk0s[0]; + // metrics_public_init.average_vclk0_frequency = metrics_public_init.current_vclk0s[0]; + + metrics_public_init.current_vclk1 = metrics_public_init.current_vclk0s[1]; + // metrics_public_init.average_vclk1_frequency = metrics_public_init.current_vclk0s[1]; + + metrics_public_init.current_dclk0 = metrics_public_init.current_dclk0s[0]; + // metrics_public_init.average_dclk0_frequency = metrics_public_init.current_dclk0s[0]; + + metrics_public_init.current_dclk1 = metrics_public_init.current_dclk0s[1]; + // metrics_public_init.average_dclk1_frequency = metrics_public_init.current_dclk0s[1]; + + return metrics_public_init; + }(); + + ostrstream << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ostrstream); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); +} AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v14_t::copy_internal_to_external_metrics() { @@ -2154,11 +2685,9 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data() // Check if/when metrics table needs to be refreshed. 
auto now_ts = actual_timestamp_in_secs(); - if (((!m_gpu_metrics_header.m_structure_size) || - (!m_gpu_metrics_header.m_format_revision) || - (!m_gpu_metrics_header.m_content_revision)) || - ((now_ts - m_gpu_metrics_updated_timestamp) >= - kRSMI_GPU_METRICS_EXPIRATION_SECS)) { + if ((!m_gpu_metrics_header.m_structure_size) || + (!m_gpu_metrics_header.m_format_revision) || + (!m_gpu_metrics_header.m_content_revision)) { auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, sizeof(AMDGpuMetricsHeader_v1_t), &m_gpu_metrics_header); @@ -2617,7 +3146,7 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met << " | Returning = " << getRSMIStatusString(status_code) << " |"; - LOG_ERROR(ostrstream); + LOG_TRACE(ostrstream); return status_code; } } @@ -2727,7 +3256,7 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit << " | Returning = " << getRSMIStatusString(status_code) << " |"; - LOG_ERROR(ostrstream); + LOG_TRACE(ostrstream); return status_code; } diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc index 7365fe9cfc..612482aee0 100755 --- a/rocm_smi/src/rocm_smi_main.cc +++ b/rocm_smi/src/rocm_smi_main.cc @@ -52,8 +52,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -391,10 +391,6 @@ RocmSMI::Initialize(uint64_t flags) { << "\n | final update: device->bdfid() holds correct device bdf"; LOG_TRACE(ss); } - if (ret != 0) { - throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, - "Failed to initialize rocm_smi library (amdgpu node discovery)."); - } std::shared_ptr dev; // Sort index based on the BDF, collect BDF id firstly. @@ -437,6 +433,7 @@ RocmSMI::Initialize(uint64_t flags) { for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++) io_link_map_[it->first] = it->second; + // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. 
auto dev_iter = devices_.begin(); @@ -480,6 +477,7 @@ RocmSMI::Initialize(uint64_t flags) { if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { logSystemDetails(); } + // Leaving below to help debug temp file issues // displayAppTmpFilesContent(); std::string amdGPUDeviceList = displayAllDevicePaths(devices_); diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index dfba3e687d..f9589be8f8 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -42,11 +42,14 @@ */ #define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see // _GNU_SOURCE functions which check +#include +#include +#include +#include #include -#include #include #include -#include +#include #include #include