Merge rocm-smi/amd-staging into amd-dev 20240119

Change-Id: Ie706473ff92a91b19e95d2d58f64904cad73a89a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 6132074089]
Tá an tiomantas seo le fáil i:
Charis Poag
2024-01-19 02:45:37 -06:00
tuismitheoir c081e9e6f8
tiomantas c5ba765be0
D'athraigh 23 comhad le 1083 breiseanna agus 218 scriosta
+4
Féach ar an gComhad
@@ -0,0 +1,4 @@
---
Language: Cpp
BasedOnStyle: Google
ColumnLimit: 100
+25
Féach ar an gComhad
@@ -0,0 +1,25 @@
# THIS FILE IS GENERATED FROM .clangd!
# Run ./.update-clang-tidy.sh to regenerate.
Checks:
bugprone*,
clang-analyzer*,
google*,
misc*,
modernize*,
-abseil*,
-bugprone-easily-swappable-parameters,
-bugprone-reserved-identifier,
-clang-analyzer-security.insecureAPI.strcpy,
-cppcoreguidelines*,
-cppcoreguidelines-pro*,
-misc-non-copyable-objects,
-misc-use-anonymous-namespace,
-modernize-avoid-c-arrays,
-modernize-redundant-void-arg,
-modernize-use-auto,
-modernize-use-nodiscard,
-modernize-use-noexcept,
-modernize-use-trailing-return-type,
-modernize-use-using,
-performance*,
-readability*,
+37
Féach ar an gComhad
@@ -0,0 +1,37 @@
CompileFlags:
Remove: -W*
Add: [-Wall, -pedantic, -I/opt/rocm/include, -I/opt/rocm/include/hsa, -I/opt/rocm/include/rocprofiler]
Compiler: clang++
# list here: https://clang.llvm.org/extra/clang-tidy/checks/list.html
Diagnostics:
UnusedIncludes: Strict
# rules below are copied into .clang-tidy using ./.update-clang-tidy.sh
# please keep the rules sorted alphabetically
ClangTidy:
Add: [
bugprone*,
clang-analyzer*,
google*,
misc*,
modernize*,
]
Remove: [
abseil*,
bugprone-easily-swappable-parameters,
bugprone-reserved-identifier,
clang-analyzer-security.insecureAPI.strcpy,
cppcoreguidelines*,
cppcoreguidelines-pro*,
misc-non-copyable-objects,
misc-use-anonymous-namespace,
modernize-avoid-c-arrays,
modernize-redundant-void-arg,
modernize-use-auto,
modernize-use-nodiscard,
modernize-use-noexcept,
modernize-use-trailing-return-type,
modernize-use-using,
performance*,
readability*,
]
+1
Féach ar an gComhad
@@ -13,3 +13,4 @@ indent_style = space
charset = utf-8
indent_style = space
indent_size = 2
max_line_length = 100
+5
Féach ar an gComhad
@@ -34,3 +34,8 @@ device/
# misc
esmi_ib_library/
# do NOT ignore these files
!.clang-format
!.clang-tidy
!.clangd
+30
Féach ar an gComhad
@@ -0,0 +1,30 @@
# - How to use:
# python3 -m pip install pre-commit
# pre-commit install --install-hooks
# Upon a new commit - the hooks should automagically run
#
# - How to skip:
# git commit --no-verify
# or
# SKIP=clang-format-docker git commit
# SKIP=cpplint-docker git commit
fail_fast: false
repos:
# For portability I decided to use Docker containers
- repo: https://github.com/dmitrii-galantsev/pre-commit-docker-cpplint
rev: 0.0.3
hooks:
- id: clang-format-docker
- id: cpplint-docker
# Below is a local way of running formatters and linters
# NOTE: clang-tidy is not used in the above tests
# - repo: https://github.com/pocc/pre-commit-hooks
# rev: v1.3.5
# hooks:
# - id: clang-format
# args: [--no-diff, -i]
# - id: clang-tidy
# args: [-p=build, --quiet]
# - id: cpplint
# args: [--verbose=5]
+36
Féach ar an gComhad
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Regenerate .clang-tidy from the ClangTidy "Add"/"Remove" lists in .clangd.
#
# Usage: ./.update-clang-tidy.sh   (run from the directory that holds .clangd)
#
# Entries of the "Add: [" list are emitted as enabled checks; entries of the
# "Remove: [" list are emitted as disabled checks (prefixed with "-").
# The result is echoed to stdout and written to .clang-tidy via tee.
set -x          # trace
set -e          # exit immediately if command fails
set -u          # exit if an undefined variable is found
set -o pipefail # make an awk failure fail the script even though tee succeeds
awk '
BEGIN {
  print "# THIS FILE IS GENERATED FROM .clangd!"
  print "# Run ./.update-clang-tidy.sh to regenerate."
  print "Checks:"
}
# A line ending in "Add: [" opens the list of checks to enable.
/Add: \[$/ {
  a = 1
  next
}
# Any line containing "]" closes whichever list is currently open.
/]/ {
  a = 0
  r = 0
}
# Inside the Add list: normalise the leading indentation and emit the entry.
# [[:space:]] is the POSIX spelling of the gawk-only \s escape.
a {
  gsub(/^[[:space:]]+/, " ")
  print
}
# A line ending in "Remove: [" opens the list of checks to disable.
/Remove: \[$/ {
  r = 1
  next
}
# Inside the Remove list: same normalisation, but prefix the entry with "-".
r {
  gsub(/^[[:space:]]+/, " -")
  print
}
' .clangd | tee .clang-tidy
+5 -1
Féach ar an gComhad
@@ -1,21 +1,25 @@
# Change Log for ROCm SMI Library
Full documentation for rocm_smi_lib is available at [https://docs.amd.com/](https://docs.amd.com/category/SMI%20API%20Guides).
Full documentation for rocm_smi_lib is available at [https://docs.amd.com/](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/).
## rocm_smi_lib for ROCm 5.5.0
### Optimizations
- Add new test to measure api execution time.
- Remove the shared mutex if no process is using it.
### Added
- ROCm SMI CLI: Add --showtempgraph Feature.
### Changed
- Relying on vendor ID to detect AMDGPU.
- Change pragma message to warning for backward compatibility.
### Fixed
- Fix --showproductname when device's SKU cannot be parsed out of the VBIOS string.
- Fix compile error: memcpy was not declared.
- Fix order of CE and UE reporting in ROCm SMI CLI.
+3
Féach ar an gComhad
@@ -0,0 +1,3 @@
set noparent
linelength=100
filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard
+10 -10
Féach ar an gComhad
@@ -919,16 +919,6 @@ amdsmi_process_handle_t = ctypes.c_uint32
class struct_amdsmi_proc_info_t(Structure):
pass
class struct_engine_usage_(Structure):
pass
struct_engine_usage_._pack_ = 1 # source:False
struct_engine_usage_._fields_ = [
('gfx', ctypes.c_uint64),
('enc', ctypes.c_uint64),
('reserved', ctypes.c_uint32 * 12),
]
class struct_memory_usage_(Structure):
pass
@@ -940,6 +930,16 @@ struct_memory_usage_._fields_ = [
('reserved', ctypes.c_uint32 * 10),
]
class struct_engine_usage_(Structure):
pass
struct_engine_usage_._pack_ = 1 # source:False
struct_engine_usage_._fields_ = [
('gfx', ctypes.c_uint64),
('enc', ctypes.c_uint64),
('reserved', ctypes.c_uint32 * 12),
]
struct_amdsmi_proc_info_t._pack_ = 1 # source:False
struct_amdsmi_proc_info_t._fields_ = [
('name', ctypes.c_char * 32),
+5 -4
Féach ar an gComhad
@@ -40,11 +40,12 @@ if(${ROCM_PATCH_VERSION})
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
else()
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")
endif()
set(${ROCM_SMI}_VERSION_MAJOR "${VERSION_MAJOR}")
set(${ROCM_SMI}_VERSION_MINOR "${VERSION_MINOR}")
set(${ROCM_SMI}_VERSION_PATCH "0")
endif ()
set(${ROCM_SMI}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}")
set(${ROCM_SMI}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}")
set(${ROCM_SMI}_VERSION_PATCH "${CPACK_PACKAGE_VERSION_PATCH}")
set(${ROCM_SMI}_VERSION_BUILD "0")
set(${ROCM_SMI}_VERSION_HASH "${PKG_VERSION_HASH}")
message("SOVERSION: ${SO_VERSION_STRING}")
# Create a configure file to get version info from within library
@@ -745,8 +745,8 @@ auto print_error_or_value(rsmi_status_t status_code, const T& metric) {
return str_values;
}
else if constexpr ((std::is_same_v<T, std::uint16_t>) ||
(std::is_same_v<T, std::uint32_t>) ||
(std::is_same_v<T, std::uint64_t>)) {
(std::is_same_v<T, std::uint32_t>) ||
(std::is_same_v<T, std::uint64_t>)) {
return std::to_string(metric);
}
}
+33 -11
Féach ar an gComhad
@@ -80,6 +80,7 @@ extern "C" {
//! The number of points that make up a voltage-frequency curve definition
#define RSMI_NUM_VOLTAGE_CURVE_POINTS 3
/**
* @brief Error codes returned by rocm_smi_lib functions
*/
@@ -353,7 +354,7 @@ typedef struct {
* Clock types
*/
typedef enum {
RSMI_CLK_TYPE_SYS = 0x0, //!< System clock
RSMI_CLK_TYPE_SYS = 0x0, //!< System clock
RSMI_CLK_TYPE_FIRST = RSMI_CLK_TYPE_SYS,
RSMI_CLK_TYPE_DF, //!< Data Fabric clock (for ASICs
//!< running on a separate clock)
@@ -970,6 +971,9 @@ struct metrics_table_header_t {
uint8_t content_revision;
/// \endcond
};
/// \cond Ignore in docs.
typedef struct metrics_table_header_t metrics_table_header_t;
/// \endcond
/**
* @brief The following structure holds the gpu metrics values for a device.
@@ -986,9 +990,14 @@ struct metrics_table_header_t {
#define RSMI_NUM_HBM_INSTANCES 4
/**
* @brief This should match kRSMI_MAX_NUM_VCN
* @brief This should match kRSMI_MAX_NUM_VCNS
*/
#define RSMI_MAX_NUM_VCN 4
#define RSMI_MAX_NUM_VCNS 4
/**
* @brief This should match kRSMI_MAX_JPEG_ENGINES
*/
#define RSMI_MAX_NUM_JPEG_ENGS 32
/**
* @brief This should match kRSMI_MAX_NUM_CLKS
@@ -1109,7 +1118,7 @@ typedef struct {
uint16_t current_socket_power;
// Utilization (%)
uint16_t vcn_activity[RSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)
// Clock Lock Status. Each bit corresponds to clock instance
uint32_t gfxclk_lock_status;
@@ -1143,6 +1152,19 @@ typedef struct {
uint16_t current_vclk0s[RSMI_MAX_NUM_CLKS];
uint16_t current_dclk0s[RSMI_MAX_NUM_CLKS];
/*
* v1.5 additions
*/
// JPEG activity percent (encode/decode)
uint16_t jpeg_activity[RSMI_MAX_NUM_JPEG_ENGS];
// PCIE NAK sent accumulated count
uint32_t pcie_nak_sent_count_acc;
// PCIE NAK received accumulated count
uint32_t pcie_nak_rcvd_count_acc;
/// \endcond
} rsmi_gpu_metrics_t;
@@ -1358,7 +1380,7 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision);
* @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid
*
*/
rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku);
rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku);
/**
* @brief Get the device vendor id associated with the device with provided
@@ -1733,7 +1755,6 @@ rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id);
*/
rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id);
/**
* @brief Get the XGMI physical id associated with the device
*
@@ -4097,7 +4118,7 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst,
/** @} */ // end of HWTopo
/*****************************************************************************/
/** @defgroup compute_partition Compute Partition Functions
/** @defgroup ComputePartition Compute Partition Functions
* These functions are used to configure and query the device's
* compute partition setting.
* @{
@@ -4182,10 +4203,10 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
*/
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind);
/** @} */ // end of compute_partition
/** @} */ // end of ComputePartition
/*****************************************************************************/
/** @defgroup memory_partition Memory Partition Functions
/** @defgroup memory_partition The Memory Partition Functions
* These functions are used to query and set the device's current memory
* partition.
* @{
@@ -4627,7 +4648,8 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind);
* Metric multi-valued counter types
*/
typedef uint16_t GPUMetricTempHbm_t[RSMI_NUM_HBM_INSTANCES];
typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCN];
typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCNS];
typedef uint16_t GPUMetricJpegActivity_t[RSMI_MAX_NUM_JPEG_ENGS];
typedef uint64_t GPUMetricXgmiReadDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS];
typedef uint64_t GPUMetricXgmiWriteDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS];
typedef uint16_t GPUMetricCurrGfxClk_t[RSMI_MAX_NUM_GFX_CLKS];
@@ -5113,7 +5135,7 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu
*
* @param[inout] vcn_activity_value a pointer to uint16_t to which the device gpu
* metric unit will be stored
* - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCN)
* - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCNS)
* element array (GPUMetricVcnActivity_t)
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
@@ -255,6 +255,7 @@ class Device {
rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics);
AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics();
private:
std::shared_ptr<Monitor> monitor_;
std::shared_ptr<PowerMon> power_monitor_;
@@ -277,7 +278,6 @@ class Device {
bool returnWriteErr = false);
rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query);
uint64_t bdfid_;
uint64_t kfd_gpu_id_;
std::unordered_set<rsmi_event_group_t,
@@ -92,7 +92,10 @@ constexpr uint32_t kRSMI_MAX_NUM_GFX_CLKS = 8;
constexpr uint32_t kRSMI_MAX_NUM_CLKS = 4;
// Note: This *must* match NUM_VCN
constexpr uint32_t kRSMI_MAX_NUM_VCN = 4;
constexpr uint32_t kRSMI_MAX_NUM_VCNS = 4;
// Note: This *must* match NUM_JPEG_ENG
constexpr uint32_t kRSMI_MAX_JPEG_ENGINES = 32;
struct AMDGpuMetricsHeader_v1_t
@@ -326,7 +329,7 @@ struct AMDGpuMetrics_v14_t
// Utilization (%)
uint16_t m_average_gfx_activity;
uint16_t m_average_umc_activity; // memory controller
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)
// Energy (15.259uJ (2^-16) units)
uint64_t m_energy_accumulator;
@@ -383,7 +386,89 @@ struct AMDGpuMetrics_v14_t
uint16_t m_padding;
};
using AMGpuMetricsLatest_t = AMDGpuMetrics_v14_t;
/**
 * GPU metrics table, layout version 1.5.
 *
 * Carries the JPEG engine activity array and the PCIe NAK sent/received
 * accumulators, which this file tags as v1.5 metrics.
 *
 * NOTE(review): member order and widths presumably mirror the binary table
 * provided by the driver - confirm before reordering or resizing any field.
 */
struct AMDGpuMetrics_v15_t
{
~AMDGpuMetrics_v15_t() = default;
// Common header placed at the start of the table.
struct AMDGpuMetricsHeader_v1_t m_common_header;
// Temperature (Celsius). It will be zero (0) if unsupported.
uint16_t m_temperature_hotspot;
uint16_t m_temperature_mem;
uint16_t m_temperature_vrsoc;
// Power (Watts)
uint16_t m_current_socket_power;
// Utilization (%)
uint16_t m_average_gfx_activity;
uint16_t m_average_umc_activity; // memory controller
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)
// One slot per JPEG engine, kRSMI_MAX_JPEG_ENGINES slots in total.
uint16_t m_jpeg_activity[kRSMI_MAX_JPEG_ENGINES]; // JPEG activity percent (encode/decode)
// Energy (15.259uJ (2^-16) units)
uint64_t m_energy_accumulator;
// Driver attached timestamp (in ns)
uint64_t m_system_clock_counter;
// Throttle status
uint32_t m_throttle_status;
// Clock Lock Status. Each bit corresponds to clock instance
uint32_t m_gfxclk_lock_status;
// Link width (number of lanes) and speed (in 0.1 GT/s)
uint16_t m_pcie_link_width;
uint16_t m_pcie_link_speed; // in 0.1 GT/s
// XGMI bus width and bitrate (in Gbps)
uint16_t m_xgmi_link_width;
uint16_t m_xgmi_link_speed;
// Utilization Accumulated (%)
uint32_t m_gfx_activity_acc;
uint32_t m_mem_activity_acc;
// PCIE accumulated bandwidth (GB/sec)
uint64_t m_pcie_bandwidth_acc;
// PCIE instantaneous bandwidth (GB/sec)
uint64_t m_pcie_bandwidth_inst;
// PCIE L0 to recovery state transition accumulated count
uint64_t m_pcie_l0_to_recov_count_acc;
// PCIE replay accumulated count
uint64_t m_pcie_replay_count_acc;
// PCIE replay rollover accumulated count
uint64_t m_pcie_replay_rover_count_acc;
// PCIE NAK sent accumulated count
uint32_t m_pcie_nak_sent_count_acc;
// PCIE NAK received accumulated count
uint32_t m_pcie_nak_rcvd_count_acc;
// XGMI accumulated data transfer size (KiloBytes), one entry per XGMI link slot
uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
// PMFW attached timestamp (10ns resolution)
uint64_t m_firmware_timestamp;
// Current clocks (MHz)
uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS];
uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_uclk;
// NOTE(review): presumably explicit tail padding for alignment - confirm.
uint16_t m_padding;
};
using AMGpuMetricsLatest_t = AMDGpuMetrics_v15_t;
/**
* This is GPU Metrics version that gets to public access.
@@ -410,6 +495,9 @@ using GPUMetricTempHbmTbl_t = GpuMetricU16Tbl_t;
using GPUMetricVcnActivity_t = decltype(AMDGpuMetrics_v14_t::m_vcn_activity);
using GPUMetricVcnActivityTbl_t = GpuMetricU16Tbl_t;
using GPUMetricJpegActivity_t = decltype(AMDGpuMetrics_v15_t::m_jpeg_activity);
using GPUMetricJpegActivityTbl_t = GpuMetricU16Tbl_t;
using GPUMetricXgmiReadDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_read_data_acc);
using GPUMetricXgmiWriteDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_write_data_acc);
using GPUMetricXgmiAccTbl_t = GpuMetricU64Tbl_t;
@@ -518,6 +606,7 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
kMetricGfxActivityAccumulator,
kMetricMemActivityAccumulator,
kMetricVcnActivity, //v1.4
kMetricJpegActivity, //v1.5
// kGpuMetricAverageClock counters
kMetricAvgGfxClockFrequency,
@@ -559,6 +648,8 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
kMetricPcieL0RecovCountAccumulator, //v1.4
kMetricPcieReplayCountAccumulator, //v1.4
kMetricPcieReplayRollOverCountAccumulator, //v1.4
kMetricPcieNakSentCountAccumulator, //v1.5
kMetricPcieNakReceivedCountAccumulator, //v1.5
// kGpuMetricPowerEnergy counters
kMetricAvgSocketPower,
@@ -608,6 +699,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t
kGpuMetricV12 = (0x1 << 2),
kGpuMetricV13 = (0x1 << 3),
kGpuMetricV14 = (0x1 << 4),
kGpuMetricV15 = (0x1 << 5),
};
using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>;
using GpuMetricTypePtr_t = std::shared_ptr<void>;
@@ -780,6 +872,40 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t
};
class GpuMetricsBase_v15_t final : public GpuMetricsBase_t
{
public:
~GpuMetricsBase_v15_t() = default;
size_t sizeof_metric_table() override {
return sizeof(AMDGpuMetrics_v15_t);
}
GpuMetricTypePtr_t get_metrics_table() override
{
if (!m_gpu_metric_ptr) {
m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v15_t*){});
}
assert(m_gpu_metric_ptr != nullptr);
return m_gpu_metric_ptr;
}
void dump_internal_metrics_table() override;
AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override
{
return AMDGpuMetricVersionFlags_t::kGpuMetricV15;
}
rsmi_status_t populate_metrics_dynamic_tbl() override;
AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;
private:
AMDGpuMetrics_v15_t m_gpu_metrics_tbl;
std::shared_ptr<AMDGpuMetrics_v15_t> m_gpu_metric_ptr;
};
template<typename T>
rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value);
+16 -17
Féach ar an gComhad
@@ -29,10 +29,12 @@ from rsmiBindings import *
# Major version - Increment when backwards-compatibility breaks
# Minor version - Increment when adding a new feature, set to 0 when major is incremented
# Patch version - Increment when adding a fix, set to 0 when minor is incremented
SMI_MAJ = 1
SMI_MIN = 5
# Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi
SMI_MAJ = 2
SMI_MIN = 0
SMI_PAT = 0
__version__ = '%s.%s.%s' % (SMI_MAJ, SMI_MIN, SMI_PAT)
# SMI_HASH is provided by rsmiBindings
__version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH)
# Set to 1 if an error occurs
RETCODE = 0
@@ -828,23 +830,20 @@ def printTableRow(space, displayString, v_delim=" "):
def checkIfSecondaryDie(device):
""" Checks if GCD(die) is the secondary die in a MCM.
MI200 device specific feature check.
The secondary dies lacks power management features.
Secondary dies lack power management features.
TODO: switch to more robust way to check for primary/secondary die, when implemented in Kernel and rocm_smi_lib.
@param device: The device to check
"""
power_cap = c_uint64()
# secondary die can currently be determined by checking if all power1_* (power cap) values are equal to zero.
ret = rocmsmi.rsmi_dev_power_cap_get(device, 0, byref(power_cap))
if not (rsmi_ret_ok(ret, None, 'get_power_cap', False) and power_cap.value == 0):
return False
ret = rocmsmi.rsmi_dev_power_cap_default_get(device, byref(power_cap))
if not (rsmi_ret_ok(ret, None, 'get_power_cap_default', False) and power_cap.value == 0):
return False
ret = rocmsmi.rsmi_dev_power_ave_get(device, 0, byref(power_cap))
if not (rsmi_ret_ok(ret, None, 'get_power_avg', False) and power_cap.value == 0):
return False
return True
energy_count = c_uint64()
counter_resoution = c_float()
timestamp = c_uint64()
# secondary die can be determined by checking if energy counter == 0
ret = rocmsmi.rsmi_dev_energy_count_get(device, byref(energy_count), byref(counter_resoution), byref(timestamp))
if (rsmi_ret_ok(ret, None, 'energy_count_secondary_die_check', silent=False)) and (energy_count.value == 0):
return True
return False
def resetClocks(deviceList):
@@ -55,6 +55,8 @@ dv_id = c_uint64()
# GPU ID
gpu_id = c_uint32(0)
SMI_HASH = '@PKG_VERSION_HASH@'
# Policy enums
RSMI_MAX_NUM_FREQUENCIES = 33
+151 -112
Féach ar an gComhad
@@ -594,7 +594,7 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", returning get_dev_value_line() response = "
<< getRSMIStatusString(ret);
<< amd::smi::getRSMIStatusString(ret);
LOG_ERROR(ss);
return ret;
}
@@ -613,7 +613,7 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind,
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", returning strtoul() response = "
<< getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(errno));
<< amd::smi::getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(errno));
LOG_TRACE(ss);
return amd::smi::ErrnoToRsmiStatus(errno);
@@ -667,7 +667,7 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block,
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", returning rsmi_dev_ecc_enabled_get() response = "
<< getRSMIStatusString(ret);
<< amd::smi::getRSMIStatusString(ret);
LOG_ERROR(ss);
return ret;
}
@@ -728,7 +728,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
default:
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", default case -> reporting "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED);
<< amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED);
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
@@ -748,7 +748,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
if (ret != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", GetDevValueVec() ret was not RSMI_STATUS_SUCCESS"
<< " -> reporting " << getRSMIStatusString(ret);
<< " -> reporting " << amd::smi::getRSMIStatusString(ret);
LOG_ERROR(ss);
return ret;
}
@@ -767,7 +767,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block,
fs2 >> ec->correctable_err;
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << getRSMIStatusString(ret);;
<< ", reporting " << amd::smi::getRSMIStatusString(ret);;
LOG_TRACE(ss);
return ret;
CATCH
@@ -935,7 +935,7 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
ret = get_id(dv_ind, amd::smi::kDevDevID, id);
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << getRSMIStatusString(ret);
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
LOG_TRACE(ss);
return ret;
}
@@ -950,7 +950,7 @@ rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id) {
ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id);
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << getRSMIStatusString(ret);
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
LOG_TRACE(ss);
return ret;
}
@@ -965,7 +965,7 @@ rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) {
ret = get_id(dv_ind, amd::smi::kDevDevRevID, revision);
outss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << getRSMIStatusString(ret);
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
LOG_TRACE(outss);
return ret;
}
@@ -980,7 +980,7 @@ rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) {
CHK_SUPPORT_NAME_ONLY(id)
ret = get_id(dv_ind, amd::smi::kDevDevProdNum, id);
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << getRSMIStatusString(ret);
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
LOG_TRACE(ss);
return ret;
CATCH
@@ -4045,6 +4045,7 @@ rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind,
if (ret != RSMI_STATUS_SUCCESS) {
return ret;
}
uint32_t ln = static_cast<uint32_t>(val_str.copy(serial_num, len));
serial_num[std::min(len - 1, ln)] = '\0';
@@ -5125,15 +5126,11 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
<< " | Cause: device board name does not support this action"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |";
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
std::string newMemoryPartition
= mapRSMIToStringMemoryPartitionTypes.at(memory_partition);
std::string currentMemoryPartition;
switch (memory_partition) {
case RSMI_MEMORY_PARTITION_NPS1:
case RSMI_MEMORY_PARTITION_NPS2:
@@ -5154,6 +5151,9 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
LOG_ERROR(ss);
return RSMI_STATUS_INVALID_ARGS;
}
std::string newMemoryPartition
= mapRSMIToStringMemoryPartitionTypes.at(memory_partition);
std::string currentMemoryPartition;
// do nothing if memory_partition is the current mode
rsmi_status_t ret_get = get_memory_partition(dv_ind, currentMemoryPartition);
@@ -5196,13 +5196,16 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
if (amd::smi::ErrnoToRsmiStatus(ret) != RSMI_STATUS_SUCCESS) {
rsmi_status_t err = amd::smi::ErrnoToRsmiStatus(ret);
if (ret == EACCES) {
err = RSMI_STATUS_NOT_SUPPORTED; // already verified permissions
}
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
<< devInfoTypesStrings.at(amd::smi::kDevMemoryPartition)
<< " | Cause: issue writing requested setting of " + newMemoryPartition
<< " | Cause: issue writing reqested setting of " + newMemoryPartition
<< " | Returning = "
<< getRSMIStatusString(err) << " |";
LOG_ERROR(ss);
@@ -6027,7 +6030,6 @@ rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_v
CATCH
}
rsmi_status_t
rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value)
{
@@ -6541,7 +6543,6 @@ rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwid
CATCH
}
rsmi_status_t
rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value)
{
@@ -6666,19 +6667,24 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm);
amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{};
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
const auto max_num_elems =
static_cast<uint16_t>(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value));
std::copy_n(std::begin(tmp_hbl_tbl), max_num_elems, *temp_hbm_value);
}
const auto max_num_elems =
static_cast<uint16_t>(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_hbl_tbl.size()) ? max_num_elems : tmp_hbl_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << tmp_hbl_tbl.size()
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_hbl_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Copy size: " << copy_size
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(temp_hbm_value, 0, sizeof(*temp_hbm_value));
std::copy_n(std::begin(tmp_hbl_tbl), copy_size, *temp_hbm_value);
}
return status_code;
CATCH
@@ -6700,19 +6706,24 @@ rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_a
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity);
amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{};
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
const auto max_num_elems =
static_cast<uint16_t>(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value));
std::copy_n(std::begin(tmp_vcn_tbl), max_num_elems, *vcn_activity_value);
}
const auto max_num_elems =
static_cast<uint16_t>(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_vcn_tbl.size()) ? max_num_elems : tmp_vcn_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << tmp_vcn_tbl.size()
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_vcn_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Copy size: " << copy_size
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(vcn_activity_value, 0, sizeof(*vcn_activity_value));
std::copy_n(std::begin(tmp_vcn_tbl), copy_size, *vcn_activity_value);
}
return status_code;
CATCH
@@ -6734,19 +6745,24 @@ rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t*
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator);
amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{};
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
const auto max_num_elems =
static_cast<uint16_t>(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value));
std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_read_data_acc_value);
}
const auto max_num_elems =
static_cast<uint16_t>(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_xgmi_acc_tbl.size()) ? max_num_elems : tmp_xgmi_acc_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << tmp_xgmi_acc_tbl.size()
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_xgmi_acc_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Copy size: " << copy_size
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(xgmi_read_data_acc_value, 0, sizeof(*xgmi_read_data_acc_value));
std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_read_data_acc_value);
}
return status_code;
CATCH
@@ -6768,19 +6784,24 @@ rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator);
amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{};
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
const auto max_num_elems =
static_cast<uint16_t>(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value));
std::copy_n(std::begin(tmp_xgmi_acc_tbl), max_num_elems, *xgmi_write_data_acc_value);
}
const auto max_num_elems =
static_cast<uint16_t>(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_xgmi_acc_tbl.size()) ? max_num_elems : tmp_xgmi_acc_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << tmp_xgmi_acc_tbl.size()
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_xgmi_acc_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Copy size: " << copy_size
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(xgmi_write_data_acc_value, 0, sizeof(*xgmi_write_data_acc_value));
std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_write_data_acc_value);
}
return status_code;
CATCH
@@ -6800,26 +6821,28 @@ rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current
}
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock);
rsmi_gpu_metrics_t gpu = {};
auto status = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu);
if (status == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::copy_n(std::begin(gpu.current_gfxclks),
static_cast<uint16_t>(
sizeof(gpu.current_gfxclks)/sizeof(gpu.current_gfxclks[0])),
*current_gfxclk_value);
}
amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{};
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl);
const auto max_num_elems =
static_cast<uint16_t>(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_curr_gfxclk_tbl.size()) ? max_num_elems : tmp_curr_gfxclk_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << static_cast<uint16_t>(
sizeof(gpu.current_gfxclks)/sizeof(gpu.current_gfxclks[0]))
<< " | Returning = " << status << " "
<< getRSMIStatusString(status) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_curr_gfxclk_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Copy size: " << copy_size
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(current_gfxclk_value, 0, sizeof(*current_gfxclk_value));
std::copy_n(std::begin(tmp_curr_gfxclk_tbl), copy_size, *current_gfxclk_value);
}
return status;
return status_code;
CATCH
}
@@ -6839,19 +6862,23 @@ rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock);
amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{};
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
const auto max_num_elems =
static_cast<uint16_t>(std::end(*current_socclk_value) - std::begin(*current_socclk_value));
std::copy_n(std::begin(tmp_curr_socclk_tbl), max_num_elems, *current_socclk_value);
}
const auto max_num_elems =
static_cast<uint16_t>(std::end(*current_socclk_value) - std::begin(*current_socclk_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_curr_socclk_tbl.size()) ? max_num_elems : tmp_curr_socclk_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << tmp_curr_socclk_tbl.size()
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_curr_socclk_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(current_socclk_value, 0, sizeof(*current_socclk_value));
std::copy_n(std::begin(tmp_curr_socclk_tbl), copy_size, *current_socclk_value);
}
return status_code;
CATCH
@@ -6873,19 +6900,24 @@ rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_v
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0);
amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{};
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
const auto max_num_elems =
static_cast<uint16_t>(std::end(*current_vclk_value) - std::begin(*current_vclk_value));
std::copy_n(std::begin(tmp_curr_vclk0_tbl), max_num_elems, *current_vclk_value);
}
const auto max_num_elems =
static_cast<uint16_t>(std::end(*current_vclk_value) - std::begin(*current_vclk_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_curr_vclk0_tbl.size()) ? max_num_elems : tmp_curr_vclk0_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << tmp_curr_vclk0_tbl.size()
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_curr_vclk0_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Copy size: " << copy_size
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(current_vclk_value, 0, sizeof(*current_vclk_value));
std::copy_n(std::begin(tmp_curr_vclk0_tbl), copy_size, *current_vclk_value);
}
return status_code;
CATCH
@@ -6934,19 +6966,24 @@ rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_d
const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0);
amd::smi::GPUMetricCurrDClkTbl_t tmp_curr_dclk0_tbl;
auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
const auto max_num_elems =
static_cast<uint16_t>(std::end(*current_dclk_value) - std::begin(*current_dclk_value));
std::copy_n(std::begin(tmp_curr_dclk0_tbl), max_num_elems, *current_dclk_value);
}
const auto max_num_elems =
static_cast<uint16_t>(std::end(*current_dclk_value) - std::begin(*current_dclk_value));
const auto copy_size =
static_cast<uint16_t>((max_num_elems < tmp_curr_dclk0_tbl.size()) ? max_num_elems : tmp_curr_dclk0_tbl.size());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | End Result "
<< " | Device #: " << dv_ind
<< " | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< " | Metric Size: " << tmp_curr_dclk0_tbl.size()
<< " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
<< "\n | ======= end ======= "
<< "\n | End Result "
<< "\n | Device #: " << dv_ind
<< "\n | Metric Type: " << static_cast<AMDGpuMetricTypeId_t>(gpu_metric_unit)
<< "\n | Metric Size: " << tmp_curr_dclk0_tbl.size()
<< "\n | Max num of elements: " << max_num_elems
<< "\n | Copy size: " << copy_size
<< "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |";
LOG_INFO(ostrstream);
if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) {
std::memset(current_dclk_value, 0, sizeof(*current_dclk_value));
std::copy_n(std::begin(tmp_curr_dclk0_tbl), copy_size, *current_dclk_value);
}
return status_code;
CATCH
@@ -7277,6 +7314,7 @@ rsmi_dev_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t* header
CATCH
}
rsmi_status_t
rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value)
{
@@ -7335,6 +7373,7 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind)
return status_code;
CATCH
}
//
// End of: new GPU Metrics related work.
//
+2 -1
Féach ar an gComhad
@@ -52,5 +52,6 @@
#define rocm_smi_VERSION_MINOR @rocm_smi_VERSION_MINOR@
#define rocm_smi_VERSION_PATCH @rocm_smi_VERSION_PATCH@
#define rocm_smi_VERSION_BUILD "@rocm_smi_VERSION_BUILD@"
#define rocm_smi_VERSION_HASH "@rocm_smi_VERSION_HASH@"
#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
#endif // INCLUDE_ROCM_SMI_ROCM_SMI64CONFIG_H_
+2 -2
Féach ar an gComhad
@@ -738,7 +738,7 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
<< " | " << (fs.fail() ? "[ERROR] Failed read - format error" :
"[GOOD] No fail - Successful read operation")
<< " | " << (fs.eof() ? "[ERROR] Failed read - EOF error" :
"[GOOD] No eof error - Successful read operation")
"[GOOD] No eof - Successful read operation")
<< " | " << (fs.good() ? "[GOOD] read good - Successful read operation" :
"[ERROR] Failed read - good error");
LOG_INFO(ss);
@@ -800,7 +800,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr,
<< " | " << (fs.fail() ? "[ERROR] Failed write - format error" :
"[GOOD] No fail - Successful write operation")
<< " | " << (fs.eof() ? "[ERROR] Failed write - EOF error" :
"[GOOD] No eof error - Successful write operation")
"[GOOD] No eof - Successful write operation")
<< " | " << (fs.good() ?
"[GOOD] Write good - Successful write operation" :
"[ERROR] Failed write - good error");
+576 -47
Féach ar an gComhad
@@ -163,6 +163,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl
{join_metrics_version(1, 2), AMDGpuMetricVersionFlags_t::kGpuMetricV12},
{join_metrics_version(1, 3), AMDGpuMetricVersionFlags_t::kGpuMetricV13},
{join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14},
{join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15},
};
/**
@@ -201,7 +202,8 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
{AMDGpuMetricsUnitType_t::kMetricAvgMmActivity, "AvgMmActivity"},
{AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, "GfxActivityAcc"},
{AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, "MemActivityAcc"},
{AMDGpuMetricsUnitType_t::kMetricVcnActivity, "VcnActivity"},
{AMDGpuMetricsUnitType_t::kMetricVcnActivity, "VcnActivity"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricJpegActivity, "JpegActivity"}, /* v1.5 */
// kGpuMetricAverageClock counters
{AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency, "AvgGfxClockFrequency"},
@@ -213,11 +215,11 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
{AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency, "AvgDClock1Frequency"},
// kGpuMetricCurrentClock counters
{AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, "CurrGfxClock"},
{AMDGpuMetricsUnitType_t::kMetricCurrSocClock, "CurrSocClock"},
{AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, "CurrGfxClock"}, /* v1.4: Changed to array */
{AMDGpuMetricsUnitType_t::kMetricCurrSocClock, "CurrSocClock"}, /* v1.4: Changed to array */
{AMDGpuMetricsUnitType_t::kMetricCurrUClock, "CurrUClock"},
{AMDGpuMetricsUnitType_t::kMetricCurrVClock0, "CurrVClock0"},
{AMDGpuMetricsUnitType_t::kMetricCurrDClock0, "CurrDClock0"},
{AMDGpuMetricsUnitType_t::kMetricCurrVClock0, "CurrVClock0"}, /* v1.4: Changed to array */
{AMDGpuMetricsUnitType_t::kMetricCurrDClock0, "CurrDClock0"}, /* v1.4: Changed to array */
{AMDGpuMetricsUnitType_t::kMetricCurrVClock1, "CurrVClock1"},
{AMDGpuMetricsUnitType_t::kMetricCurrDClock1, "CurrDClock1"},
@@ -226,7 +228,7 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
{AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus, "IndepThrottleStatus"},
// kGpuMetricGfxClkLockStatus counters
{AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, "GfxClkLockStatus"},
{AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, "GfxClkLockStatus"}, /* v1.4 */
// kGpuMetricCurrentFanSpeed counters
{AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed, "CurrFanSpeed"},
@@ -234,19 +236,21 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation
// kGpuMetricLinkWidthSpeed counters
{AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, "PcieLinkWidth"},
{AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, "PcieLinkSpeed"},
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, "PcieBandwidthAcc"},
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, "PcieBandwidthInst"},
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, "XgmiLinkWidth"},
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"},
{AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"},
{AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, "XgmiWriteDataAcc"},
{AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, "PcieL0RecovCountAcc"},
{AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, "PcieReplayCountAcc"},
{AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, "PcieReplayRollOverCountAcc"},
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, "PcieBandwidthAcc"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, "PcieBandwidthInst"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, "XgmiLinkWidth"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, "XgmiLinkSpeed"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, "XgmiReadDataAcc"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, "XgmiWriteDataAcc"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, "PcieL0RecovCountAcc"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, "PcieReplayCountAcc"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, "PcieReplayRollOverCountAcc"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, "PcieNakSentCountAcc"}, /* v1.5 */
{AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, "PcieNakRcvdCountAcc"}, /* v1.5 */
// kGpuMetricPowerEnergy counters
{AMDGpuMetricsUnitType_t::kMetricAvgSocketPower, "AvgSocketPower"},
{AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, "CurrSocketPower"},
{AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, "CurrSocketPower"}, /* v1.4 */
{AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, "EnergyAcc"},
// kGpuMetricVoltage counters
@@ -343,6 +347,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table
{AMDGpuMetricVersionFlags_t::kGpuMetricV12, std::make_shared<GpuMetricsBase_v12_t>(GpuMetricsBase_v12_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV13, std::make_shared<GpuMetricsBase_v13_t>(GpuMetricsBase_v13_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared<GpuMetricsBase_v14_t>(GpuMetricsBase_v14_t{})},
{AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared<GpuMetricsBase_v15_t>(GpuMetricsBase_v15_t{})},
};
GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version)
@@ -462,6 +467,341 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str
return multi_values;
}
/**
 * Dumps the internally cached v1.5 GPU metrics table for debugging.
 *
 * A start marker is written to std::cout; the header (metric version and
 * structure size) and every field of m_gpu_metrics_tbl are collected into a
 * single string stream which is then handed to LOG_DEBUG.
 *
 * Each scalar field is emitted exactly once, and array fields are emitted as
 * a title line followed by one "[index]: value" line per element.
 */
void GpuMetricsBase_v15_t::dump_internal_metrics_table()
{
  std::ostringstream ostrstream;

  // NOTE(review): the start marker goes straight to stdout rather than the
  // logger; kept as-is because callers may rely on seeing it on the console.
  std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n";

  // Emits the array title followed by one indexed line per element.
  const auto dump_array = [&ostrstream](const char* title, const auto& values) {
    ostrstream << title << "\n";
    auto idx = uint64_t(0);
    for (const auto& value : values) {
      ostrstream << "\t [" << idx << "]: " << value << "\n";
      ++idx;
    }
  };

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= DEBUG ======= "
             << " | Metric Version: " << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header)
             << " | Size: " << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size)
             << " |"
             << "\n";

  ostrstream << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n"
             << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n"
             << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n"
             << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n"
             << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n"
             << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n";

  dump_array(" vcn_activity: ", m_gpu_metrics_tbl.m_vcn_activity);
  dump_array(" jpeg_activity: ", m_gpu_metrics_tbl.m_jpeg_activity);

  // average_gfx_activity / average_umc_activity were already emitted above;
  // they are intentionally not repeated in this group.
  ostrstream << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n"
             << " system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n"
             << " throttle_status: " << m_gpu_metrics_tbl.m_throttle_status << "\n"
             << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n"
             << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n"
             << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n"
             << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n"
             << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n"
             << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n"
             << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n"
             << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n"
             << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n"
             << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n"
             << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n"
             << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n"
             << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n"
             << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n";

  dump_array(" xgmi_read_data_acc: ", m_gpu_metrics_tbl.m_xgmi_read_data_acc);
  dump_array(" xgmi_write_data_acc: ", m_gpu_metrics_tbl.m_xgmi_write_data_acc);

  ostrstream << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n";

  dump_array(" current_gfxclk: ", m_gpu_metrics_tbl.m_current_gfxclk);
  dump_array(" current_socclk: ", m_gpu_metrics_tbl.m_current_socclk);
  dump_array(" current_vclk0: ", m_gpu_metrics_tbl.m_current_vclk0);
  dump_array(" current_dclk0: ", m_gpu_metrics_tbl.m_current_dclk0);

  // current_uclk is published by populate_metrics_dynamic_tbl(), so it is
  // included in the dump as well.
  ostrstream << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n";

  ostrstream << " padding: " << m_gpu_metrics_tbl.m_padding << "\n";
  LOG_DEBUG(ostrstream);
}
/**
 * Fills m_metrics_dynamic_tbl from the cached v1.5 metrics table.
 *
 * Version-specific adjustments are applied to m_gpu_metrics_tbl first; every
 * metric is then formatted with format_metric_row() and inserted under its
 * metric class / unit type. Array-valued metrics carry a bracketed title.
 *
 * @return The status code, which this function never changes from
 *         RSMI_STATUS_SUCCESS.
 */
rsmi_status_t GpuMetricsBase_v15_t::populate_metrics_dynamic_tbl()
{
  using ClassId = AMDGpuMetricsClassId_t;
  using UnitType = AMDGpuMetricsUnitType_t;

  std::ostringstream ostrstream;
  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ostrstream);

  //
  // Note: Any metric treatment/changes (if any) should happen before they
  //       get written to internal/external tables.
  //
  auto run_metric_adjustments_v15 = [&]() {
    ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
    const auto version_in_use = translate_flag_to_metric_version(get_gpu_metrics_version_used());
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= info ======= "
               << " | Applying adjustments "
               << " | Metric Version: " << stringfy_metric_header_version(
                                             disjoin_metrics_version(version_in_use))
               << " |";
    LOG_TRACE(ostrstream);

    // firmware_timestamp is at 10ns resolution
    const auto adjusted_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10);
    ostrstream << __PRETTY_FUNCTION__
               << " | ======= Changes ======= "
               << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp
               << " to: " << adjusted_timestamp;
    m_gpu_metrics_tbl.m_firmware_timestamp = adjusted_timestamp;
    LOG_DEBUG(ostrstream);
  };

  // Adjustments/Changes specific to this version
  run_metric_adjustments_v15();

  // Formats one metric and inserts it under the given class / unit type.
  const auto publish_metric = [this](ClassId class_id,
                                     UnitType unit_type,
                                     const auto& metric,
                                     const std::string& title) {
    m_metrics_dynamic_tbl[class_id].insert(
        std::make_pair(unit_type, format_metric_row(metric, title)));
  };

  const auto& tbl = m_gpu_metrics_tbl;

  // Temperature Info
  publish_metric(ClassId::kGpuMetricTemperature, UnitType::kMetricTempHotspot,
                 tbl.m_temperature_hotspot, "temperature_hotspot");
  publish_metric(ClassId::kGpuMetricTemperature, UnitType::kMetricTempMem,
                 tbl.m_temperature_mem, "temperature_mem");
  publish_metric(ClassId::kGpuMetricTemperature, UnitType::kMetricTempVrSoc,
                 tbl.m_temperature_vrsoc, "temperature_vrsoc");

  // Power/Energy Info
  publish_metric(ClassId::kGpuMetricPowerEnergy, UnitType::kMetricCurrSocketPower,
                 tbl.m_current_socket_power, "curr_socket_power");
  publish_metric(ClassId::kGpuMetricPowerEnergy, UnitType::kMetricEnergyAccumulator,
                 tbl.m_energy_accumulator, "energy_acc");

  // Utilization Info
  publish_metric(ClassId::kGpuMetricUtilization, UnitType::kMetricAvgGfxActivity,
                 tbl.m_average_gfx_activity, "average_gfx_activity");
  publish_metric(ClassId::kGpuMetricUtilization, UnitType::kMetricAvgUmcActivity,
                 tbl.m_average_umc_activity, "average_umc_activity");
  publish_metric(ClassId::kGpuMetricUtilization, UnitType::kMetricVcnActivity,
                 tbl.m_vcn_activity, "[average_vcn_activity]");
  publish_metric(ClassId::kGpuMetricUtilization, UnitType::kMetricJpegActivity,
                 tbl.m_jpeg_activity, "[average_jpeg_activity]");
  publish_metric(ClassId::kGpuMetricUtilization, UnitType::kMetricGfxActivityAccumulator,
                 tbl.m_gfx_activity_acc, "gfx_activity_acc");
  publish_metric(ClassId::kGpuMetricUtilization, UnitType::kMetricMemActivityAccumulator,
                 tbl.m_mem_activity_acc, "mem_activity_acc");

  // Timestamp Info
  publish_metric(ClassId::kGpuMetricTimestamp, UnitType::kMetricTSFirmware,
                 tbl.m_firmware_timestamp, "firmware_timestamp");
  publish_metric(ClassId::kGpuMetricTimestamp, UnitType::kMetricTSClockCounter,
                 tbl.m_system_clock_counter, "system_clock_counter");

  // Throttle Info
  publish_metric(ClassId::kGpuMetricThrottleStatus, UnitType::kMetricThrottleStatus,
                 tbl.m_throttle_status, "throttle_status");

  // GfxLock Info
  publish_metric(ClassId::kGpuMetricGfxClkLockStatus, UnitType::kMetricGfxClkLockStatus,
                 tbl.m_gfxclk_lock_status, "gfxclk_lock_status");

  // Link/Width/Speed Info
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieLinkWidth,
                 tbl.m_pcie_link_width, "pcie_link_width");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieLinkSpeed,
                 tbl.m_pcie_link_speed, "pcie_link_speed");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricXgmiLinkWidth,
                 tbl.m_xgmi_link_width, "xgmi_link_width");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricXgmiLinkSpeed,
                 tbl.m_xgmi_link_speed, "xgmi_link_speed");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieBandwidthAccumulator,
                 tbl.m_pcie_bandwidth_acc, "pcie_bandwidth_acc");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieBandwidthInst,
                 tbl.m_pcie_bandwidth_inst, "pcie_bandwidth_inst");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieL0RecovCountAccumulator,
                 tbl.m_pcie_l0_to_recov_count_acc, "pcie_l0_recov_count_acc");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieReplayCountAccumulator,
                 tbl.m_pcie_replay_count_acc, "pcie_replay_count_acc");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieReplayRollOverCountAccumulator,
                 tbl.m_pcie_replay_rover_count_acc, "pcie_replay_rollover_count_acc");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieNakSentCountAccumulator,
                 tbl.m_pcie_nak_sent_count_acc, "pcie_nak_sent_count_acc");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricPcieNakReceivedCountAccumulator,
                 tbl.m_pcie_nak_rcvd_count_acc, "pcie_nak_rcvd_count_acc");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricXgmiReadDataAccumulator,
                 tbl.m_xgmi_read_data_acc, "[xgmi_read_data_acc]");
  publish_metric(ClassId::kGpuMetricLinkWidthSpeed, UnitType::kMetricXgmiWriteDataAccumulator,
                 tbl.m_xgmi_write_data_acc, "[xgmi_write_data_acc]");

  // CurrentClock Info
  publish_metric(ClassId::kGpuMetricCurrentClock, UnitType::kMetricCurrGfxClock,
                 tbl.m_current_gfxclk, "[current_gfxclk]");
  publish_metric(ClassId::kGpuMetricCurrentClock, UnitType::kMetricCurrSocClock,
                 tbl.m_current_socclk, "[current_socclk]");
  publish_metric(ClassId::kGpuMetricCurrentClock, UnitType::kMetricCurrVClock0,
                 tbl.m_current_vclk0, "[current_vclk0]");
  publish_metric(ClassId::kGpuMetricCurrentClock, UnitType::kMetricCurrDClock0,
                 tbl.m_current_dclk0, "[current_dclk0]");
  publish_metric(ClassId::kGpuMetricCurrentClock, UnitType::kMetricCurrUClock,
                 tbl.m_current_uclk, "current_uclk");

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= end ======= "
             << " | Success "
             << " | Returning = " << getRSMIStatusString(status_code)
             << " |";
  LOG_TRACE(ostrstream);

  return status_code;
}
void GpuMetricsBase_v14_t::dump_internal_metrics_table()
{
std::ostringstream ostrstream;
@@ -827,6 +1167,10 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
std::end(rsmi_gpu_metrics.vcn_activity),
init_max_uint_types<std::uint16_t>());
std::fill(std::begin(rsmi_gpu_metrics.jpeg_activity),
std::end(rsmi_gpu_metrics.jpeg_activity),
init_max_uint_types<std::uint16_t>());
rsmi_gpu_metrics.gfxclk_lock_status = init_max_uint_types<decltype(rsmi_gpu_metrics.gfxclk_lock_status)>();
rsmi_gpu_metrics.xgmi_link_width = init_max_uint_types<decltype(rsmi_gpu_metrics.xgmi_link_width)>();
rsmi_gpu_metrics.xgmi_link_speed = init_max_uint_types<decltype(rsmi_gpu_metrics.xgmi_link_speed)>();
@@ -836,35 +1180,33 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
rsmi_gpu_metrics.pcie_replay_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_replay_count_acc)>();
rsmi_gpu_metrics.pcie_replay_rover_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_replay_rover_count_acc)>();
std::fill_n(&rsmi_gpu_metrics.xgmi_read_data_acc[0],
(sizeof(rsmi_gpu_metrics.xgmi_read_data_acc) /
sizeof(rsmi_gpu_metrics.xgmi_read_data_acc[0])),
std::numeric_limits<uint64_t>::max());
std::fill(std::begin(rsmi_gpu_metrics.xgmi_read_data_acc),
std::end(rsmi_gpu_metrics.xgmi_read_data_acc),
init_max_uint_types<std::uint64_t>());
std::fill_n(&rsmi_gpu_metrics.xgmi_write_data_acc[0],
(sizeof(rsmi_gpu_metrics.xgmi_write_data_acc) /
sizeof(rsmi_gpu_metrics.xgmi_write_data_acc[0])),
std::numeric_limits<uint64_t>::max());
std::fill(std::begin(rsmi_gpu_metrics.xgmi_write_data_acc),
std::end(rsmi_gpu_metrics.xgmi_write_data_acc),
init_max_uint_types<std::uint64_t>());
std::fill_n(&rsmi_gpu_metrics.current_gfxclks[0],
(sizeof(rsmi_gpu_metrics.current_gfxclks) /
sizeof(rsmi_gpu_metrics.current_gfxclks[0])),
std::numeric_limits<uint16_t>::max());
std::fill(std::begin(rsmi_gpu_metrics.current_gfxclks),
std::end(rsmi_gpu_metrics.current_gfxclks),
init_max_uint_types<std::uint16_t>());
std::fill_n(&rsmi_gpu_metrics.current_socclks[0],
(sizeof(rsmi_gpu_metrics.current_socclks) /
sizeof(rsmi_gpu_metrics.current_socclks[0])),
std::numeric_limits<uint16_t>::max());
std::fill(std::begin(rsmi_gpu_metrics.current_socclks),
std::end(rsmi_gpu_metrics.current_socclks),
init_max_uint_types<std::uint16_t>());
std::fill_n(&rsmi_gpu_metrics.current_vclk0s[0],
(sizeof(rsmi_gpu_metrics.current_vclk0s) /
sizeof(rsmi_gpu_metrics.current_vclk0s[0])),
std::numeric_limits<uint16_t>::max());
std::fill(std::begin(rsmi_gpu_metrics.current_vclk0s),
std::end(rsmi_gpu_metrics.current_vclk0s),
init_max_uint_types<std::uint16_t>());
std::fill(std::begin(rsmi_gpu_metrics.current_dclk0s),
std::end(rsmi_gpu_metrics.current_dclk0s),
init_max_uint_types<std::uint16_t>());
rsmi_gpu_metrics.pcie_nak_sent_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_sent_count_acc)>();
rsmi_gpu_metrics.pcie_nak_rcvd_count_acc = init_max_uint_types<decltype(rsmi_gpu_metrics.pcie_nak_rcvd_count_acc)>();
std::fill_n(&rsmi_gpu_metrics.current_dclk0s[0],
(sizeof(rsmi_gpu_metrics.current_dclk0s) /
sizeof(rsmi_gpu_metrics.current_dclk0s[0])),
std::numeric_limits<uint16_t>::max());
ostrstream << __PRETTY_FUNCTION__
<< " | ======= end ======= "
@@ -876,6 +1218,195 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m
return status_code;
}
//
// Translates the internal metrics table (m_gpu_metrics_tbl) into the public
// AMGpuMetricsPublicLatest_t layout.
//
// The public structure is first passed through init_max_public_gpu_matrics(),
// which sets its fields to their max values; only the fields this table
// provides are then overwritten. A field still holding its max value
// therefore means "no data was assigned".
//
// Returns a tuple of (status code, populated public metrics). The status code
// is never modified on this path, so it is always RSMI_STATUS_SUCCESS.
//
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v15_t::copy_internal_to_external_metrics()
{
  std::ostringstream ostrstream;
  auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS);
  ostrstream << __PRETTY_FUNCTION__ << " | ======= start =======";
  LOG_TRACE(ostrstream);

  // Short aliases: 'src' is the internal table, 'dst' the public structure.
  const auto& src = m_gpu_metrics_tbl;
  AMGpuMetricsPublicLatest_t dst{};

  // Start from "all fields unset" (max values) before copying anything over.
  init_max_public_gpu_matrics(dst);

  // --- Common header ---
  dst.common_header.structure_size = src.m_common_header.m_structure_size;
  dst.common_header.format_revision = src.m_common_header.m_format_revision;
  dst.common_header.content_revision = src.m_common_header.m_content_revision;

  // --- Temperature ---
  dst.temperature_hotspot = src.m_temperature_hotspot;
  dst.temperature_mem = src.m_temperature_mem;
  dst.temperature_vrsoc = src.m_temperature_vrsoc;

  // --- Power ---
  dst.current_socket_power = src.m_current_socket_power;

  // --- Utilization ---
  dst.average_gfx_activity = src.m_average_gfx_activity;
  dst.average_umc_activity = src.m_average_umc_activity;

  // Per-instance activity arrays: every element of the internal array is
  // copied to the front of the matching public array.
  std::copy(std::begin(src.m_vcn_activity),
            std::end(src.m_vcn_activity),
            dst.vcn_activity);
  std::copy(std::begin(src.m_jpeg_activity),
            std::end(src.m_jpeg_activity),
            dst.jpeg_activity);

  // --- Power/Energy ---
  dst.energy_accumulator = src.m_energy_accumulator;

  // --- Driver attached timestamp (in ns) ---
  dst.system_clock_counter = src.m_system_clock_counter;

  // --- Throttle status ---
  dst.throttle_status = src.m_throttle_status;

  // --- Clock lock status; each bit corresponds to a clock instance ---
  dst.gfxclk_lock_status = src.m_gfxclk_lock_status;

  // --- PCIe link width (number of lanes) and speed ---
  dst.pcie_link_width = src.m_pcie_link_width;
  dst.pcie_link_speed = src.m_pcie_link_speed;

  // --- XGMI bus width and bitrate ---
  dst.xgmi_link_width = src.m_xgmi_link_width;
  dst.xgmi_link_speed = src.m_xgmi_link_speed;

  // --- Accumulated utilization ---
  dst.gfx_activity_acc = src.m_gfx_activity_acc;
  dst.mem_activity_acc = src.m_mem_activity_acc;

  // --- PCIe counters ---
  // Accumulated and instantaneous bandwidth.
  dst.pcie_bandwidth_acc = src.m_pcie_bandwidth_acc;
  dst.pcie_bandwidth_inst = src.m_pcie_bandwidth_inst;
  // L0-to-recovery state transitions (accumulated).
  dst.pcie_l0_to_recov_count_acc = src.m_pcie_l0_to_recov_count_acc;
  // Replays and replay rollovers (accumulated).
  dst.pcie_replay_count_acc = src.m_pcie_replay_count_acc;
  dst.pcie_replay_rover_count_acc = src.m_pcie_replay_rover_count_acc;
  // NAKs sent and received (accumulated).
  dst.pcie_nak_sent_count_acc = src.m_pcie_nak_sent_count_acc;
  dst.pcie_nak_rcvd_count_acc = src.m_pcie_nak_rcvd_count_acc;

  // --- XGMI accumulated data transfer size (read, then write) ---
  std::copy(std::begin(src.m_xgmi_read_data_acc),
            std::end(src.m_xgmi_read_data_acc),
            dst.xgmi_read_data_acc);
  std::copy(std::begin(src.m_xgmi_write_data_acc),
            std::end(src.m_xgmi_write_data_acc),
            dst.xgmi_write_data_acc);

  // --- PMFW attached timestamp (10ns resolution) ---
  dst.firmware_timestamp = src.m_firmware_timestamp;

  // --- Current clocks (per-instance arrays, then the single uclk value) ---
  std::copy(std::begin(src.m_current_gfxclk),
            std::end(src.m_current_gfxclk),
            dst.current_gfxclks);
  std::copy(std::begin(src.m_current_socclk),
            std::end(src.m_current_socclk),
            dst.current_socclks);
  std::copy(std::begin(src.m_current_vclk0),
            std::end(src.m_current_vclk0),
            dst.current_vclk0s);
  std::copy(std::begin(src.m_current_dclk0),
            std::end(src.m_current_dclk0),
            dst.current_dclk0s);
  dst.current_uclk = src.m_current_uclk;

  // --- Backwards compatibility with earlier versions (1.3) ---
  // The scalar clock fields are filled from the per-instance arrays copied
  // above: element 0 feeds the "...0"/unsuffixed field and element 1 feeds
  // the "...1" field. The matching average_*_frequency fields are
  // deliberately not assigned here and keep their initialized value.
  dst.current_gfxclk = dst.current_gfxclks[0];
  dst.current_socclk = dst.current_socclks[0];
  dst.current_vclk0 = dst.current_vclk0s[0];
  dst.current_vclk1 = dst.current_vclk0s[1];
  dst.current_dclk0 = dst.current_dclk0s[0];
  dst.current_dclk1 = dst.current_dclk0s[1];

  ostrstream << __PRETTY_FUNCTION__
             << " | ======= end ======= "
             << " | Success "
             << " | Returning = " << getRSMIStatusString(status_code)
             << " |";
  LOG_TRACE(ostrstream);

  return std::make_tuple(status_code, dst);
}
AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v14_t::copy_internal_to_external_metrics()
{
@@ -2154,11 +2685,9 @@ rsmi_status_t Device::dev_read_gpu_metrics_header_data()
// Check if/when metrics table needs to be refreshed.
auto now_ts = actual_timestamp_in_secs();
if (((!m_gpu_metrics_header.m_structure_size) ||
(!m_gpu_metrics_header.m_format_revision) ||
(!m_gpu_metrics_header.m_content_revision)) ||
((now_ts - m_gpu_metrics_updated_timestamp) >=
kRSMI_GPU_METRICS_EXPIRATION_SECS)) {
if ((!m_gpu_metrics_header.m_structure_size) ||
(!m_gpu_metrics_header.m_format_revision) ||
(!m_gpu_metrics_header.m_content_revision)) {
auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics,
sizeof(AMDGpuMetricsHeader_v1_t),
&m_gpu_metrics_header);
@@ -2617,7 +3146,7 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met
<< " | Returning = "
<< getRSMIStatusString(status_code)
<< " |";
LOG_ERROR(ostrstream);
LOG_TRACE(ostrstream);
return status_code;
}
}
@@ -2727,7 +3256,7 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit
<< " | Returning = "
<< getRSMIStatusString(status_code)
<< " |";
LOG_ERROR(ostrstream);
LOG_TRACE(ostrstream);
return status_code;
}
+3 -5
Féach ar an gComhad
@@ -52,8 +52,8 @@
#include <fstream>
#include <functional>
#include <iostream>
#include <algorithm>
#include <memory>
#include <algorithm>
#include <set>
#include <sstream>
#include <string>
@@ -391,10 +391,6 @@ RocmSMI::Initialize(uint64_t flags) {
<< "\n | final update: device->bdfid() holds correct device bdf";
LOG_TRACE(ss);
}
if (ret != 0) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"Failed to initialize rocm_smi library (amdgpu node discovery).");
}
std::shared_ptr<amd::smi::Device> dev;
// Sort index based on the BDF, collect BDF id firstly.
@@ -437,6 +433,7 @@ RocmSMI::Initialize(uint64_t flags) {
for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++)
io_link_map_[it->first] = it->second;
// Remove any drm nodes that don't have a corresponding readable kfd node.
// kfd nodes will not be added if their properties file is not readable.
auto dev_iter = devices_.begin();
@@ -480,6 +477,7 @@ RocmSMI::Initialize(uint64_t flags) {
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
logSystemDetails();
}
// Leaving below to help debug temp file issues
// displayAppTmpFilesContent();
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
+5 -2
Féach ar an gComhad
@@ -42,11 +42,14 @@
*/
#define _GNU_SOURCE 1 // REQUIRED: to utilize some GNU features/functions, see
// _GNU_SOURCE functions which check
#include <assert.h>
#include <errno.h>
#include <sys/stat.h>
#include <unistd.h>
#include <dirent.h>
#include <dlfcn.h>
#include <glob.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <dlfcn.h>
#include <algorithm>
#include <cassert>