GPU Usage/activity update

CLI:
Every usage field is now labeled "activity":
gfx_usage -> gfx_activity
umc_usage -> umc_activity
vcn_activities -> vcn_activity
jpeg_activities[AID#] -> jpeg_activity

Wrapper: fixed metric output that was misaligned
with the generator

update_wrapper.sh:
Default DOCKER_BUILDKIT to 0 (if unset)

API:
amdsmi_get_gpu_metrics_info:
1.3: Restored avg socket power (it was previously commented out)

Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Change-Id: Id3fcc20aef420c7b7a90ba22fa3bc643b2716333


[ROCm/amdsmi commit: 4575990ae7]
Šī revīzija ir iekļauta:
Charis Poag
2024-01-15 23:14:44 -06:00
vecāks 28f354796d
revīzija 23a0cb827f
6 mainīti faili ar 23 papildinājumiem un 29 dzēšanām
@@ -1122,14 +1122,11 @@ class AMDSMICommands():
if args.usage:
try:
engine_usage = amdsmi_interface.amdsmi_get_gpu_activity(args.gpu)
engine_usage['gfx_usage'] = engine_usage.pop('gfx_activity')
engine_usage['mem_usage'] = engine_usage.pop('umc_activity')
engine_usage['mm_ip_usage'] = engine_usage.pop('mm_activity')
engine_usage['vcn_activities'] = gpu_metric_output.pop('vcn_activity')
engine_usage['jpeg_activities[AID0]'] = gpu_metric_output.pop('jpeg_activities[AID0]')
engine_usage['jpeg_activities[AID1]'] = gpu_metric_output.pop('jpeg_activities[AID1]')
engine_usage['jpeg_activities[AID2]'] = gpu_metric_output.pop('jpeg_activities[AID2]')
engine_usage['jpeg_activities[AID3]'] = gpu_metric_output.pop('jpeg_activities[AID3]')
engine_usage['gfx_activity'] = engine_usage.pop('gfx_activity')
engine_usage['umc_activity'] = engine_usage.pop('umc_activity')
engine_usage['mm_activity'] = engine_usage.pop('mm_activity')
engine_usage['vcn_activity'] = gpu_metric_output.pop('vcn_activity')
engine_usage['jpeg_activity'] = gpu_metric_output.pop('jpeg_activity')
for key, value in engine_usage.items():
if not isinstance(value, list) and value > 100:
engine_usage[key] = "N/A"
+3 -3
Parādīt failu
@@ -2126,8 +2126,8 @@ Output: Dictionary with fields
`temperature_vrsoc` | vrsoc temperature value | Celsius (C)
`temperature_vrmem` | vrmem temperature value | Celsius (C)
`average_gfx_activity` | Average gfx activity | %
`average_umc_activity` | Average umc activity | %
`average_mm_activity` | Average mm activity | %
`average_umc_activity` | Average umc (Universal Memory Controller) activity | %
`average_mm_activity` | Average mm (multimedia) engine activity | %
`average_socket_power` | Average socket power | W
`energy_accumulator` | Energy accumulated with a 15.3 uJ resolution over 1ns | uJ
`system_clock_counter` | System clock counter | ns
@@ -2178,7 +2178,7 @@ Output: Dictionary with fields
`mem_max_bandwidth` | Maximum memory bandwidth usage accumulated | GB/s
`pcie_nak_sent_count_acc` | PCIe NAK sent count accumulated |
`pcie_nak_rcvd_count_acc` | PCIe NAK received count accumulated |
`jpeg_activitys[AID<X>]` | List of JPEG engine activity for each AID (X=0-3) | %
`jpeg_activity` | List of JPEG engine activity | %
Exceptions that can be thrown by `amdsmi_get_gpu_metrics_info` function:
@@ -3242,10 +3242,7 @@ def amdsmi_get_gpu_metrics_info(
"mem_max_bandwidth": gpu_metrics.mem_max_bandwidth,
"pcie_nak_sent_count_acc": gpu_metrics.pcie_nak_sent_count_acc,
"pcie_nak_rcvd_count_acc": gpu_metrics.pcie_nak_rcvd_count_acc,
"jpeg_activities[AID0]": list(gpu_metrics.jpeg_activities)[:8],
"jpeg_activities[AID1]": list(gpu_metrics.jpeg_activities)[8:16],
"jpeg_activities[AID2]": list(gpu_metrics.jpeg_activities)[16:24],
"jpeg_activities[AID3]": list(gpu_metrics.jpeg_activities)[24:32],
"jpeg_activity": list(gpu_metrics.jpeg_activity),
}
@@ -906,7 +906,6 @@ amdsmi_clk_info_t = struct_amdsmi_clk_info_t
class struct_amdsmi_engine_usage_t(Structure):
pass
struct_amdsmi_engine_usage_t._pack_ = 1 # source:False
struct_amdsmi_engine_usage_t._fields_ = [
('gfx_activity', ctypes.c_uint32),
@@ -920,6 +919,16 @@ amdsmi_process_handle_t = ctypes.c_uint32
class struct_amdsmi_proc_info_t(Structure):
pass
class struct_engine_usage_(Structure):
pass
struct_engine_usage_._pack_ = 1 # source:False
struct_engine_usage_._fields_ = [
('gfx', ctypes.c_uint64),
('enc', ctypes.c_uint64),
('reserved', ctypes.c_uint32 * 12),
]
class struct_memory_usage_(Structure):
pass
@@ -931,16 +940,6 @@ struct_memory_usage_._fields_ = [
('reserved', ctypes.c_uint32 * 10),
]
class struct_engine_usage_(Structure):
pass
struct_engine_usage_._pack_ = 1 # source:False
struct_engine_usage_._fields_ = [
('gfx', ctypes.c_uint64),
('enc', ctypes.c_uint64),
('reserved', ctypes.c_uint32 * 12),
]
struct_amdsmi_proc_info_t._pack_ = 1 # source:False
struct_amdsmi_proc_info_t._fields_ = [
('name', ctypes.c_char * 32),
@@ -1527,11 +1526,12 @@ struct_amdsmi_gpu_metrics_t._fields_ = [
('current_socclks', ctypes.c_uint16 * 4),
('current_vclk0s', ctypes.c_uint16 * 4),
('current_dclk0s', ctypes.c_uint16 * 4),
('jpeg_activity', ctypes.c_uint16 * 32),
('mem_bandwidth_acc', ctypes.c_uint64),
('mem_max_bandwidth', ctypes.c_uint32),
('pcie_nak_sent_count_acc', ctypes.c_uint32),
('pcie_nak_rcvd_count_acc', ctypes.c_uint32),
('jpeg_activities', ctypes.c_uint16 * 32),
('PADDING_4', ctypes.c_ubyte * 4),
]
amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t
@@ -1413,7 +1413,7 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v13_t::copy_internal_to_external_m
metrics_public_init.average_mm_activity = m_gpu_metrics_tbl.m_average_mm_activity;
// Power/Energy
// metrics_public_init.average_socket_power = m_gpu_metrics_tbl.m_average_socket_power; // 1.3 and 1.4 have the same value
metrics_public_init.average_socket_power = m_gpu_metrics_tbl.m_average_socket_power; // 1.3 and 1.4 have the same value
metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator;
// Driver attached timestamp (in ns)
+1 -1
Parādīt failu
@@ -27,7 +27,7 @@ if ! does_image_exist; then
# docker pull dmitriigalantsev/amdsmi_wrapper_updater
echo "No docker image found! Generating one"
# set to 0 because it's compatible with more systems
DOCKER_BUILDKIT="${DOCKER_BUILDKIT:0}" docker build "$DIR/py-interface" -t "$DOCKER_NAME":latest
DOCKER_BUILDKIT="${DOCKER_BUILDKIT:=0}" docker build "$DIR/py-interface" -t "$DOCKER_NAME":latest
fi
ENABLE_ESMI_LIB=""