SWDEV-518214: GPU Metrics 1.8 (#31)

* SWDEV-518214: GPU Metrics 1.8 (#31)

- Updates:
    - Adding the following metrics to allow new calculations for violation status:
        - Per-XCP metric gfx_below_host_limit_ppt_acc
        - Per-XCP metric gfx_below_host_limit_thm_acc
        - Per-XCP metric gfx_low_utilization_acc
        - Per-XCP metric gfx_below_host_limit_total_acc
    - Increasing available JPEG engines to 40. Current ASICs may not support all 40; unsupported engines will be indicated as UINT16_MAX, or N/A in the CLI.

Signed-off-by: Juan Castillo <juan.castillo@amd.com>
Co-authored-by: Charis Poag <Charis.Poag@amd.com>

[ROCm/rocm_smi_lib commit: f69e65f7bd]
Этот коммит содержится в:
Castillo, Juan
2025-03-20 18:07:32 -05:00
коммит произвёл GitHub
родитель 6075f89576
Коммит 3aa80ec0e4
8 изменённых файлов: 954 добавлений и 785 удалений
+57 -1
Просмотреть файл
@@ -3795,9 +3795,41 @@ def showGPUMetrics(deviceList):
},
"xcp_stats.gfx_busy_acc": {
"value": gpu_metrics.xcp_stats,
"unit": percent_unit,
"unit": count,
},
"xcp_stats.gfx_below_host_limit_acc": {
"value": gpu_metrics.xcp_stats,
"unit": count,
},
"xcp_stats.gfx_below_host_limit_ppt_acc": {
"value": gpu_metrics.xcp_stats,
"unit": count,
},
"xcp_stats.gfx_below_host_limit_thm_acc": {
"value": gpu_metrics.xcp_stats,
"unit": count,
},
"xcp_stats.gfx_low_utilization_acc": {
"value": gpu_metrics.xcp_stats,
"unit": count,
},
"xcp_stats.gfx_below_host_limit_total_acc": {
"value": gpu_metrics.xcp_stats,
"unit": count,
},
"xcp_stats.gfx_below_host_limit_ppt_acc": {
"value": gpu_metrics.xcp_stats,
"unit": percent_unit,
},
"xcp_stats.gfx_below_host_limit_thm_acc": {
"value": gpu_metrics.xcp_stats,
"unit": percent_unit,
},
"xcp_stats.gfx_low_utilization_acc": {
"value": gpu_metrics.xcp_stats,
"unit": percent_unit,
},
"xcp_stats.gfx_below_host_limit_total_acc": {
"value": gpu_metrics.xcp_stats,
"unit": percent_unit,
},
@@ -3841,6 +3873,30 @@ def showGPUMetrics(deviceList):
for _, val in enumerate(item.gfx_below_host_limit_acc):
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
if 'xcp_stats.gfx_below_host_limit_ppt_acc' in k:
for curr_xcp, item in enumerate(v['value']):
print_xcp_detail = []
for _, val in enumerate(item.gfx_below_host_limit_ppt_acc):
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
if 'xcp_stats.gfx_below_host_limit_thm_acc' in k:
for curr_xcp, item in enumerate(v['value']):
print_xcp_detail = []
for _, val in enumerate(item.gfx_below_host_limit_thm_acc):
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
if 'xcp_stats.gfx_low_utilization_acc' in k:
for curr_xcp, item in enumerate(v['value']):
print_xcp_detail = []
for _, val in enumerate(item.gfx_low_utilization_acc):
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
if 'xcp_stats.gfx_below_host_limit_total_acc' in k:
for curr_xcp, item in enumerate(v['value']):
print_xcp_detail = []
for _, val in enumerate(item.gfx_below_host_limit_total_acc):
print_xcp_detail.append(validateIfMaxUint(val, UIntegerTypes.UINT64_T))
printLog(device, k + " (" + str(v["unit"]) + ")", str(print_xcp_detail), xcp=str(curr_xcp))
if int(device) < (len(deviceList) - 1):
printLogSpacer()
+5 -1
Просмотреть файл
@@ -662,10 +662,14 @@ class amdgpu_xcp_metrics_t(Structure):
# amdgpu_xcp_metrics_t._pack_ = 1 # source:False
# ctypes field layout for amdgpu_xcp_metrics_t: a list of
# (field name, C array type) pairs assigned to the structure's _fields_.
# NOTE(review): this span looks like a diff hunk rendered without its +/-
# markers, so the two 'jpeg_busy' entries below are presumably the removed
# (c_uint16 * 32) and added (c_uint16 * 40) versions of a single line --
# confirm against the commit before treating this list as the real layout.
amdgpu_xcp_metrics_t._fields_ = [
('gfx_busy_inst', c_uint32 * 8),
('jpeg_busy', c_uint16 * 32),
('jpeg_busy', c_uint16 * 40),
('vcn_busy', c_uint16 * 4),
('gfx_busy_acc', c_uint64 * 8),
('gfx_below_host_limit_acc', c_uint64 * 8),
# The next four fields carry the same names as the per-XCP metrics that the
# commit message lists as newly added; each is an array of 8 c_uint64.
('gfx_below_host_limit_ppt_acc', c_uint64 * 8),
('gfx_below_host_limit_thm_acc', c_uint64 * 8),
('gfx_low_utilization_acc', c_uint64 * 8),
('gfx_below_host_limit_total_acc', c_uint64 * 8),
]
xcp_stats_t = amdgpu_xcp_metrics_t