2
0

[SWDEV-502744] Fix "amd-smi monitor" shows VCN ENC utilization & clock but not VCN DEC

Reason for this fix:
Navi products use vclk and dclk for both encode and decode.
On MI products, only decode is supported.
Navi products cannot support displaying ENC_UTIL % at this time.

Change-Id: I107bb761794ae4724949ac21c110b23a4f616700
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: d323ecff97]
Este cometimento está contido em:
Charis Poag
2024-12-06 12:18:21 -06:00
ascendente cdecce3658
cometimento 44adc457b2
3 ficheiros modificados com 135 adições e 29 eliminações
+80
Ver ficheiro
@@ -23,6 +23,86 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Upcoming changes
### Known issues
## amd_smi_lib for ROCm 6.3.1
### Added
### Changed
- **Changed `amd-smi monitor`: `ENC_CLOCK`/`DEC_CLOCK` are no longer displayed; `VCLOCK` and `DCLOCK` are shown instead**.
This change was needed because of the fix described under `Resolved issues`.
Reason: Navi products use vclk and dclk for both encode and decode. On MI products, only decode is supported.
Before:
```shell
$ amd-smi monitor -n -d
GPU ENC_UTIL ENC_CLOCK DEC_UTIL DEC_CLOCK
0 0.0 % 29 MHz N/A 22 MHz
1 0.0 % 29 MHz N/A 22 MHz
2 0.0 % 29 MHz N/A 22 MHz
3 0.0 % 29 MHz N/A 22 MHz
4 0.0 % 29 MHz N/A 22 MHz
5 0.0 % 29 MHz N/A 22 MHz
6 0.0 % 29 MHz N/A 22 MHz
7 0.0 % 29 MHz N/A 22 MHz
```
After:
```shell
$ amd-smi monitor -n -d
GPU ENC_UTIL DEC_UTIL VCLOCK DCLOCK
0 N/A 0.0 % 29 MHz 22 MHz
1 N/A 0.0 % 29 MHz 22 MHz
2 N/A 0.0 % 29 MHz 22 MHz
3 N/A 0.0 % 29 MHz 22 MHz
4 N/A 0.0 % 29 MHz 22 MHz
5 N/A 0.0 % 29 MHz 22 MHz
6 N/A 0.0 % 29 MHz 22 MHz
7 N/A 0.0 % 29 MHz 22 MHz
```
### Removed
### Optimized
### Resolved issues
- **Fixed `amd-smi monitor`'s encode/decode reporting: VCN activity is now reported under `DEC_UTIL` instead of `ENC_UTIL`, and `VCLOCK`/`DCLOCK` are now associated with both encode and decode**.
Navi products use vclk and dclk for both encode and decode. On MI products, only decode is supported.
Displaying `ENC_UTIL` % is not supported on Navi products at this time.
Before:
```shell
$ amd-smi monitor -n -d
GPU ENC_UTIL ENC_CLOCK DEC_UTIL DEC_CLOCK
0 0.0 % 29 MHz N/A 22 MHz
1 0.0 % 29 MHz N/A 22 MHz
2 0.0 % 29 MHz N/A 22 MHz
3 0.0 % 29 MHz N/A 22 MHz
4 0.0 % 29 MHz N/A 22 MHz
5 0.0 % 29 MHz N/A 22 MHz
6 0.0 % 29 MHz N/A 22 MHz
7 0.0 % 29 MHz N/A 22 MHz
```
After:
```shell
$ amd-smi monitor -n -d
GPU ENC_UTIL DEC_UTIL VCLOCK DCLOCK
0 N/A 0.0 % 29 MHz 22 MHz
1 N/A 0.0 % 29 MHz 22 MHz
2 N/A 0.0 % 29 MHz 22 MHz
3 N/A 0.0 % 29 MHz 22 MHz
4 N/A 0.0 % 29 MHz 22 MHz
5 N/A 0.0 % 29 MHz 22 MHz
6 N/A 0.0 % 29 MHz 22 MHz
7 N/A 0.0 % 29 MHz 22 MHz
```
### Upcoming changes
### Known issues
## amd_smi_lib for ROCm 6.3.0
### Added
+52 -28
Ver ficheiro
@@ -4874,9 +4874,10 @@ class AMDSMICommands():
self.logger.table_header += 'MEM_CLOCK'.rjust(11)
if args.encoder:
# TODO: The encoding utilization is in progress for Navi. Note: MI3x ASICs only support decoding.
try:
# Get List of vcn activity values
encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity']
encoder_util = "N/A" # Not yet implemented
encoding_activity_avg = []
for value in encoder_util:
if isinstance(value, int):
@@ -4903,49 +4904,72 @@ class AMDSMICommands():
self.logger.table_header += 'ENC_UTIL'.rjust(10)
try:
encoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0']
monitor_values['encoder_clock'] = encoder_clock
freq_unit = 'MHz'
if encoder_clock != "N/A":
if self.logger.is_human_readable_format():
monitor_values['encoder_clock'] = f"{monitor_values['encoder_clock']} {freq_unit}"
if self.logger.is_json_format():
monitor_values['encoder_clock'] = {"value" : monitor_values['encoder_clock'],
"unit" : freq_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['encoder_clock'] = "N/A"
logging.debug("Failed to get encoder clock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'ENC_CLOCK'.rjust(11)
if args.decoder:
try:
decoder_util = "N/A" # Not yet implemented
monitor_values['decoder'] = decoder_util
# if self.logger.is_human_readable_format():
# monitor_values['decoder'] = f"{monitor_values['decoder']} %"
# Get List of vcn activity values
# Note: MI3x ASICs only support decoding, so the vcn_activity is used for decoding activity.
decoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity']
decoding_activity_avg = []
for value in decoder_util:
if isinstance(value, int):
decoding_activity_avg.append(value)
# Averaging the possible decoding activity values
if decoding_activity_avg:
decoding_activity_avg = sum(decoding_activity_avg) / len(decoding_activity_avg)
else:
decoding_activity_avg = "N/A"
monitor_values['decoder'] = decoding_activity_avg
activity_unit = '%'
if monitor_values['decoder'] != "N/A":
if self.logger.is_human_readable_format():
monitor_values['decoder'] = f"{monitor_values['decoder']} {activity_unit}"
if self.logger.is_json_format():
monitor_values['decoder'] = {"value" : monitor_values['decoder'],
"unit" : activity_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['decoder'] = "N/A"
logging.debug("Failed to get decoder utilization on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'DEC_UTIL'.rjust(10)
if args.encoder or args.decoder:
try:
decoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0']
monitor_values['decoder_clock'] = decoder_clock
vclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0']
monitor_values['vclock'] = vclock
freq_unit = 'MHz'
if decoder_clock != "N/A":
if vclock != "N/A":
if self.logger.is_human_readable_format():
monitor_values['decoder_clock'] = f"{monitor_values['decoder_clock']} {freq_unit}"
monitor_values['vclock'] = f"{monitor_values['vclock']} {freq_unit}"
if self.logger.is_json_format():
monitor_values['decoder_clock'] = {"value" : monitor_values['decoder_clock'],
monitor_values['vclock'] = {"value" : monitor_values['vclock'],
"unit" : freq_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['decoder_clock'] = "N/A"
logging.debug("Failed to get decoder clock on gpu %s | %s", gpu_id, e.get_error_info())
monitor_values['vclock'] = "N/A"
logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'VCLOCK'.rjust(8)
try:
dclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0']
monitor_values['dclock'] = dclock
freq_unit = 'MHz'
if dclock != "N/A":
if self.logger.is_human_readable_format():
monitor_values['dclock'] = f"{monitor_values['dclock']} {freq_unit}"
if self.logger.is_json_format():
monitor_values['dclock'] = {"value" : monitor_values['dclock'],
"unit" : freq_unit}
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['dclock'] = "N/A"
logging.debug("Failed to get vclock on gpu %s | %s", gpu_id, e.get_error_info())
self.logger.table_header += 'DCLOCK'.rjust(8)
self.logger.table_header += 'DEC_CLOCK'.rjust(11)
if args.ecc:
try:
ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu)
+3 -1
Ver ficheiro
@@ -117,8 +117,10 @@ class AMDSMILogger():
table_values += string_value.rjust(10) + ' '
elif key == 'power_usage':
table_values += string_value.rjust(7)
elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'):
elif key in ('gfx_clock', 'mem_clock', 'vram_used'):
table_values += string_value.rjust(11)
elif key in ('vclock', 'dclock'):
table_values += string_value.rjust(8)
elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw':
table_values += string_value.rjust(12)
elif key in ['pcie_replay']: