From 44adc457b259b2475c96b04e4b1d579738d11ea3 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Fri, 6 Dec 2024 12:18:21 -0600 Subject: [PATCH] [SWDEV-502744] Fix "amd-smi monitor" shows VCN ENC utilization & clock but not VCN DEC Reason for this fix: Navi products use vclk and dclk for both encode and decode. On MI products, only decode is supported. Navi products cannot support displaying ENC_UTIL % at this time. Change-Id: I107bb761794ae4724949ac21c110b23a4f616700 Signed-off-by: Charis Poag [ROCm/amdsmi commit: d323ecff97df232b734bd2a0d5740fe8a1968f84] --- projects/amdsmi/CHANGELOG.md | 80 +++++++++++++++++++ projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 80 ++++++++++++------- projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 4 +- 3 files changed, 135 insertions(+), 29 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 915f52bbd7..e9e7f17f9a 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -23,6 +23,86 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Upcoming changes +### Known issues + +## amd_smi_lib for ROCm 6.3.1 + +### Added + +### Changed + +- **Changed `amd-smi monitor`: No longer display `ENC_CLOCK`/`DEC_CLOCK` but `VCLOCK` and `DCLOCK`**. + Due to fix mentioned in `Resolved Issues`, this change was needed. + Reason: Navi products use vclk and dclk for both encode and decode. On MI products, only decode is supported. 
+ Before: + ```shell + $ amd-smi monitor -n -d + GPU ENC_UTIL ENC_CLOCK DEC_UTIL DEC_CLOCK + 0 0.0 % 29 MHz N/A 22 MHz + 1 0.0 % 29 MHz N/A 22 MHz + 2 0.0 % 29 MHz N/A 22 MHz + 3 0.0 % 29 MHz N/A 22 MHz + 4 0.0 % 29 MHz N/A 22 MHz + 5 0.0 % 29 MHz N/A 22 MHz + 6 0.0 % 29 MHz N/A 22 MHz + 7 0.0 % 29 MHz N/A 22 MHz + ``` + After: + ```shell + $ amd-smi monitor -n -d + GPU ENC_UTIL DEC_UTIL VCLOCK DCLOCK + 0 N/A 0.0 % 29 MHz 22 MHz + 1 N/A 0.0 % 29 MHz 22 MHz + 2 N/A 0.0 % 29 MHz 22 MHz + 3 N/A 0.0 % 29 MHz 22 MHz + 4 N/A 0.0 % 29 MHz 22 MHz + 5 N/A 0.0 % 29 MHz 22 MHz + 6 N/A 0.0 % 29 MHz 22 MHz + 7 N/A 0.0 % 29 MHz 22 MHz + ``` + +### Removed + +### Optimized + +### Resolved issues + +- **Fixed `amd-smi monitor`'s encode/decode: `ENC_UTIL`, `DEC_UTIL`, and now associate `VCLOCK`/`DCLOCK` with both**. + Navi products use vclk and dclk for both encode and decode. On MI products, only decode is supported. + + Navi products cannot support displaying ENC_UTIL % at this time. + + Before: + ```shell + $ amd-smi monitor -n -d + GPU ENC_UTIL ENC_CLOCK DEC_UTIL DEC_CLOCK + 0 0.0 % 29 MHz N/A 22 MHz + 1 0.0 % 29 MHz N/A 22 MHz + 2 0.0 % 29 MHz N/A 22 MHz + 3 0.0 % 29 MHz N/A 22 MHz + 4 0.0 % 29 MHz N/A 22 MHz + 5 0.0 % 29 MHz N/A 22 MHz + 6 0.0 % 29 MHz N/A 22 MHz + 7 0.0 % 29 MHz N/A 22 MHz + ``` + After: + ```shell + $ amd-smi monitor -n -d + GPU ENC_UTIL DEC_UTIL VCLOCK DCLOCK + 0 N/A 0.0 % 29 MHz 22 MHz + 1 N/A 0.0 % 29 MHz 22 MHz + 2 N/A 0.0 % 29 MHz 22 MHz + 3 N/A 0.0 % 29 MHz 22 MHz + 4 N/A 0.0 % 29 MHz 22 MHz + 5 N/A 0.0 % 29 MHz 22 MHz + 6 N/A 0.0 % 29 MHz 22 MHz + 7 N/A 0.0 % 29 MHz 22 MHz + ``` + +### Upcoming changes + +### Known issues + ## amd_smi_lib for ROCm 6.3.0 ### Added diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 109a01e0cd..cf2ba3378a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -4874,9 +4874,10 @@ class 
AMDSMICommands(): self.logger.table_header += 'MEM_CLOCK'.rjust(11) if args.encoder: + # TODO: The encoding utilization is in progress for Navi. Note: MI3x ASICs only support decoding. try: # Get List of vcn activity values - encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity'] + encoder_util = "N/A" # Not yet implemented encoding_activity_avg = [] for value in encoder_util: if isinstance(value, int): @@ -4903,49 +4904,72 @@ class AMDSMICommands(): self.logger.table_header += 'ENC_UTIL'.rjust(10) - try: - encoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0'] - monitor_values['encoder_clock'] = encoder_clock - freq_unit = 'MHz' - if encoder_clock != "N/A": - if self.logger.is_human_readable_format(): - monitor_values['encoder_clock'] = f"{monitor_values['encoder_clock']} {freq_unit}" - if self.logger.is_json_format(): - monitor_values['encoder_clock'] = {"value" : monitor_values['encoder_clock'], - "unit" : freq_unit} - except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['encoder_clock'] = "N/A" - logging.debug("Failed to get encoder clock on gpu %s | %s", gpu_id, e.get_error_info()) - - self.logger.table_header += 'ENC_CLOCK'.rjust(11) if args.decoder: try: - decoder_util = "N/A" # Not yet implemented - monitor_values['decoder'] = decoder_util - # if self.logger.is_human_readable_format(): - # monitor_values['decoder'] = f"{monitor_values['decoder']} %" + # Get List of vcn activity values + # Note: MI3x ASICs only support decoding, so the vcn_activity is used for decoding activity. 
+ decoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity'] + decoding_activity_avg = [] + for value in decoder_util: + if isinstance(value, int): + decoding_activity_avg.append(value) + + # Averaging the possible decoding activity values + if decoding_activity_avg: + decoding_activity_avg = sum(decoding_activity_avg) / len(decoding_activity_avg) + else: + decoding_activity_avg = "N/A" + + monitor_values['decoder'] = decoding_activity_avg + + activity_unit = '%' + if monitor_values['decoder'] != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['decoder'] = f"{monitor_values['decoder']} {activity_unit}" + if self.logger.is_json_format(): + monitor_values['decoder'] = {"value" : monitor_values['decoder'], + "unit" : activity_unit} except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['decoder'] = "N/A" logging.debug("Failed to get decoder utilization on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.table_header += 'DEC_UTIL'.rjust(10) + if args.encoder or args.decoder: try: - decoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0'] - monitor_values['decoder_clock'] = decoder_clock + vclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0'] + monitor_values['vclock'] = vclock freq_unit = 'MHz' - if decoder_clock != "N/A": + if vclock != "N/A": if self.logger.is_human_readable_format(): - monitor_values['decoder_clock'] = f"{monitor_values['decoder_clock']} {freq_unit}" + monitor_values['vclock'] = f"{monitor_values['vclock']} {freq_unit}" if self.logger.is_json_format(): - monitor_values['decoder_clock'] = {"value" : monitor_values['decoder_clock'], + monitor_values['vclock'] = {"value" : monitor_values['vclock'], "unit" : freq_unit} except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['decoder_clock'] = "N/A" - logging.debug("Failed to get decoder clock on gpu %s | %s", gpu_id, e.get_error_info()) + monitor_values['vclock'] = 
"N/A" + logging.debug("Failed to get vclock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'VCLOCK'.rjust(8) + + try: + dclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_dclk0'] + monitor_values['dclock'] = dclock + + freq_unit = 'MHz' + if dclock != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['dclock'] = f"{monitor_values['dclock']} {freq_unit}" + if self.logger.is_json_format(): + monitor_values['dclock'] = {"value" : monitor_values['dclock'], + "unit" : freq_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['dclock'] = "N/A" + logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'DCLOCK'.rjust(8) - self.logger.table_header += 'DEC_CLOCK'.rjust(11) if args.ecc: try: ecc = amdsmi_interface.amdsmi_get_gpu_total_ecc_count(args.gpu) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index d52abdf1fb..9997e7b4c7 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -117,8 +117,10 @@ class AMDSMILogger(): table_values += string_value.rjust(10) + ' ' elif key == 'power_usage': table_values += string_value.rjust(7) - elif key in ('gfx_clock', 'mem_clock', 'encoder_clock', 'decoder_clock', 'vram_used'): + elif key in ('gfx_clock', 'mem_clock', 'vram_used'): table_values += string_value.rjust(11) + elif key in ('vclock', 'dclock'): + table_values += string_value.rjust(8) elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw': table_values += string_value.rjust(12) elif key in ['pcie_replay']: