diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index a92c98e6a9..7c20043bf2 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -3,6 +3,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/projects/amdsmi](https://rocm.docs.amd.com/projects/amdsmi/en/latest/). ***All information listed below is for reference and subject to change.*** + ## amd_smi_lib for ROCm 6.4.0 ### Added @@ -18,18 +19,19 @@ Added temperature violation active or not status to `amd-smi monitor`. TVIOL_ACT - N/A if not supported. Example CLI output: -```shell -$ amd-smi monitor --viol -GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL - 0 100 % 1 % True 0 % 0 % 0 % - 1 100 % 0 % False 0 % 0 % 0 % - 2 100 % 0 % False 0 % 0 % 0 % - 3 100 % 0 % False 0 % 0 % 0 % - 4 100 % 0 % False 0 % 0 % 0 % - 5 100 % 3 % True 0 % 0 % 0 % - 6 100 % 0 % False 0 % 0 % 0 % - 7 100 % 0 % False 0 % 0 % 0 % -``` + + ```shell + $ amd-smi monitor --viol + GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL + 0 100 % 1 % True 0 % 0 % 0 % + 1 100 % 0 % False 0 % 0 % 0 % + 2 100 % 0 % False 0 % 0 % 0 % + 3 100 % 0 % False 0 % 0 % 0 % + 4 100 % 0 % False 0 % 0 % 0 % + 5 100 % 3 % True 0 % 0 % 0 % + 6 100 % 0 % False 0 % 0 % 0 % + 7 100 % 0 % False 0 % 0 % 0 % + ``` - **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`**. Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth: @@ -38,34 +40,38 @@ Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to - `uint64_t gfx_below_host_limit_acc[MAX_NUM_XCC]` - graphics clocks below host limit (per XCP) accumulators. Used for graphic clk below host limit violation status. 
- **Added new API `amdsmi_get_gpu_xgmi_link_status()` and CLI `amd-smi xgmi --link-status`** -New API is defined as: -```C -typedef enum { - AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down - AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up - AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled -} amdsmi_xgmi_link_status_type_t; -typedef struct { - uint32_t total_links; //!< The total links in the status array - amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS]; - uint64_t reserved[7]; -} amdsmi_xgmi_link_status_t; + New API is defined as: -amdsmi_status_t amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_link_status_t *link_status) -``` -Example CLI output: -```shell -$ amd-smi xgmi --link-status + ```C + typedef enum { + AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down + AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up + AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled + } amdsmi_xgmi_link_status_type_t; -XGMI LINK STATUS: - bdf link_status -GPU0 0000:08:00.0 U U U U D U D X -GPU1 0000:44:00.0 U U U U D U D X -... + typedef struct { + uint32_t total_links; //!< The total links in the status array + amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS]; + uint64_t reserved[7]; + } amdsmi_xgmi_link_status_t; -* U:Up D:Down X:Disabled -``` + amdsmi_status_t amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_link_status_t *link_status) + ``` + + Example CLI output: + + ```shell + $ amd-smi xgmi --link-status + + XGMI LINK STATUS: + bdf link_status + GPU0 0000:08:00.0 U U U U D U D X + GPU1 0000:44:00.0 U U U U D U D X + ... + + * U:Up D:Down X:Disabled + ``` - **Added fclk and socclk info to `amd-smi metric -c/--clock`**. fclk and socclk information such as min and max clock have been added to the metric command, in line with all the other clocks. @@ -90,79 +96,80 @@ GPU1 0000:44:00.0 U U U U D U D X - **Added new command `amd-smi set -c/--clock-level`**. 
This new command sets the performance level of the selected clock on the desired GPUs. The command can accept a range of acceptable levels, but will not set the level when a level is beyond the number of frequency levels as show in `amd-smi static -C/--clock`. -```shell -sudo amd-smi set -c sclk 5 6 -GPU: 0 - CLK_LEVEL: Successfully changed sclk perf level(s) to 5, 6 + ```shell + sudo amd-smi set -c sclk 5 6 + GPU: 0 + CLK_LEVEL: Successfully changed sclk perf level(s) to 5, 6 -GPU: 1 - CLK_LEVEL: level(s) 5, 6 is/are greater than performance levels supported for device -``` + GPU: 1 + CLK_LEVEL: level(s) 5, 6 is/are greater than performance levels supported for device + ``` - **Added new command `amd-smi static -C/--clock`**. This new command displays the clock frequency performance levels for the selected GPUs and clocks. -```shell -amd-smi static --clock all -g 0 -GPU: 0 - CLOCK: - SYS: - CURRENT LEVEL: 2 - FREQUENCY_LEVELS: - 0: 300 MHz - 1: 904 MHz - 2: 1165 MHz - 3: 1360 MHz - 4: 1440 MHz - 5: 1544 MHz - 6: 1627 MHz - 7: 1720 MHz - 8: 1800 MHz - MEM: - CURRENT LEVEL: 0 - FREQUENCY_LEVELS: - 0: 167 MHz - DF: - CURRENT LEVEL: 0 - FREQUENCY_LEVELS: - 0: 1400 MHz - SOC: - CURRENT LEVEL: 0 - FREQUENCY_LEVELS: - 0: 302 MHz - DCEF: N/A - VCLK0: N/A - VCLK1: N/A - DCLK0: N/A - DCLK1: N/A -``` + ```shell + amd-smi static --clock all -g 0 + GPU: 0 + CLOCK: + SYS: + CURRENT LEVEL: 2 + FREQUENCY_LEVELS: + 0: 300 MHz + 1: 904 MHz + 2: 1165 MHz + 3: 1360 MHz + 4: 1440 MHz + 5: 1544 MHz + 6: 1627 MHz + 7: 1720 MHz + 8: 1800 MHz + MEM: + CURRENT LEVEL: 0 + FREQUENCY_LEVELS: + 0: 167 MHz + DF: + CURRENT LEVEL: 0 + FREQUENCY_LEVELS: + 0: 1400 MHz + SOC: + CURRENT LEVEL: 0 + FREQUENCY_LEVELS: + 0: 302 MHz + DCEF: N/A + VCLK0: N/A + VCLK1: N/A + DCLK0: N/A + DCLK1: N/A + ``` ### Changed - **AMDSMI Library Version number to reflect changes in backwards compatability**. -Removed Year from AMDSMI Library version number. 
Version changed from 25.2.0.0 (Year.Major.Minor.Patch) to 25.2.0 (Major.Minor.Patch) -Removed year in all version references + - Removed Year from AMDSMI Library version number. + - Version changed from 25.2.0.0 (Year.Major.Minor.Patch) to 25.2.0 (Major.Minor.Patch) + - Removed year in all version references - **Removed initialization requirements for `amdsmi_get_lib_version()` and added `amdsmi_get_rocm_version()` to the python API & CLI**. - **Added an additional argument `sensor_ind` to `amdsmi_get_power_info()`**. -This change breaks previous C API calls and will require a change -Python API now accepts `sensor_ind` as an optional argument, does not imapact previous usage + - This change breaks previous C API calls and will require a change + - Python API now accepts `sensor_ind` as an optional argument, does not impact previous usage - **Depricated enum `AMDSMI_NORMAL_STRING_LENGTH` in favor of `AMDSMI_MAX_STRING_LENGTH`**. - **Changed to use thread local mutex by default**. -Most sysfs reads do not require cross-process level mutex, and writes to sysfs should be protected by the kernel already. -Users can still switch to the old behavior by setting the environment variable `AMDSMI_MUTEX_CROSS_PROCESS=1`. + - Most sysfs reads do not require cross-process level mutex, and writes to sysfs should be protected by the kernel already. + - Users can still switch to the old behavior by setting the environment variable `AMDSMI_MUTEX_CROSS_PROCESS=1`. -- **Changed `amdsmi_vram_vendor_type_t` enum names impacting `amdsmi_vram_info_t` structure**. -This also change impacts usage of the vram_vendor output of `amdsmi_get_gpu_vram_info()` +- **Changed `amdsmi_vram_vendor_type_t` enum names impacting `amdsmi_vram_info_t` structure**. 
+This change also impacts usage of the vram_vendor output of `amdsmi_get_gpu_vram_info()` - **Changed `amdsmi_nps_caps_t` struct impacting `amdsmi_memory_partition_config_t`, `amdsmi_accelerator_partition_t`, `amdsmi_accelerator_partition_profile_config_t`**. - - Functions affected by struct change are: - - `amdsmi_get_gpu_memory_partition_config()` - - `amdsmi_get_gpu_accelerator_partition_profile()` - - `amdsmi_get_gpu_accelerator_partition_profile_config()` +Functions affected by struct change are: + - `amdsmi_get_gpu_memory_partition_config()` + - `amdsmi_get_gpu_accelerator_partition_profile()` + - `amdsmi_get_gpu_accelerator_partition_profile_config()` - **Corrected CLI CPU argument name**. - `--cpu-pwr-svi-telemtry-rails` to `--cpu-pwr-svi-telemetry-rails` @@ -172,16 +179,16 @@ This also change impacts usage of the vram_vendor output of `amdsmi_get_gpu_vram - The amd_hsmp driver version can also be displayed using the `-c` flag. - The new default for the `version` command is to display all the version information, including both amdgpu and amd_hsmp driver versions. 
-```shell -amd-smi version -AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amdgpu version: 6.10.10 | amd_hsmp version: 2.2 + ```shell + amd-smi version + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amdgpu version: 6.10.10 | amd_hsmp version: 2.2 -amd-smi version -g -AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amdgpu version: 6.10.10 + amd-smi version -g + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amdgpu version: 6.10.10 -amd-smi version -c -AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amd_hsmp version: 2.2 -``` + amd-smi version -c + AMDSMI Tool: 24.7.1+b446d6c-dirty | AMDSMI Library version: 24.7.2.0 | ROCm version: N/A | amd_hsmp version: 2.2 + ``` - **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**. - Users can only use one set option at a time now. @@ -307,7 +314,8 @@ Updated structure `amdsmi_vram_info_t`: - **Removed `GFX_BUSY_ACC` from `amd-smi metric --usage`**. Displaying `GFX_BUSY_ACC` does not provide helpful outputs for users. - Old output: + Old output: + ```shell $ amd-smi metric --usage GPU: 0 @@ -328,7 +336,8 @@ Updated structure `amdsmi_vram_info_t`: ... ``` - New Output: + New Output: + ```shell $ amd-smi metric --usage GPU: 0 @@ -347,7 +356,6 @@ Updated structure `amdsmi_vram_info_t`: ... ``` - ### Optimized - **Added additional help information to `amd-smi set --help` command**. @@ -366,34 +374,38 @@ Updated structure `amdsmi_vram_info_t`: - **Converted xgmi read and write from KB's to readable units**. - With this change `amd-smi xgmi` will now display the statistics in dynamically selected readable units. - - Example output is shown below. 
+ - Example CLI output: -```shell -$ amd-smi xgmi -LINK METRIC TABLE: - bdf bit_rate max_bandwidth link_type 0000:05:00.0 0000:26:00.0 0000:46:00.0 0000:65:00.0 0000:85:00.0 0000:a6:00.0 0000:c6:00.0 0000:e5:00.0 -GPU0 0000:05:00.0 32 Gb/s 512 Gb/s XGMI - Read N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB - Write N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB -GPU1 0000:26:00.0 32 Gb/s 512 Gb/s XGMI - Read 1.123 PB N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB - Write 229.1 MB N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB -GPU2 0000:46:00.0 32 Gb/s 512 Gb/s XGMI - Read 1.123 PB 1.123 PB N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB - Write 229.1 MB 229.1 MB N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB -... -``` + ```shell + $ amd-smi xgmi + LINK METRIC TABLE: + bdf bit_rate max_bandwidth link_type 0000:05:00.0 0000:26:00.0 0000:46:00.0 0000:65:00.0 0000:85:00.0 0000:a6:00.0 0000:c6:00.0 0000:e5:00.0 + GPU0 0000:05:00.0 32 Gb/s 512 Gb/s XGMI + Read N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB + Write N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB + GPU1 0000:26:00.0 32 Gb/s 512 Gb/s XGMI + Read 1.123 PB N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB + Write 229.1 MB N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB + GPU2 0000:46:00.0 32 Gb/s 512 Gb/s XGMI + Read 1.123 PB 1.123 PB N/A 1.123 PB 1.123 PB 1.123 PB 1.123 PB 1.123 PB + Write 229.1 MB 229.1 MB N/A 229.1 MB 229.1 MB 229.1 MB 229.1 MB 229.1 MB + ... 
+ ``` ### Resolved issues - **Fixed `amdsmi_get_gpu_asic_info` and `amd-smi static --asic` not displaying graphics version properly for MI2x, MI1x or Navi 3x ASICs.** + Before on MI100: + ```shell $ amd-smi static --asic | grep TARGET_GRAPHICS_VERSION TARGET_GRAPHICS_VERSION: gfx9008 TARGET_GRAPHICS_VERSION: gfx9008 ``` + After on MI100: + ```shell $ amd-smi static --asic | grep TARGET_GRAPHICS_VERSION TARGET_GRAPHICS_VERSION: gfx908 @@ -402,18 +414,18 @@ GPU2 0000:46:00.0 32 Gb/s 512 Gb/s XGMI ### Upcoming changes -- **Deprication of the `AMDSMI_LIB_VERSION_YEAR` +- **Deprecation of the `AMDSMI_LIB_VERSION_YEAR` enum and API fields.** ### Known issues - **AMD SMI only reports 63 GPU devices when setting CPX on all 8 GPUs** When setting CPX as a partition mode, there is a DRM node limitation of 64. - This is a known limitation of the Linux kernel, not the driver. Other drivers, such as those using PCIe space (e.g., ast), may be occupying the necessary DRM nodes. + This is a known limitation of the Linux kernel, not the driver. Other drivers, such as those using PCIe space (e.g., ast), may be occupying the necessary DRM nodes. 
The number of DRM nodes used can be checked via `ls /sys/class/drm` - - References to kernel changes: - - [Updates to number of node](https://cgit.freedesktop.org/drm/libdrm/commit/?id=7130cb163eb860d4a965c6708b64fe87cee881d6) - - [Identification of node type](https://cgit.freedesktop.org/drm/libdrm/commit/?id=3bc3cca230c5a064b2f554f26fdec27db0f5ead8) + - References to kernel changes: + - [Updates to number of node](https://cgit.freedesktop.org/drm/libdrm/commit/?id=7130cb163eb860d4a965c6708b64fe87cee881d6) + - [Identification of node type](https://cgit.freedesktop.org/drm/libdrm/commit/?id=3bc3cca230c5a064b2f554f26fdec27db0f5ead8) Options are as follows: 1) ***Workaround - removing other devices using DRM nodes*** diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 8cdf7b62e2..8aa2bcd622 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -20,21 +20,19 @@ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import argparse +import json import logging +import multiprocessing +import os import sys import threading import time -import json -import multiprocessing -import threading -import os from _version import __version__ +from amdsmi import amdsmi_exception, amdsmi_interface +from amdsmi_cli_exceptions import AmdSmiInvalidParameterException, AmdSmiRequiredCommandException from amdsmi_helpers import AMDSMIHelpers from amdsmi_logger import AMDSMILogger -from amdsmi_cli_exceptions import AmdSmiRequiredCommandException, AmdSmiInvalidParameterException -from amdsmi import amdsmi_interface -from amdsmi import amdsmi_exception class AMDSMICommands(): @@ -4298,7 +4296,7 @@ class AMDSMICommands(): break # successful case except amdsmi_exception.AmdSmiLibraryException as e: - f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform) + f = open(os.devnull, 'w', encoding='utf-8') #redirect to /dev/null (crossplatform) print("\n\n", end='\r', flush=True, file=f) for thread in threads: thread.terminate() @@ -4334,7 +4332,7 @@ class AMDSMICommands(): return continue - f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform) + f = open(os.devnull, 'w', encoding='utf-8') #redirect to /dev/null (crossplatform) print("\n\n", end='\r', flush=True, file=f) out = f"Unable to set memory partition to {args.memory_partition} on {gpu_string}" print(out) @@ -4897,9 +4895,10 @@ class AMDSMICommands(): def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None, - watch=None, watch_time=None, iterations=None, power_usage=None, - temperature=None, gfx_util=None, mem_util=None, encoder=None, decoder=None, - ecc=None, vram_usage=None, pcie=None, process=None, violation=None): + watch=None, watch_time=None, iterations=None, power_usage=None, + temperature=None, gfx_util=None, mem_util=None, encoder=None, + decoder=None, ecc=None, vram_usage=None, pcie=None, process=None, + violation=None): """ Populate a table with each GPU as an index to rows of targeted data Args: 
@@ -4912,7 +4911,7 @@ class AMDSMICommands(): power_usage (bool, optional): Value override for args.power_usage. Defaults to None. temperature (bool, optional): Value override for args.temperature. Defaults to None. gfx (bool, optional): Value override for args.gfx. Defaults to None. - mem (bool, optional): Value override for args.mem. Defaults to None. + mem_util (bool, optional): Value override for args.mem. Defaults to None. encoder (bool, optional): Value override for args.encoder. Defaults to None. decoder (bool, optional): Value override for args.decoder. Defaults to None. ecc (bool, optional): Value override for args.ecc. Defaults to None. @@ -4973,11 +4972,15 @@ class AMDSMICommands(): # If all arguments are False, the print all values # Don't include process in this logic as it's an optional edge case if not any([args.power_usage, args.temperature, args.gfx, args.mem, - args.encoder, args.decoder, args.ecc, - args.vram_usage, args.pcie, args.violation]): + args.encoder, args.decoder, args.ecc, args.vram_usage, + args.pcie, args.violation]): args.power_usage = args.temperature = args.gfx = args.mem = \ - args.encoder = args.decoder = args.ecc = \ - args.vram_usage = args.pcie = args.violation = True + args.encoder = args.decoder = args.vram_usage = True + # set extra args for default output filtering + args.default_output = True + else: + if not hasattr(args, 'default_output'): + args.default_output = False # Handle watch logic, will only enter this block once if args.watch: @@ -5078,6 +5081,7 @@ class AMDSMICommands(): logging.debug("Failed to get power usage on gpu %s | %s", gpu_id, e.get_error_info()) self.logger.table_header += 'POWER'.rjust(7) + if args.temperature: try: temperature = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['temperature_hotspot'] @@ -5108,12 +5112,30 @@ class AMDSMICommands(): monitor_values['memory_temperature'] = {"value" : monitor_values['memory_temperature'], "unit" : temp_unit_json} - self.logger.table_header += 
'GPU_TEMP'.rjust(10) - self.logger.table_header += 'MEM_TEMP'.rjust(10) + self.logger.table_header += 'GPU_T'.rjust(8) + self.logger.table_header += 'MEM_T'.rjust(8) + if args.gfx: + try: + gfx_clk = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_gfxclk'] + monitor_values['gfx_clk'] = gfx_clk + freq_unit = 'MHz' + if gfx_clk != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['gfx_clk'] = f"{monitor_values['gfx_clk']} {freq_unit}" + if self.logger.is_json_format(): + monitor_values['gfx_clk'] = {"value" : monitor_values['gfx_clk'], + "unit" : freq_unit} + + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['gfx_clk'] = "N/A" + logging.debug("Failed to get gfx clock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'GFX_CLK'.rjust(10) + try: gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_gfx_activity'] - monitor_values['gfx'] = gfx_util + monitor_values['gfx'] = round(gfx_util) activity_unit = '%' if gfx_util != "N/A": if self.logger.is_human_readable_format(): @@ -5125,28 +5147,12 @@ class AMDSMICommands(): monitor_values['gfx'] = "N/A" logging.debug("Failed to get gfx utilization on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.table_header += 'GFX_UTIL'.rjust(10) + self.logger.table_header += 'GFX%'.rjust(7) - try: - gfx_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_gfxclk'] - monitor_values['gfx_clock'] = gfx_clock - freq_unit = 'MHz' - if gfx_clock != "N/A": - if self.logger.is_human_readable_format(): - monitor_values['gfx_clock'] = f"{monitor_values['gfx_clock']} {freq_unit}" - if self.logger.is_json_format(): - monitor_values['gfx_clock'] = {"value" : monitor_values['gfx_clock'], - "unit" : freq_unit} - - except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['gfx_clock'] = "N/A" - logging.debug("Failed to get gfx clock on gpu %s | %s", gpu_id, e.get_error_info()) - - self.logger.table_header += 
'GFX_CLOCK'.rjust(11) if args.mem: try: mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_umc_activity'] - monitor_values['mem'] = mem_util + monitor_values['mem'] = round(mem_util) activity_unit = '%' if mem_util != "N/A": if self.logger.is_human_readable_format(): @@ -5158,23 +5164,26 @@ class AMDSMICommands(): monitor_values['mem'] = "N/A" logging.debug("Failed to get mem utilization on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.table_header += 'MEM_UTIL'.rjust(10) + self.logger.table_header += 'MEM%'.rjust(7) - try: - mem_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_uclk'] - monitor_values['mem_clock'] = mem_clock - freq_unit = 'MHz' - if mem_clock != "N/A": - if self.logger.is_human_readable_format(): - monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} {freq_unit}" - if self.logger.is_json_format(): - monitor_values['mem_clock'] = {"value" : monitor_values['mem_clock'], - "unit" : freq_unit} - except amdsmi_exception.AmdSmiLibraryException as e: - monitor_values['mem_clock'] = "N/A" - logging.debug("Failed to get mem clock on gpu %s | %s", gpu_id, e.get_error_info()) + # don't populate mem clock on default output + if not args.default_output: + try: + mem_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_uclk'] + monitor_values['mem_clock'] = mem_clock + freq_unit = 'MHz' + if mem_clock != "N/A": + if self.logger.is_human_readable_format(): + monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} {freq_unit}" + if self.logger.is_json_format(): + monitor_values['mem_clock'] = {"value" : monitor_values['mem_clock'], + "unit" : freq_unit} + except amdsmi_exception.AmdSmiLibraryException as e: + monitor_values['mem_clock'] = "N/A" + logging.debug("Failed to get mem clock on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'MEM_CLOCK'.rjust(11) - self.logger.table_header += 'MEM_CLOCK'.rjust(11) if args.encoder: # TODO: The encoding 
utilization is in progress for Navi. Note: MI3x ASICs only support decoding. try: @@ -5187,7 +5196,7 @@ class AMDSMICommands(): # Averaging the possible encoding activity values if encoding_activity_avg: - encoding_activity_avg = sum(encoding_activity_avg) / len(encoding_activity_avg) + encoding_activity_avg = round(sum(encoding_activity_avg) / len(encoding_activity_avg)) else: encoding_activity_avg = "N/A" @@ -5204,12 +5213,12 @@ class AMDSMICommands(): monitor_values['encoder'] = "N/A" logging.debug("Failed to get encoder utilization on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.table_header += 'ENC_UTIL'.rjust(10) + self.logger.table_header += 'ENC%'.rjust(7) if args.decoder: try: # Get List of vcn activity values - # Note: MI3x ASICs only support decoding, so the vcn_activity is used for decoding activity. + # Note: MI3x ASICs only support decoding, so the vcn_activity is used for decoding activity. decoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity'] decoding_activity_avg = [] for value in decoder_util: @@ -5218,7 +5227,7 @@ class AMDSMICommands(): # Averaging the possible decoding activity values if decoding_activity_avg: - decoding_activity_avg = sum(decoding_activity_avg) / len(decoding_activity_avg) + decoding_activity_avg = round(sum(decoding_activity_avg) / len(decoding_activity_avg)) else: decoding_activity_avg = "N/A" @@ -5235,9 +5244,9 @@ class AMDSMICommands(): monitor_values['decoder'] = "N/A" logging.debug("Failed to get decoder utilization on gpu %s | %s", gpu_id, e.get_error_info()) - self.logger.table_header += 'DEC_UTIL'.rjust(10) + self.logger.table_header += 'DEC%'.rjust(7) - if args.encoder or args.decoder: + if (args.encoder or args.decoder) and not args.default_output: try: vclock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0'] monitor_values['vclock'] = vclock @@ -5300,7 +5309,8 @@ class AMDSMICommands(): logging.debug("Failed to get sysfs pcie replay counter on gpu 
%s | %s", gpu_id, e.get_error_info()) self.logger.table_header += 'PCIE_REPLAY'.rjust(13) - if args.vram_usage: + + if args.vram_usage and not args.default_output: try: vram_usage = amdsmi_interface.amdsmi_get_gpu_vram_usage(args.gpu) monitor_values['vram_used'] = vram_usage['vram_used'] @@ -5321,6 +5331,31 @@ class AMDSMICommands(): self.logger.table_header += 'VRAM_USED'.rjust(11) self.logger.table_header += 'VRAM_TOTAL'.rjust(12) + + if args.vram_usage and args.default_output: + try: + vram_usage = amdsmi_interface.amdsmi_get_gpu_vram_usage(args.gpu) + vram_usage_unit = "GB" + if self.logger.is_json_format(): + monitor_values['vram_used'] = {"value" : round(vram_usage['vram_used']/1024,1), + "unit" : vram_usage_unit} + monitor_values['vram_total'] = {"value" : round(vram_usage['vram_total']/1024,1), + "unit" : vram_usage_unit} + elif self.logger.is_csv_format(): + monitor_values['vram_used'] = round(vram_usage['vram_used']/1024,1) + monitor_values['vram_total'] = round(vram_usage['vram_total']/1024,1) + else: + monitor_values['vram_usage'] = f"{vram_usage['vram_used']/1024:5.1f}/{vram_usage['vram_total']/1024:5.1f} {vram_usage_unit}".rjust(16,' ') + except amdsmi_exception.AmdSmiLibraryException as e: + if self.logger.is_json_format(): + monitor_values['vram_used'] = "N/A" + monitor_values['vram_total'] = "N/A" + else: + monitor_values['vram_usage'] = "N/A" + logging.debug("Failed to get vram memory usage on gpu %s | %s", gpu_id, e.get_error_info()) + + self.logger.table_header += 'VRAM_USAGE'.rjust(16) + if args.pcie: if pcie_info != "N/A": pcie_bw_unit = 'Mb/s' diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index 71f4991c37..a056360ea8 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -127,13 +127,21 @@ class AMDSMILogger(): table_values += string_value.rjust(10) + ' ' elif key == 'power_usage': table_values += string_value.rjust(7) - elif key in 
('gfx_clock', 'mem_clock', 'vram_used'): + elif key in ('hotspot_temperature', 'memory_temperature'): + table_values += string_value.rjust(8) + elif key in ('gfx', 'mem'): + table_values += string_value.rjust(7) + elif key in ('gfx_clk'): + table_values += string_value.rjust(10) + elif key in ('mem_clock', 'vram_used'): table_values += string_value.rjust(11) + elif key in ('encoder', 'decoder'): + table_values += string_value.rjust(7) elif key in ('vclock', 'dclock'): table_values += string_value.rjust(10) - elif key == 'vram_total' or 'ecc' in key or key == 'pcie_bw': + elif key in ('single_bit_ecc', 'double_bit_ecc', 'pcie_bw', 'vram_total'): table_values += string_value.rjust(12) - elif key in ['pcie_replay']: + elif key in ('pcie_replay'): table_values += string_value.rjust(13) # Only for handling topology tables elif 'gpu_' in key: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 534b9cc0f4..4c02f36541 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1258,7 +1258,7 @@ class AMDSMIParser(argparse.ArgumentParser): mem_usage_help = "Monitor memory usage in MB" pcie_bandwidth_help = "Monitor PCIe bandwidth in Mb/s" process_help = "Enable Process information table below monitor output" - violation_help = "Monitor power and thermal violation status (%%); Only available for MI300 or newer ASICs" + violation_help = "Monitor power and thermal violation status (%%);\n Only available for MI300 or newer ASICs" # Create monitor subparser monitor_parser = subparsers.add_parser('monitor', help=monitor_help, description=monitor_subcommand_help, aliases=["dmon"])