diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index bfe447ac73..b157cd4124 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -5,13 +5,20 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. ## Unreleased ### Added + * Add `--list-blocks ` option to general options to list available IP blocks on specified arch (similar to `--list-metrics`), cannot be used with `--block`. + * Added `config_delta/gfx950_diff.yaml` to analysis config yamls to track the revision between a gfx9 architecture against the latest supported architecture gfx950 ### Changed + * `-b/--block` accepts block alias(es) (See block aliases using command-line option `--list-blocks `). + * analysis configs yamls are now managed with the new config management workflow in `tools/config_management/` +* `amdsmi` python API is used instead of `amd-smi` CLI to query GPU specifications. + + ### Removed ### Optimized diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py index 93e1d8057d..49c6d989d4 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py @@ -503,6 +503,7 @@ class RocProfCompute_Base: console_error("Profiler not supported") # PC sampling data is only collected when block "21" is specified + console_debug(f"filter blocks: {args.filter_blocks}") if not ( "21" in args.filter_blocks and "pc_sampling" in args.filter_blocks diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py index e273bb0ab1..aaa27bb009 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py @@ -24,7 +24,6 @@ 
############################################################################## import argparse -import json import math import os import re @@ -37,6 +36,7 @@ import yaml import config from roofline import Roofline +from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock from utils.logger import ( console_debug, console_error, @@ -102,7 +102,7 @@ class OmniSoC_Base: return self.__compatible_profilers def populate_mspec(self) -> None: - from utils.specs import run, search, total_sqc + from utils.specs import search, total_sqc if ( not hasattr(self._mspec, "rocminfo_lines") @@ -167,29 +167,8 @@ class OmniSoC_Base: ) ) - # Parse json from amd-smi static --clock - static_data = json.loads( - run(["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True) - ) - - # Extract GPU data - gpu_list = ( - static_data - if isinstance(static_data, list) - else static_data.get("gpu_data", []) - ) - gpu_data = gpu_list[0] if gpu_list else {} - - frequency_levels = ( - gpu_data.get("clock", {}).get("mem", {}).get("frequency_levels") - ) - if frequency_levels: - # Extract max memory clock frequency - amd_smi_mclk = frequency_levels[max(frequency_levels.keys())] - # 100 Mhz -> 100 - self._mspec.max_mclk = amd_smi_mclk.split()[0] - - console_debug(f"max mem clock is {self._mspec.max_mclk}") + with amdsmi_ctx(): + self._mspec.max_mclk = str(get_mem_max_clock()) # These are just max values now, because the parsing was broken and this was # inconsistent with how we use the clocks elsewhere (all max, all the time) @@ -220,44 +199,13 @@ class OmniSoC_Base: Detects the GPU model using various identifiers from 'amd-smi static'. Falls back through multiple methods if the primary method fails. 
""" - - from utils.specs import run - - # TODO: use amd-smi python api when available - # Load AMD-SMI data - static_data = run( - ["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True - ) - try: - parsed_data = json.loads(static_data) - gpu_list = ( - parsed_data - if isinstance(parsed_data, list) - else parsed_data.get("gpu_data", []) - ) - except json.JSONDecodeError: - gpu_list = [] - gpu_data = gpu_list[0] if gpu_list else {} - - # Try detection methods until we find a match - detection_methods = [ - ("asic", "market_name"), - ("vbios", "name"), - ("board", "product_name"), - ] - - gpu_model = None - for section, field in detection_methods: - detected_name = gpu_data.get(section, {}).get(field, "").lower() + with amdsmi_ctx(): + gpu_model = "N/A" for model in mi_gpu_specs.get_all_gpu_models(): - if model in detected_name: - console_log(f'GPU model "{model}" detected using {section}.{field}') - gpu_model = model - break - - if not gpu_model: - console_warning("Unable to determine the GPU model from amd-smi.") - return + for amdsmi_gpu_model in get_gpu_model(): + if model.lower() in amdsmi_gpu_model.lower(): + gpu_model = model + break gpu_model = self._adjust_mi300_model(gpu_model.lower(), gpu_arch.lower()) diff --git a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py new file mode 100644 index 0000000000..1675e3abe9 --- /dev/null +++ b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py @@ -0,0 +1,136 @@ +############################################################################## +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +############################################################################## + +import os +import sys +from collections.abc import Iterator +from contextlib import contextmanager + +from utils.logger import ( + console_debug, + console_error, + console_warning, +) + +sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi") + +try: + import amdsmi +except ImportError as e: + console_warning(f"Unhandled import error: {e}") + console_error("Failed to import the amdsmi Python library.") + + +@contextmanager +def amdsmi_ctx() -> Iterator[None]: + """Initialize amdsmi for the with-block and always shut it down.""" + try: + amdsmi.amdsmi_init() + except Exception as e: + console_warning(f"amd-smi init failed: {e}") + try: + yield + finally: + try: + amdsmi.amdsmi_shut_down() + except Exception as e: + console_warning(f"amd-smi shutdown failed: {e}") + + +def get_device_handle() -> "amdsmi.ProcessorHandle | None": + """Get the first AMD device handle.""" + try: + devices = amdsmi.amdsmi_get_processor_handles() + if len(devices) == 0: + console_warning("No AMD GPU detected!") + return None + console_debug(f"Found {len(devices)} AMD device(s).") + return devices[0] + except Exception as e: + console_warning(f"Error getting device handle: {e}") + return None + + +def get_mem_max_clock() -> float: + """Get the maximum memory clock of the device.""" + try: + return amdsmi.amdsmi_get_clock_info( + get_device_handle(), amdsmi.AmdSmiClkType.MEM + )["max_clk"] + except Exception as e: + console_warning(f"Error getting memory clocks: {e}") + return 0.0 + + +def get_gpu_model() -> "tuple[str, str, str] | str": + """Get the (board, asic, vbios) GPU names, or "N/A" on failure.""" + try: + gpu_model_info = ( + # board -> product_name + amdsmi.amdsmi_get_gpu_board_info(get_device_handle())["product_name"], + # asic -> market_name + amdsmi.amdsmi_get_gpu_asic_info(get_device_handle())["market_name"], + # vbios -> name + amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle())["name"], + ) + console_debug(f"gpu model info: {str(gpu_model_info)}") + 
return gpu_model_info + except Exception as e: + console_warning(f"Error getting gpu model info: {e}") + return "N/A" + + +def get_gpu_vbios_part_number() -> str: + """Get the GPU VBIOS part number.""" + try: + vbios_info = amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle()) + vbios_part_number = vbios_info["part_number"] + console_debug(f"GPU VBIOS Part Number: {vbios_part_number}") + return vbios_part_number + except Exception as e: + console_warning(f"Error getting GPU VBIOS part number: {e}") + return "N/A" + + +def get_gpu_compute_partition() -> str: + """Get the GPU compute partition.""" + try: + compute_partition = amdsmi.amdsmi_get_gpu_compute_partition(get_device_handle()) + console_debug(f"GPU Compute Partition: {compute_partition}") + return compute_partition + except Exception as e: + console_warning(f"Error getting GPU compute partition: {e}") + return "N/A" + + +def get_gpu_memory_partition() -> str: + """Get the GPU memory partition.""" + try: + memory_partition = amdsmi.amdsmi_get_gpu_memory_partition(get_device_handle()) + console_debug(f"GPU Memory Partition: {memory_partition}") + return memory_partition + except Exception as e: + console_warning(f"Error getting GPU memory partition: {e}") + return "N/A" diff --git a/projects/rocprofiler-compute/src/utils/specs.py b/projects/rocprofiler-compute/src/utils/specs.py index 7819583421..cce4b7ddf2 100644 --- a/projects/rocprofiler-compute/src/utils/specs.py +++ b/projects/rocprofiler-compute/src/utils/specs.py @@ -28,7 +28,6 @@ from __future__ import annotations import argparse import importlib -import json import os import re import socket @@ -42,6 +41,12 @@ from typing import Any, Optional, TypeVar import pandas as pd import config +from utils.amdsmi_interface import ( + amdsmi_ctx, + get_gpu_compute_partition, + get_gpu_memory_partition, + get_gpu_vbios_part_number, +) from utils.logger import ( console_debug, console_error, @@ -252,57 +257,18 @@ def extract_gpu_info() -> dict[str, Any]: "memory_partition": None, } - 
# Load amd-smi static data for GPU 0 - static_output = run(["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True) - if static_output is None: - return result - - try: - static_data = json.loads(static_output) - except json.JSONDecodeError as e: - console_warning(f"Failed to parse amd-smi static output: {e}") - return result - - # Extract GPU data - gpu_list = ( - static_data - if isinstance(static_data, list) - else static_data.get("gpu_data", []) - ) - gpu_data = gpu_list[0] if gpu_list else {} - result["vbios"] = gpu_data.get("vbios", {}).get("part_number") - - # Load amd-smi partition data for GPU 0 (amd-smi >= 26.0.0) - partition_output = run( - ["amd-smi", "partition", "--gpu=0", "--json"], exit_on_error=False - ) - partition_data = {} - - if partition_output: - try: - partition_data = json.loads(partition_output) - except json.JSONDecodeError: - partition_data = {} - - current_partition = partition_data.get("current_partition", [{}])[0] - - # Extract partition values with gpu_data fallback (amd-smi < 26.0.0) - result["compute_partition"] = ( - current_partition.get("accelerator_type") - or gpu_data.get("partition", {}).get("accelerator_partition") - or gpu_data.get("partition", {}).get("compute_partition") - ) - result["memory_partition"] = current_partition.get("memory") or gpu_data.get( - "partition", {} - ).get("memory_partition") + with amdsmi_ctx(): + result["vbios"] = get_gpu_vbios_part_number() + result["compute_partition"] = get_gpu_compute_partition() + result["memory_partition"] = get_gpu_memory_partition() # Apply defaults and warnings - if not result["compute_partition"]: + if result["compute_partition"] == "N/A" or not result["compute_partition"]: console_warning("Cannot detect accelerator partition from amd-smi.") console_warning("Applying default accelerator partition: SPX") result["compute_partition"] = "SPX" - if not result["memory_partition"]: + if result["memory_partition"] == "N/A" or not result["memory_partition"]: 
console_warning("Cannot detect memory partition from amd-smi.") console_debug( @@ -891,12 +857,8 @@ def run(cmd: list[str], exit_on_error: bool = False) -> str: 'Try passing a path to an existing workload results in "analyze" mode.' ) - if exit_on_error: - if cmd[0] == "amd-smi": - if p.returncode != 2 and p.returncode != 0: # type: ignore - console_error("No GPU detected. Unable to load amd-smi") - elif p.returncode != 0: # type: ignore - console_error(f"Command {cmd} failed with non-zero exit code") + if exit_on_error and p.returncode != 0: # type: ignore + console_error(f"Command {cmd} failed with non-zero exit code") return p.stdout.decode("utf-8") # type: ignore diff --git a/projects/rocprofiler-compute/tests/test_TCP_counters.py b/projects/rocprofiler-compute/tests/test_TCP_counters.py index 48012a60d9..90445e10b6 100644 --- a/projects/rocprofiler-compute/tests/test_TCP_counters.py +++ b/projects/rocprofiler-compute/tests/test_TCP_counters.py @@ -57,14 +57,6 @@ MI300_CHIP_IDS = { } -def run(cmd): - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if cmd[0] == "amd-smi" and p.returncode == 8: - print("ERROR: No GPU detected. Unable to load amd-smi") - assert 0 - return p.stdout.decode("ascii") - - def gpu_soc(): ## 1) Parse arch details from rocminfo rocminfo = str( diff --git a/projects/rocprofiler-compute/tests/test_gpu_specs.py b/projects/rocprofiler-compute/tests/test_gpu_specs.py index 6fffadb0b6..b77bd01367 100644 --- a/projects/rocprofiler-compute/tests/test_gpu_specs.py +++ b/projects/rocprofiler-compute/tests/test_gpu_specs.py @@ -92,14 +92,6 @@ def parse_table_dict(output: str) -> dict: return result -def run(cmd): - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if cmd[0] == "amd-smi" and p.returncode == 8: - print("ERROR: No GPU detected. 
Unable to load amd-smi") - assert 0 - return p.stdout.decode("utf-8") - - def get_num_xcds(): num_xcds = None diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 7e106f68e4..e3c88b80fa 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -314,14 +314,6 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5): return errors_pd -def run(cmd): - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if cmd[0] == "amd-smi" and p.returncode == 8: - print("ERROR: No GPU detected. Unable to load amd-smi") - assert 0 - return p.stdout.decode("ascii") - - def gpu_soc(): global num_devices ## 1) Parse arch details from rocminfo diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py index 866cdbf786..48bd0e02ce 100644 --- a/projects/rocprofiler-compute/tests/test_utils.py +++ b/projects/rocprofiler-compute/tests/test_utils.py @@ -8669,3 +8669,129 @@ def test_list_metrics(binary_handler_analyze_rocprof_compute, capsys): output = capsys.readouterr().out assert "6 -> Workgroup Manager (SPI)" in output assert "5.2 -> Command processor packet processor (CPC)" in output + + +# ============================================================================= +# TESTS FOR AMDSMI INTERFACE +# ============================================================================= + + +def test_amdsmi_ctx(): + from utils.amdsmi_interface import amdsmi_ctx + + with mock.patch("amdsmi.amdsmi_init") as amdsmi_init_mock: + with mock.patch("amdsmi.amdsmi_shut_down") as amdsmi_shutdown_mock: + with amdsmi_ctx(): + amdsmi_init_mock.assert_called_once() + amdsmi_shutdown_mock.assert_called_once() + + +def test_get_device_handle(): + from utils.amdsmi_interface import get_device_handle + + with 
mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: + device_handles_mock.return_value = [12345] + get_device_handle() + device_handles_mock.assert_called_once() + + with mock.patch( + "amdsmi.amdsmi_get_processor_handles", side_effect=Exception("Mock exception") + ) as device_handles_mock: + handle = get_device_handle() + assert handle is None + + +def test_get_mem_max_clock(): + from utils.amdsmi_interface import get_mem_max_clock + + with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: + device_handles_mock.return_value = [12345] + with mock.patch("amdsmi.amdsmi_get_clock_info") as mem_max_clock_mock: + mem_max_clock_mock.return_value = {"max_clk": 100} + clk = get_mem_max_clock() + mem_max_clock_mock.assert_called_once() + assert clk == 100 + + +def test_get_gpu_model(): + from utils.amdsmi_interface import get_gpu_model + + with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: + device_handles_mock.return_value = [12345] + with mock.patch("amdsmi.amdsmi_get_gpu_board_info") as device_name_mock: + with mock.patch("amdsmi.amdsmi_get_gpu_asic_info") as asic_name_mock: + with mock.patch("amdsmi.amdsmi_get_gpu_vbios_info") as vbios_name_mock: + device_name_mock.return_value = {"product_name": "AMD MIXXX"} + asic_name_mock.return_value = {"market_name": "MIXXX"} + vbios_name_mock.return_value = {"name": "mixxx"} + model = get_gpu_model() + device_name_mock.assert_called_once() + assert model == ("AMD MIXXX", "MIXXX", "mixxx") + + with mock.patch( + "amdsmi.amdsmi_get_gpu_board_info", side_effect=Exception("Mock exception") + ): + model = get_gpu_model() + assert model == "N/A" + + +def test_get_gpu_vbios_part_number(): + from utils.amdsmi_interface import get_gpu_vbios_part_number + + with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: + device_handles_mock.return_value = [12345] + with mock.patch("amdsmi.amdsmi_get_gpu_vbios_info") as vbios_part_number_mock: + 
vbios_part_number_mock.return_value = { + "part_number": "12345-67890", + } + part_number = get_gpu_vbios_part_number() + vbios_part_number_mock.assert_called_once() + assert part_number == "12345-67890" + + with mock.patch( + "amdsmi.amdsmi_get_gpu_vbios_info", side_effect=Exception("Mock exception") + ): + part_number = get_gpu_vbios_part_number() + assert part_number == "N/A" + + +def test_get_gpu_compute_partition(): + from utils.amdsmi_interface import get_gpu_compute_partition + + with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: + device_handles_mock.return_value = [12345] + with mock.patch( + "amdsmi.amdsmi_get_gpu_compute_partition" + ) as compute_partition_mock: + compute_partition_mock.return_value = "Mock Partition" + partition = get_gpu_compute_partition() + compute_partition_mock.assert_called_once() + assert partition == "Mock Partition" + + with mock.patch( + "amdsmi.amdsmi_get_gpu_compute_partition", + side_effect=Exception("Mock exception"), + ): + partition = get_gpu_compute_partition() + assert partition == "N/A" + + +def test_get_gpu_memory_partition(): + from utils.amdsmi_interface import get_gpu_memory_partition + + with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: + device_handles_mock.return_value = [12345] + with mock.patch( + "amdsmi.amdsmi_get_gpu_memory_partition" + ) as memory_partition_mock: + memory_partition_mock.return_value = "Mock Memory Partition" + partition = get_gpu_memory_partition() + memory_partition_mock.assert_called_once() + assert partition == "Mock Memory Partition" + + with mock.patch( + "amdsmi.amdsmi_get_gpu_memory_partition", + side_effect=Exception("Mock exception"), + ): + partition = get_gpu_memory_partition() + assert partition == "N/A"