Use amd-smi Python API instead of CLI (#1334)

* Use amd-smi Python API instead of CLI Formatting fix python path
* Update CHANGELOG
* Create amdsmi interface
* Added amdsmi tests
* Removed run
* Prioritize rocm's amdsmi python API
* address review comments
* update changelog
* fix ruff formatting

---------

Co-authored-by: Vignesh Edithal <Vignesh.Edithal@amd.com>
2025-10-24 11:11:33 +05:30
@@ -5,13 +5,20 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 ## Unreleased

 ### Added
+
 * Add `--list-blocks <arch>` option to general options to list available IP blocks on specified arch (similar to `--list-metrics`), cannot be used with `--block`.
+
 * Added `config_delta/gfx950_diff.yaml` to analysis config yamls to track the revision between a gfx9 architecture against the latest supported architecture gfx950

 ### Changed
+
 * `-b/--block` accepts block alias(es) (See block aliases using command-line option `--list-blocks <arch>`).
+
 * analysis configs yamls are now managed with the new config management workflow in `tools/config_management/`

+* `amdsmi` Python API is used instead of the `amd-smi` CLI to query GPU specifications.
+
+
 ### Removed

 ### Optimized
@@ -503,6 +503,7 @@ class RocProfCompute_Base:
                console_error("Profiler not supported")

        # PC sampling data is only collected when block "21" is specified
+        print(args.filter_blocks)
        if not (
            "21" in args.filter_blocks
            and "pc_sampling" in args.filter_blocks
@@ -24,7 +24,6 @@
 ##############################################################################

 import argparse
-import json
 import math
 import os
 import re
@@ -37,6 +36,7 @@ import yaml

 import config
 from roofline import Roofline
+from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock
 from utils.logger import (
    console_debug,
    console_error,
@@ -102,7 +102,7 @@ class OmniSoC_Base:
        return self.__compatible_profilers

    def populate_mspec(self) -> None:
-        from utils.specs import run, search, total_sqc
+        from utils.specs import search, total_sqc

        if (
            not hasattr(self._mspec, "rocminfo_lines")
@@ -167,29 +167,8 @@ class OmniSoC_Base:
                )
            )

-        # Parse json from amd-smi static --clock
-        static_data = json.loads(
-            run(["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True)
-        )
-
-        # Extract GPU data
-        gpu_list = (
-            static_data
-            if isinstance(static_data, list)
-            else static_data.get("gpu_data", [])
-        )
-        gpu_data = gpu_list[0] if gpu_list else {}
-
-        frequency_levels = (
-            gpu_data.get("clock", {}).get("mem", {}).get("frequency_levels")
-        )
-        if frequency_levels:
-            # Extract max memory clock frequency
-            amd_smi_mclk = frequency_levels[max(frequency_levels.keys())]
-            # 100 Mhz -> 100
-            self._mspec.max_mclk = amd_smi_mclk.split()[0]
-
-        console_debug(f"max mem clock is {self._mspec.max_mclk}")
+        with amdsmi_ctx():
+            self._mspec.max_mclk = str(get_mem_max_clock())

        # These are just max values now, because the parsing was broken and this was
        # inconsistent with how we use the clocks elsewhere (all max, all the time)
@@ -220,44 +199,13 @@ class OmniSoC_Base:
        Detects the GPU model using various identifiers from 'amd-smi static'.
        Falls back through multiple methods if the primary method fails.
        """
-
-        from utils.specs import run
-
-        # TODO: use amd-smi python api when available
-        # Load AMD-SMI data
-        static_data = run(
-            ["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True
-        )
-        try:
-            parsed_data = json.loads(static_data)
-            gpu_list = (
-                parsed_data
-                if isinstance(parsed_data, list)
-                else parsed_data.get("gpu_data", [])
-            )
-        except json.JSONDecodeError:
-            gpu_list = []
-        gpu_data = gpu_list[0] if gpu_list else {}
-
-        # Try detection methods until we find a match
-        detection_methods = [
-            ("asic", "market_name"),
-            ("vbios", "name"),
-            ("board", "product_name"),
-        ]
-
-        gpu_model = None
-        for section, field in detection_methods:
-            detected_name = gpu_data.get(section, {}).get(field, "").lower()
+        with amdsmi_ctx():
+            gpu_model = "N/A"
            for model in mi_gpu_specs.get_all_gpu_models():
-                if model in detected_name:
-                    console_log(f'GPU model "{model}" detected using {section}.{field}')
-                    gpu_model = model
-                    break
-
-        if not gpu_model:
-            console_warning("Unable to determine the GPU model from amd-smi.")
-            return
+                for amdsmi_gpu_model in get_gpu_model():
+                    if model.lower() in amdsmi_gpu_model.lower():
+                        gpu_model = model
+                        break

        gpu_model = self._adjust_mi300_model(gpu_model.lower(), gpu_arch.lower())

@@ -0,0 +1,136 @@
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+import os
+import sys
+from collections.abc import Iterator
+from contextlib import contextmanager
+
+from utils.logger import (
+    console_debug,
+    console_error,
+    console_warning,
+)
+
+sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi")
+
+try:
+    import amdsmi
+except ImportError as e:
+    console_warning(f"Unhandled import error: {e}")
+    console_error("Failed to import the amdsmi Python library.")
+
+
+@contextmanager
+def amdsmi_ctx() -> Iterator[None]:
+    """Initialize amdsmi for the duration of a ``with`` block, then shut it down.
+
+    The context is best-effort: ``Exception`` subclasses raised by
+    initialization, by the ``with`` body, or by shutdown are each reported
+    through ``console_warning`` and are not propagated to the caller.
+
+    Yields:
+        None. The body is entered even when ``amdsmi_init`` fails.
+    """
+    initialized = False
+    try:
+        amdsmi.amdsmi_init()
+        initialized = True
+    except Exception as e:
+        console_warning(f"amd-smi init failed: {e}")
+
+    # The generator must yield exactly once. Returning without yielding (which
+    # is what happens when init raises inside the same try block as the yield)
+    # makes contextlib raise RuntimeError("generator didn't yield") on entry.
+    try:
+        yield
+    except Exception as e:
+        # Errors from the ``with`` body are still swallowed, as before, but
+        # are no longer mislabelled as initialization failures.
+        console_warning(f"amd-smi query failed: {e}")
+    finally:
+        # Only shut down a library that was successfully initialized.
+        if initialized:
+            try:
+                amdsmi.amdsmi_shut_down()
+            except Exception as e:
+                console_warning(f"amd-smi shutdown failed: {e}")
+
+
+def get_device_handle() -> "amdsmi.ProcessorHandle | None":
+    """Return the first processor handle reported by amdsmi.
+
+    Returns:
+        The first entry of ``amdsmi_get_processor_handles()``, or ``None`` when
+        the list is empty or the query raises (a warning is logged either way).
+    """
+    try:
+        handles = amdsmi.amdsmi_get_processor_handles()
+        if len(handles) > 0:
+            console_debug(f"Found {len(handles)} AMD device(s).")
+            return handles[0]
+        console_warning("No AMD GPU detected!")
+    except Exception as err:
+        console_warning(f"Error getting device handle: {err}")
+    return None
+
+
+def get_mem_max_clock() -> float:
+    """Return the maximum memory clock amdsmi reports for the first device.
+
+    Returns:
+        The ``max_clk`` entry of the memory-domain clock info, or ``0.0`` when
+        no device handle is available or the query fails (a warning is logged).
+    """
+    try:
+        handle = get_device_handle()
+        if handle is None:
+            # get_device_handle() has already logged why no handle is available.
+            return 0.0
+        # Query the memory clock domain: the GFX domain describes the graphics
+        # engine clock, not the memory clock this function is named for.
+        return amdsmi.amdsmi_get_clock_info(handle, amdsmi.AmdSmiClkType.MEM)[
+            "max_clk"
+        ]
+    except Exception as e:
+        console_warning(f"Error getting memory clocks: {e}")
+        return 0.0
+
+
+def get_gpu_model() -> "tuple[str, str, str] | str":
+    """Collect the name strings amdsmi reports for the first device.
+
+    Returns:
+        A tuple ``(board product_name, asic market_name, vbios name)`` on
+        success, or the string ``"N/A"`` when any of the three queries raises
+        (a warning is logged). Callers must handle both shapes.
+    """
+    try:
+        # Resolve the handle once instead of re-querying it for every field.
+        handle = get_device_handle()
+        gpu_model_info = (
+            # board -> product_name
+            amdsmi.amdsmi_get_gpu_board_info(handle)["product_name"],
+            # asic -> market_name
+            amdsmi.amdsmi_get_gpu_asic_info(handle)["market_name"],
+            # vbios -> name
+            amdsmi.amdsmi_get_gpu_vbios_info(handle)["name"],
+        )
+        console_debug(f"gpu model info: {gpu_model_info}")
+        return gpu_model_info
+    except Exception as e:
+        console_warning(f"Error getting gpu model info: {e}")
+        return "N/A"
+
+
+def get_gpu_vbios_part_number() -> str:
+    """Return the VBIOS part number amdsmi reports for the first device.
+
+    Returns:
+        The ``part_number`` field of the VBIOS info, or ``"N/A"`` if the query
+        raises (the error is logged as a warning).
+    """
+    try:
+        vbios_info = amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle())
+        part_number = vbios_info["part_number"]
+        console_debug(f"GPU VBIOS Part Number: {part_number}")
+    except Exception as err:
+        console_warning(f"Error getting GPU VBIOS part number: {err}")
+        return "N/A"
+    return part_number
+
+
+def get_gpu_compute_partition() -> str:
+    """Return the compute partition amdsmi reports for the first device.
+
+    Returns:
+        The value of ``amdsmi_get_gpu_compute_partition``, or ``"N/A"`` if the
+        query raises (the error is logged as a warning).
+    """
+    try:
+        handle = get_device_handle()
+        partition = amdsmi.amdsmi_get_gpu_compute_partition(handle)
+        console_debug(f"GPU Compute Partition: {partition}")
+    except Exception as err:
+        console_warning(f"Error getting GPU compute partition: {err}")
+        return "N/A"
+    return partition
+
+
+def get_gpu_memory_partition() -> str:
+    """Return the memory partition amdsmi reports for the first device.
+
+    Returns:
+        The value of ``amdsmi_get_gpu_memory_partition``, or ``"N/A"`` if the
+        query raises (the error is logged as a warning).
+    """
+    try:
+        handle = get_device_handle()
+        partition = amdsmi.amdsmi_get_gpu_memory_partition(handle)
+        console_debug(f"GPU Memory Partition: {partition}")
+    except Exception as err:
+        console_warning(f"Error getting GPU memory partition: {err}")
+        return "N/A"
+    return partition
@@ -28,7 +28,6 @@ from __future__ import annotations

 import argparse
 import importlib
-import json
 import os
 import re
 import socket
@@ -42,6 +41,12 @@ from typing import Any, Optional, TypeVar
 import pandas as pd

 import config
+from utils.amdsmi_interface import (
+    amdsmi_ctx,
+    get_gpu_compute_partition,
+    get_gpu_memory_partition,
+    get_gpu_vbios_part_number,
+)
 from utils.logger import (
    console_debug,
    console_error,
@@ -252,57 +257,18 @@ def extract_gpu_info() -> dict[str, Any]:
        "memory_partition": None,
    }

-    # Load amd-smi static data for GPU 0
-    static_output = run(["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True)
-    if static_output is None:
-        return result
-
-    try:
-        static_data = json.loads(static_output)
-    except json.JSONDecodeError as e:
-        console_warning(f"Failed to parse amd-smi static output: {e}")
-        return result
-
-    # Extract GPU data
-    gpu_list = (
-        static_data
-        if isinstance(static_data, list)
-        else static_data.get("gpu_data", [])
-    )
-    gpu_data = gpu_list[0] if gpu_list else {}
-    result["vbios"] = gpu_data.get("vbios", {}).get("part_number")
-
-    # Load amd-smi partition data for GPU 0 (amd-smi >= 26.0.0)
-    partition_output = run(
-        ["amd-smi", "partition", "--gpu=0", "--json"], exit_on_error=False
-    )
-    partition_data = {}
-
-    if partition_output:
-        try:
-            partition_data = json.loads(partition_output)
-        except json.JSONDecodeError:
-            partition_data = {}
-
-    current_partition = partition_data.get("current_partition", [{}])[0]
-
-    # Extract partition values with gpu_data fallback (amd-smi < 26.0.0)
-    result["compute_partition"] = (
-        current_partition.get("accelerator_type")
-        or gpu_data.get("partition", {}).get("accelerator_partition")
-        or gpu_data.get("partition", {}).get("compute_partition")
-    )
-    result["memory_partition"] = current_partition.get("memory") or gpu_data.get(
-        "partition", {}
-    ).get("memory_partition")
+    with amdsmi_ctx():
+        result["vbios"] = get_gpu_vbios_part_number()
+        result["compute_partition"] = get_gpu_compute_partition()
+        result["memory_partition"] = get_gpu_memory_partition()

    # Apply defaults and warnings
-    if not result["compute_partition"]:
+    if result["compute_partition"] == "N/A" or not result["compute_partition"]:
        console_warning("Cannot detect accelerator partition from amd-smi.")
        console_warning("Applying default accelerator partition: SPX")
        result["compute_partition"] = "SPX"

-    if not result["memory_partition"]:
+    if result["memory_partition"] == "N/A" or not result["memory_partition"]:
        console_warning("Cannot detect memory partition from amd-smi.")

    console_debug(
@@ -891,12 +857,8 @@ def run(cmd: list[str], exit_on_error: bool = False) -> str:
            'Try passing a path to an existing workload results in "analyze" mode.'
        )

-    if exit_on_error:
-        if cmd[0] == "amd-smi":
-            if p.returncode != 2 and p.returncode != 0:  # type: ignore
-                console_error("No GPU detected. Unable to load amd-smi")
-        elif p.returncode != 0:  # type: ignore
-            console_error(f"Command {cmd} failed with non-zero exit code")
+    if exit_on_error and p.returncode != 0:  # type: ignore
+        console_error(f"Command {cmd} failed with non-zero exit code")
    return p.stdout.decode("utf-8")  # type: ignore


@@ -57,14 +57,6 @@ MI300_CHIP_IDS = {
 }


-def run(cmd):
-    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    if cmd[0] == "amd-smi" and p.returncode == 8:
-        print("ERROR: No GPU detected. Unable to load amd-smi")
-        assert 0
-    return p.stdout.decode("ascii")
-
-
 def gpu_soc():
    ## 1) Parse arch details from rocminfo
    rocminfo = str(
@@ -92,14 +92,6 @@ def parse_table_dict(output: str) -> dict:
    return result


-def run(cmd):
-    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    if cmd[0] == "amd-smi" and p.returncode == 8:
-        print("ERROR: No GPU detected. Unable to load amd-smi")
-        assert 0
-    return p.stdout.decode("utf-8")
-
-
 def get_num_xcds():
    num_xcds = None

@@ -314,14 +314,6 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5):
    return errors_pd


-def run(cmd):
-    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    if cmd[0] == "amd-smi" and p.returncode == 8:
-        print("ERROR: No GPU detected. Unable to load amd-smi")
-        assert 0
-    return p.stdout.decode("ascii")
-
-
 def gpu_soc():
    global num_devices
    ## 1) Parse arch details from rocminfo
@@ -8669,3 +8669,129 @@ def test_list_metrics(binary_handler_analyze_rocprof_compute, capsys):
    output = capsys.readouterr().out
    assert "6 -> Workgroup Manager (SPI)" in output
    assert "5.2 -> Command processor packet processor (CPC)" in output
+
+
+# =============================================================================
+# TESTS FOR AMDSMI INTERFACE
+# =============================================================================
+
+
+def test_amdsmi_ctx():
+    """amdsmi_ctx calls amdsmi_init on entry and amdsmi_shut_down on exit."""
+    from utils.amdsmi_interface import amdsmi_ctx
+
+    init_patch = mock.patch("amdsmi.amdsmi_init")
+    shutdown_patch = mock.patch("amdsmi.amdsmi_shut_down")
+    with init_patch as init_mock, shutdown_patch as shutdown_mock:
+        with amdsmi_ctx():
+            # Initialization must already have happened inside the body.
+            init_mock.assert_called_once()
+        # Shutdown is checked only after the context has exited.
+        shutdown_mock.assert_called_once()
+
+
+def test_get_device_handle():
+    """get_device_handle returns the first handle, or None when unavailable."""
+    from utils.amdsmi_interface import get_device_handle
+
+    # A populated handle list yields its first entry.
+    with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
+        device_handles_mock.return_value = [12345, 67890]
+        assert get_device_handle() == 12345
+        device_handles_mock.assert_called_once()
+
+    # An empty handle list means no GPU was detected.
+    with mock.patch("amdsmi.amdsmi_get_processor_handles", return_value=[]):
+        assert get_device_handle() is None
+
+    # A failing query is mapped to None instead of raising.
+    with mock.patch(
+        "amdsmi.amdsmi_get_processor_handles", side_effect=Exception("Mock exception")
+    ):
+        handle = get_device_handle()
+        assert handle is None
+
+
+def test_get_mem_max_clock():
+    """get_mem_max_clock returns max_clk, or 0.0 when the query fails."""
+    from utils.amdsmi_interface import get_mem_max_clock
+
+    with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
+        device_handles_mock.return_value = [12345]
+        with mock.patch("amdsmi.amdsmi_get_clock_info") as mem_max_clock_mock:
+            mem_max_clock_mock.return_value = {"max_clk": 100}
+            clk = get_mem_max_clock()
+            mem_max_clock_mock.assert_called_once()
+            assert clk == 100
+
+        # Failure path: a raising query falls back to 0.0, mirroring the
+        # "N/A" fallback checks in the sibling getter tests.
+        with mock.patch(
+            "amdsmi.amdsmi_get_clock_info", side_effect=Exception("Mock exception")
+        ):
+            assert get_mem_max_clock() == 0.0
+
+
+def test_get_gpu_model():
+    """get_gpu_model returns the three name fields, or "N/A" on failure."""
+    from utils.amdsmi_interface import get_gpu_model
+
+    with mock.patch(
+        "amdsmi.amdsmi_get_processor_handles", return_value=[12345]
+    ):
+        board_patch = mock.patch(
+            "amdsmi.amdsmi_get_gpu_board_info",
+            return_value={"product_name": "AMD MIXXX"},
+        )
+        asic_patch = mock.patch(
+            "amdsmi.amdsmi_get_gpu_asic_info",
+            return_value={"market_name": "MIXXX"},
+        )
+        vbios_patch = mock.patch(
+            "amdsmi.amdsmi_get_gpu_vbios_info",
+            return_value={"name": "mixxx"},
+        )
+        # Success path: all three queries answer.
+        with board_patch as board_mock, asic_patch, vbios_patch:
+            names = get_gpu_model()
+            board_mock.assert_called_once()
+            assert names == ("AMD MIXXX", "MIXXX", "mixxx")
+
+        # Failure path: the board query raises.
+        failing_board = mock.patch(
+            "amdsmi.amdsmi_get_gpu_board_info", side_effect=Exception("Mock exception")
+        )
+        with failing_board:
+            names = get_gpu_model()
+            assert names == "N/A"
+
+
+def test_get_gpu_vbios_part_number():
+    """get_gpu_vbios_part_number returns part_number, or "N/A" on failure."""
+    from utils.amdsmi_interface import get_gpu_vbios_part_number
+
+    with mock.patch(
+        "amdsmi.amdsmi_get_processor_handles", return_value=[12345]
+    ):
+        # Success path.
+        vbios_patch = mock.patch(
+            "amdsmi.amdsmi_get_gpu_vbios_info",
+            return_value={"part_number": "12345-67890"},
+        )
+        with vbios_patch as vbios_mock:
+            result = get_gpu_vbios_part_number()
+            vbios_mock.assert_called_once()
+            assert result == "12345-67890"
+
+        # Failure path.
+        failing_vbios = mock.patch(
+            "amdsmi.amdsmi_get_gpu_vbios_info", side_effect=Exception("Mock exception")
+        )
+        with failing_vbios:
+            result = get_gpu_vbios_part_number()
+            assert result == "N/A"
+
+
+def test_get_gpu_compute_partition():
+    """get_gpu_compute_partition passes the value through, or "N/A" on failure."""
+    from utils.amdsmi_interface import get_gpu_compute_partition
+
+    with mock.patch(
+        "amdsmi.amdsmi_get_processor_handles", return_value=[12345]
+    ):
+        # Success path.
+        partition_patch = mock.patch(
+            "amdsmi.amdsmi_get_gpu_compute_partition",
+            return_value="Mock Partition",
+        )
+        with partition_patch as partition_mock:
+            result = get_gpu_compute_partition()
+            partition_mock.assert_called_once()
+            assert result == "Mock Partition"
+
+        # Failure path.
+        failing_partition = mock.patch(
+            "amdsmi.amdsmi_get_gpu_compute_partition",
+            side_effect=Exception("Mock exception"),
+        )
+        with failing_partition:
+            result = get_gpu_compute_partition()
+            assert result == "N/A"
+
+
+def test_get_gpu_memory_partition():
+    """get_gpu_memory_partition passes the value through, or "N/A" on failure."""
+    from utils.amdsmi_interface import get_gpu_memory_partition
+
+    with mock.patch(
+        "amdsmi.amdsmi_get_processor_handles", return_value=[12345]
+    ):
+        # Success path.
+        partition_patch = mock.patch(
+            "amdsmi.amdsmi_get_gpu_memory_partition",
+            return_value="Mock Memory Partition",
+        )
+        with partition_patch as partition_mock:
+            result = get_gpu_memory_partition()
+            partition_mock.assert_called_once()
+            assert result == "Mock Memory Partition"
+
+        # Failure path.
+        failing_partition = mock.patch(
+            "amdsmi.amdsmi_get_gpu_memory_partition",
+            side_effect=Exception("Mock exception"),
+        )
+        with failing_partition:
+            result = get_gpu_memory_partition()
+            assert result == "N/A"