Use amd-smi Python API instead of CLI (#1334)

* Use amd-smi Python API instead of CLI

Formatting fix

python path

* Update CHANGELOG

* Create amdsmi interface

* Added amdsmi tests

* Removed run

* Prioritize rocm's amdsmi python API

* address review comments

* update changelog

* fix ruff formatting

---------

Co-authored-by: Vignesh Edithal <Vignesh.Edithal@amd.com>
Этот коммит содержится в:
abchoudh-amd
2025-10-24 11:11:33 +05:30
коммит произвёл GitHub
родитель 839fb95717
Коммит a7bbe0c5d2
9 изменённых файлов: 294 добавлений и 138 удалений
+7
Просмотреть файл
@@ -5,13 +5,20 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
## Unreleased
### Added
* Add `--list-blocks <arch>` option to general options to list available IP blocks on specified arch (similar to `--list-metrics`), cannot be used with `--block`.
* Added `config_delta/gfx950_diff.yaml` to the analysis config YAMLs to track the revision delta between a gfx9 architecture and the latest supported architecture, gfx950.
### Changed
* `-b/--block` accepts block alias(es) (See block aliases using command-line option `--list-blocks <arch>`).
* Analysis config YAMLs are now managed with the new config management workflow in `tools/config_management/`.
* `amdsmi` python API is used instead of `amd-smi` CLI to query GPU specifications.
### Removed
### Optimized
+1
Просмотреть файл
@@ -503,6 +503,7 @@ class RocProfCompute_Base:
console_error("Profiler not supported")
# PC sampling data is only collected when block "21" is specified
# NOTE(review): leftover debug print of args.filter_blocks removed; it wrote to stdout directly instead of going through the console_* logger helpers.
if not (
"21" in args.filter_blocks
and "pc_sampling" in args.filter_blocks
+10 -62
Просмотреть файл
@@ -24,7 +24,6 @@
##############################################################################
import argparse
import json
import math
import os
import re
@@ -37,6 +36,7 @@ import yaml
import config
from roofline import Roofline
from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock
from utils.logger import (
console_debug,
console_error,
@@ -102,7 +102,7 @@ class OmniSoC_Base:
return self.__compatible_profilers
def populate_mspec(self) -> None:
from utils.specs import run, search, total_sqc
from utils.specs import search, total_sqc
if (
not hasattr(self._mspec, "rocminfo_lines")
@@ -167,29 +167,8 @@ class OmniSoC_Base:
)
)
# Parse json from amd-smi static --clock
static_data = json.loads(
run(["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True)
)
# Extract GPU data
gpu_list = (
static_data
if isinstance(static_data, list)
else static_data.get("gpu_data", [])
)
gpu_data = gpu_list[0] if gpu_list else {}
frequency_levels = (
gpu_data.get("clock", {}).get("mem", {}).get("frequency_levels")
)
if frequency_levels:
# Extract max memory clock frequency
amd_smi_mclk = frequency_levels[max(frequency_levels.keys())]
# 100 Mhz -> 100
self._mspec.max_mclk = amd_smi_mclk.split()[0]
console_debug(f"max mem clock is {self._mspec.max_mclk}")
with amdsmi_ctx():
self._mspec.max_mclk = str(get_mem_max_clock())
# These are just max values now, because the parsing was broken and this was
# inconsistent with how we use the clocks elsewhere (all max, all the time)
@@ -220,44 +199,13 @@ class OmniSoC_Base:
Detects the GPU model using various identifiers from 'amd-smi static'.
Falls back through multiple methods if the primary method fails.
"""
from utils.specs import run
# TODO: use amd-smi python api when available
# Load AMD-SMI data
static_data = run(
["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True
)
try:
parsed_data = json.loads(static_data)
gpu_list = (
parsed_data
if isinstance(parsed_data, list)
else parsed_data.get("gpu_data", [])
)
except json.JSONDecodeError:
gpu_list = []
gpu_data = gpu_list[0] if gpu_list else {}
# Try detection methods until we find a match
detection_methods = [
("asic", "market_name"),
("vbios", "name"),
("board", "product_name"),
]
gpu_model = None
for section, field in detection_methods:
detected_name = gpu_data.get(section, {}).get(field, "").lower()
with amdsmi_ctx():
gpu_model = "N/A"
for model in mi_gpu_specs.get_all_gpu_models():
if model in detected_name:
console_log(f'GPU model "{model}" detected using {section}.{field}')
gpu_model = model
break
if not gpu_model:
console_warning("Unable to determine the GPU model from amd-smi.")
return
for amdsmi_gpu_model in get_gpu_model():
if model.lower() in amdsmi_gpu_model.lower():
gpu_model = model
break
gpu_model = self._adjust_mi300_model(gpu_model.lower(), gpu_arch.lower())
+136
Просмотреть файл
@@ -0,0 +1,136 @@
##############################################################################
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
##############################################################################
import os
import sys
from collections.abc import Iterator
from contextlib import contextmanager
from utils.logger import (
console_debug,
console_error,
console_warning,
)
# Put the amd_smi directory of the ROCm install (ROCM_PATH, default /opt/rocm)
# at the front of sys.path so its amdsmi bindings are found first.
sys.path.insert(0, f"{os.getenv('ROCM_PATH', '/opt/rocm')}/share/amd_smi")

try:
    import amdsmi
except ImportError as import_err:
    console_warning(f"Unhandled import error: {import_err}")
    console_error("Failed to import the amdsmi Python library.")
@contextmanager
def amdsmi_ctx() -> Iterator[None]:
    """Context manager to initialize amdsmi on entry and shut it down on exit.

    A failure of ``amdsmi_init()`` is reported as a warning and the ``with``
    body still runs, so the query helpers in this module can fail individually
    and return their own fallback values. Exceptions raised inside the ``with``
    body are not intercepted here; they propagate to the caller after the
    library has been shut down.
    """
    initialized = False
    try:
        amdsmi.amdsmi_init()
        initialized = True
    except Exception as e:
        # Keep init outside the try that wraps ``yield``: a generator-based
        # context manager must yield exactly once, otherwise entering the
        # ``with`` block raises RuntimeError("generator didn't yield").
        console_warning(f"amd-smi init failed: {e}")

    try:
        yield
    finally:
        # Only shut down what was actually initialized.
        if initialized:
            try:
                amdsmi.amdsmi_shut_down()
            except Exception as e:
                console_warning(f"amd-smi shutdown failed: {e}")
def get_device_handle() -> "amdsmi.ProcessorHandle | None":
    """Return the first processor handle reported by amdsmi.

    Returns None (after logging a warning) when amdsmi reports no devices or
    when the query raises.
    """
    try:
        handles = amdsmi.amdsmi_get_processor_handles()
        handle_count = len(handles)
        if handle_count > 0:
            console_debug(f"Found {handle_count} AMD device(s).")
            return handles[0]
        console_warning("No AMD GPU detected!")
    except Exception as e:
        console_warning(f"Error getting device handle: {e}")
    return None
def get_mem_max_clock() -> float:
    """Get the maximum memory clock of the first device.

    Returns:
        The ``max_clk`` value amdsmi reports for the memory clock domain, or
        0.0 (after logging a warning) if the query fails.
    """
    try:
        # Query the memory clock domain (MEM); GFX is the graphics engine
        # clock and would report the wrong maximum here.
        clock_info = amdsmi.amdsmi_get_clock_info(
            get_device_handle(), amdsmi.AmdSmiClkType.MEM
        )
        return clock_info["max_clk"]
    except Exception as e:
        console_warning(f"Error getting memory clocks: {e}")
        return 0.0
def get_gpu_model() -> "tuple[str, str, str] | str":
    """Get the candidate GPU model names of the first device.

    Returns:
        A tuple ``(board product_name, asic market_name, vbios name)`` when
        all three queries succeed. If any query raises, a warning is logged
        and the string "N/A" is returned instead.
    """
    try:
        # Resolve the handle once and reuse it for all three queries.
        handle = get_device_handle()
        gpu_model_info = (
            # board -> product_name
            amdsmi.amdsmi_get_gpu_board_info(handle)["product_name"],
            # asic -> market_name
            amdsmi.amdsmi_get_gpu_asic_info(handle)["market_name"],
            # vbios -> name
            amdsmi.amdsmi_get_gpu_vbios_info(handle)["name"],
        )
        console_debug(f"gpu model info: {str(gpu_model_info)}")
        return gpu_model_info
    except Exception as e:
        console_warning(f"Error getting gpu model info: {e}")
        return "N/A"
def get_gpu_vbios_part_number() -> str:
    """Return the VBIOS part number of the first device.

    Returns "N/A" (after logging a warning) if the query raises.
    """
    try:
        vbios_info = amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle())
        part_number = vbios_info["part_number"]
        console_debug(f"GPU VBIOS Part Number: {part_number}")
    except Exception as e:
        console_warning(f"Error getting GPU VBIOS part number: {e}")
        return "N/A"
    return part_number
def get_gpu_compute_partition() -> str:
    """Return the compute partition amdsmi reports for the first device.

    Returns "N/A" (after logging a warning) if the query raises.
    """
    try:
        handle = get_device_handle()
        partition = amdsmi.amdsmi_get_gpu_compute_partition(handle)
        console_debug(f"GPU Compute Partition: {partition}")
    except Exception as e:
        console_warning(f"Error getting GPU compute partition: {e}")
        return "N/A"
    return partition
def get_gpu_memory_partition() -> str:
    """Return the memory partition amdsmi reports for the first device.

    Returns "N/A" (after logging a warning) if the query raises.
    """
    try:
        handle = get_device_handle()
        partition = amdsmi.amdsmi_get_gpu_memory_partition(handle)
        console_debug(f"GPU Memory Partition: {partition}")
    except Exception as e:
        console_warning(f"Error getting GPU memory partition: {e}")
        return "N/A"
    return partition
+14 -52
Просмотреть файл
@@ -28,7 +28,6 @@ from __future__ import annotations
import argparse
import importlib
import json
import os
import re
import socket
@@ -42,6 +41,12 @@ from typing import Any, Optional, TypeVar
import pandas as pd
import config
from utils.amdsmi_interface import (
amdsmi_ctx,
get_gpu_compute_partition,
get_gpu_memory_partition,
get_gpu_vbios_part_number,
)
from utils.logger import (
console_debug,
console_error,
@@ -252,57 +257,18 @@ def extract_gpu_info() -> dict[str, Any]:
"memory_partition": None,
}
# Load amd-smi static data for GPU 0
static_output = run(["amd-smi", "static", "--gpu=0", "--json"], exit_on_error=True)
if static_output is None:
return result
try:
static_data = json.loads(static_output)
except json.JSONDecodeError as e:
console_warning(f"Failed to parse amd-smi static output: {e}")
return result
# Extract GPU data
gpu_list = (
static_data
if isinstance(static_data, list)
else static_data.get("gpu_data", [])
)
gpu_data = gpu_list[0] if gpu_list else {}
result["vbios"] = gpu_data.get("vbios", {}).get("part_number")
# Load amd-smi partition data for GPU 0 (amd-smi >= 26.0.0)
partition_output = run(
["amd-smi", "partition", "--gpu=0", "--json"], exit_on_error=False
)
partition_data = {}
if partition_output:
try:
partition_data = json.loads(partition_output)
except json.JSONDecodeError:
partition_data = {}
current_partition = partition_data.get("current_partition", [{}])[0]
# Extract partition values with gpu_data fallback (amd-smi < 26.0.0)
result["compute_partition"] = (
current_partition.get("accelerator_type")
or gpu_data.get("partition", {}).get("accelerator_partition")
or gpu_data.get("partition", {}).get("compute_partition")
)
result["memory_partition"] = current_partition.get("memory") or gpu_data.get(
"partition", {}
).get("memory_partition")
with amdsmi_ctx():
result["vbios"] = get_gpu_vbios_part_number()
result["compute_partition"] = get_gpu_compute_partition()
result["memory_partition"] = get_gpu_memory_partition()
# Apply defaults and warnings
if not result["compute_partition"]:
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
console_warning("Cannot detect accelerator partition from amd-smi.")
console_warning("Applying default accelerator partition: SPX")
result["compute_partition"] = "SPX"
if not result["memory_partition"]:
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
console_warning("Cannot detect memory partition from amd-smi.")
console_debug(
@@ -891,12 +857,8 @@ def run(cmd: list[str], exit_on_error: bool = False) -> str:
'Try passing a path to an existing workload results in "analyze" mode.'
)
if exit_on_error:
if cmd[0] == "amd-smi":
if p.returncode != 2 and p.returncode != 0: # type: ignore
console_error("No GPU detected. Unable to load amd-smi")
elif p.returncode != 0: # type: ignore
console_error(f"Command {cmd} failed with non-zero exit code")
if exit_on_error and p.returncode != 0: # type: ignore
console_error(f"Command {cmd} failed with non-zero exit code")
return p.stdout.decode("utf-8") # type: ignore
-8
Просмотреть файл
@@ -57,14 +57,6 @@ MI300_CHIP_IDS = {
}
def run(cmd):
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if cmd[0] == "amd-smi" and p.returncode == 8:
print("ERROR: No GPU detected. Unable to load amd-smi")
assert 0
return p.stdout.decode("ascii")
def gpu_soc():
## 1) Parse arch details from rocminfo
rocminfo = str(
-8
Просмотреть файл
@@ -92,14 +92,6 @@ def parse_table_dict(output: str) -> dict:
return result
def run(cmd):
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if cmd[0] == "amd-smi" and p.returncode == 8:
print("ERROR: No GPU detected. Unable to load amd-smi")
assert 0
return p.stdout.decode("utf-8")
def get_num_xcds():
num_xcds = None
-8
Просмотреть файл
@@ -314,14 +314,6 @@ def counter_compare(test_name, errors_pd, baseline_df, run_df, threshold=5):
return errors_pd
def run(cmd):
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if cmd[0] == "amd-smi" and p.returncode == 8:
print("ERROR: No GPU detected. Unable to load amd-smi")
assert 0
return p.stdout.decode("ascii")
def gpu_soc():
global num_devices
## 1) Parse arch details from rocminfo
+126
Просмотреть файл
@@ -8669,3 +8669,129 @@ def test_list_metrics(binary_handler_analyze_rocprof_compute, capsys):
output = capsys.readouterr().out
assert "6 -> Workgroup Manager (SPI)" in output
assert "5.2 -> Command processor packet processor (CPC)" in output
# =============================================================================
# TESTS FOR AMDSMI INTERFACE
# =============================================================================
def test_amdsmi_ctx():
    """amdsmi_ctx calls amdsmi_init on entry and amdsmi_shut_down on exit."""
    from utils.amdsmi_interface import amdsmi_ctx

    init_patch = mock.patch("amdsmi.amdsmi_init")
    shutdown_patch = mock.patch("amdsmi.amdsmi_shut_down")
    with init_patch as init_mock, shutdown_patch as shutdown_mock:
        with amdsmi_ctx():
            init_mock.assert_called_once()
        shutdown_mock.assert_called_once()
def test_get_device_handle():
    """get_device_handle returns the first handle, or None when unavailable."""
    from utils.amdsmi_interface import get_device_handle

    target = "amdsmi.amdsmi_get_processor_handles"

    # Success: the first handle of the reported list is returned.
    with mock.patch(target, return_value=[12345]) as device_handles_mock:
        assert get_device_handle() == 12345
        device_handles_mock.assert_called_once()

    # No devices reported.
    with mock.patch(target, return_value=[]):
        assert get_device_handle() is None

    # The amdsmi query itself fails.
    with mock.patch(target, side_effect=Exception("Mock exception")):
        assert get_device_handle() is None
def test_get_mem_max_clock():
    """get_mem_max_clock returns max_clk, or 0.0 when the query fails."""
    from utils.amdsmi_interface import get_mem_max_clock

    with mock.patch("amdsmi.amdsmi_get_processor_handles", return_value=[12345]):
        with mock.patch(
            "amdsmi.amdsmi_get_clock_info", return_value={"max_clk": 100}
        ) as clock_info_mock:
            assert get_mem_max_clock() == 100
            clock_info_mock.assert_called_once()
            # The device handle must be forwarded as the first argument.
            assert clock_info_mock.call_args[0][0] == 12345

        # A failing query falls back to 0.0.
        with mock.patch(
            "amdsmi.amdsmi_get_clock_info", side_effect=Exception("Mock exception")
        ):
            assert get_mem_max_clock() == 0.0
def test_get_gpu_model():
    """get_gpu_model returns (board, asic, vbios) names, or "N/A" on failure."""
    from utils.amdsmi_interface import get_gpu_model

    with mock.patch("amdsmi.amdsmi_get_processor_handles", return_value=[12345]):
        board_patch = mock.patch(
            "amdsmi.amdsmi_get_gpu_board_info",
            return_value={"product_name": "AMD MIXXX"},
        )
        asic_patch = mock.patch(
            "amdsmi.amdsmi_get_gpu_asic_info",
            return_value={"market_name": "MIXXX"},
        )
        vbios_patch = mock.patch(
            "amdsmi.amdsmi_get_gpu_vbios_info",
            return_value={"name": "mixxx"},
        )
        with board_patch as board, asic_patch as asic, vbios_patch as vbios:
            assert get_gpu_model() == ("AMD MIXXX", "MIXXX", "mixxx")
            # Every source of the model name is queried exactly once.
            board.assert_called_once()
            asic.assert_called_once()
            vbios.assert_called_once()

    with mock.patch(
        "amdsmi.amdsmi_get_gpu_board_info", side_effect=Exception("Mock exception")
    ):
        assert get_gpu_model() == "N/A"
def test_get_gpu_vbios_part_number():
    """get_gpu_vbios_part_number returns the part number, or "N/A" on failure."""
    from utils.amdsmi_interface import get_gpu_vbios_part_number

    with mock.patch("amdsmi.amdsmi_get_processor_handles") as handles_mock:
        handles_mock.return_value = [12345]
        with mock.patch("amdsmi.amdsmi_get_gpu_vbios_info") as vbios_info_mock:
            vbios_info_mock.return_value = {"part_number": "12345-67890"}
            result = get_gpu_vbios_part_number()
            vbios_info_mock.assert_called_once()
            assert result == "12345-67890"

    with mock.patch(
        "amdsmi.amdsmi_get_gpu_vbios_info", side_effect=Exception("Mock exception")
    ):
        result = get_gpu_vbios_part_number()
        assert result == "N/A"
def test_get_gpu_compute_partition():
    """get_gpu_compute_partition returns the partition, or "N/A" on failure."""
    from utils.amdsmi_interface import get_gpu_compute_partition

    target = "amdsmi.amdsmi_get_gpu_compute_partition"

    with mock.patch("amdsmi.amdsmi_get_processor_handles") as handles_mock:
        handles_mock.return_value = [12345]
        with mock.patch(target) as partition_mock:
            partition_mock.return_value = "Mock Partition"
            result = get_gpu_compute_partition()
            partition_mock.assert_called_once()
            assert result == "Mock Partition"

    with mock.patch(target, side_effect=Exception("Mock exception")):
        result = get_gpu_compute_partition()
        assert result == "N/A"
def test_get_gpu_memory_partition():
    """get_gpu_memory_partition returns the partition, or "N/A" on failure."""
    from utils.amdsmi_interface import get_gpu_memory_partition

    target = "amdsmi.amdsmi_get_gpu_memory_partition"

    with mock.patch("amdsmi.amdsmi_get_processor_handles") as handles_mock:
        handles_mock.return_value = [12345]
        with mock.patch(target) as partition_mock:
            partition_mock.return_value = "Mock Memory Partition"
            result = get_gpu_memory_partition()
            partition_mock.assert_called_once()
            assert result == "Mock Memory Partition"

    with mock.patch(target, side_effect=Exception("Mock exception")):
        result = get_gpu_memory_partition()
        assert result == "N/A"