[rocprofiler-compute] Only depend on amdsmi in profile phase (#2044)

* Only depend on amdsmi in profile phase

* amdsmi interface tests should have common prefix for easier testing
Этот коммит содержится в:
vedithal-amd
2025-11-28 11:32:00 -05:00
коммит произвёл GitHub
родитель 68c8e111ae
Коммит 3f2fbc18e9
2 изменённых файлов: 60 добавлений и 20 удалений
+33 -7
Просмотреть файл
@@ -34,18 +34,35 @@ from utils.logger import (
console_warning,
)
sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi")
_amdsmi_module = None
try:
import amdsmi
except ImportError as e:
console_warning(f"Unhandled import error: {e}")
console_error("Failed to import the amdsmi Python library.")
# Ignore undefined name amdsmi since it's dynamically imported
def import_amdsmi_module() -> "amdsmi":  # noqa: F821
    """
    Lazily load the amdsmi Python bindings and return the module.

    The import is deferred to call time so that amdsmi is only a
    dependency of the profile phase. The loaded module is stored in the
    module-level ``_amdsmi_module`` cache, so later calls return it
    without importing again.

    Returns:
        The cached amdsmi module. If the import fails, the failure is
        reported through ``console_warning`` and ``console_error`` and
        the (still unset) cache value is returned.
    """
    global _amdsmi_module

    # Fast path: an earlier call already loaded the module.
    if _amdsmi_module:
        return _amdsmi_module

    # Make the bindings shipped under the ROCm install importable.
    rocm_root = os.getenv("ROCM_PATH", "/opt/rocm")
    sys.path.insert(0, rocm_root + "/share/amd_smi")
    try:
        import amdsmi
    except ImportError as e:
        console_warning(f"Unhandled import error: {e}")
        console_error("Failed to import the amdsmi Python library.")
    else:
        # Only populate the cache once the import has succeeded.
        _amdsmi_module = amdsmi
    return _amdsmi_module
@contextmanager
def amdsmi_ctx() -> Iterator[None]:
"""Context manager to initialize and shutdown amdsmi."""
amdsmi = import_amdsmi_module()
try:
amdsmi.amdsmi_init()
yield
@@ -58,8 +75,10 @@ def amdsmi_ctx() -> Iterator[None]:
console_warning(f"amd-smi shutdown failed: {e}")
def get_device_handle() -> "amdsmi.ProcessorHandle | None":
# Ignore undefined name amdsmi since it's dynamically imported
def get_device_handle() -> "amdsmi.ProcessorHandle | None": # noqa: F821
"""Get the first AMD device handle."""
amdsmi = import_amdsmi_module()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if len(devices) == 0:
@@ -74,6 +93,7 @@ def get_device_handle() -> "amdsmi.ProcessorHandle | None":
def get_mem_max_clock() -> float:
"""Get the maximum memory clock of the device."""
amdsmi = import_amdsmi_module()
try:
return amdsmi.amdsmi_get_clock_info(
get_device_handle(), amdsmi.AmdSmiClkType.GFX
@@ -85,6 +105,7 @@ def get_mem_max_clock() -> float:
def get_gpu_model() -> str:
"""Get the GPU model name."""
amdsmi = import_amdsmi_module()
try:
gpu_model_info = (
# board -> product_name
@@ -103,6 +124,7 @@ def get_gpu_model() -> str:
def get_gpu_vbios_part_number() -> str:
"""Get the GPU VBIOS part number."""
amdsmi = import_amdsmi_module()
try:
vbios_part_number = amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle())[
"part_number"
@@ -116,6 +138,7 @@ def get_gpu_vbios_part_number() -> str:
def get_gpu_compute_partition() -> str:
"""Get the GPU compute partition."""
amdsmi = import_amdsmi_module()
try:
compute_partition = amdsmi.amdsmi_get_gpu_compute_partition(get_device_handle())
console_debug(f"GPU Compute Partition: {compute_partition}")
@@ -127,6 +150,7 @@ def get_gpu_compute_partition() -> str:
def get_gpu_memory_partition() -> str:
"""Get the GPU memory partition."""
amdsmi = import_amdsmi_module()
try:
memory_partition = amdsmi.amdsmi_get_gpu_memory_partition(get_device_handle())
console_debug(f"GPU Memory Partition: {memory_partition}")
@@ -138,6 +162,7 @@ def get_gpu_memory_partition() -> str:
def get_amdgpu_driver_version() -> str:
"""Get the AMDGPU driver version."""
amdsmi = import_amdsmi_module()
try:
driver_info = amdsmi.amdsmi_get_gpu_driver_info(get_device_handle())
driver_version = driver_info["driver_version"]
@@ -150,6 +175,7 @@ def get_amdgpu_driver_version() -> str:
def get_gpu_vram_size() -> int:
"""Get the GPU VRAM size in MB."""
amdsmi = import_amdsmi_module()
try:
vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle())
vram_size = str(int(vram_info["vram_size"]) * 1024) # MB -> KB
+27 -13
Просмотреть файл
@@ -8552,7 +8552,9 @@ def test_list_metrics(binary_handler_analyze_rocprof_compute, capsys):
def test_amdsmi_ctx():
from utils.amdsmi_interface import amdsmi_ctx
from utils.amdsmi_interface import amdsmi_ctx, import_amdsmi_module
_ = import_amdsmi_module()
with mock.patch("amdsmi.amdsmi_init") as amdsmi_init_mock:
with mock.patch("amdsmi.amdsmi_shut_down") as amdsmi_shutdown_mock:
@@ -8561,8 +8563,10 @@ def test_amdsmi_ctx():
amdsmi_shutdown_mock.assert_called_once()
def test_get_device_handle():
from utils.amdsmi_interface import get_device_handle
def test_amdsmi_get_device_handle():
from utils.amdsmi_interface import get_device_handle, import_amdsmi_module
_ = import_amdsmi_module()
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
device_handles_mock.return_value = [12345]
@@ -8576,8 +8580,10 @@ def test_get_device_handle():
assert handle is None
def test_get_mem_max_clock():
from utils.amdsmi_interface import get_mem_max_clock
def test_amdsmi_get_mem_max_clock():
from utils.amdsmi_interface import get_mem_max_clock, import_amdsmi_module
_ = import_amdsmi_module()
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
device_handles_mock.return_value = [12345]
@@ -8588,8 +8594,10 @@ def test_get_mem_max_clock():
assert clk == 100
def test_get_gpu_model():
from utils.amdsmi_interface import get_gpu_model
def test_amdsmi_get_gpu_model():
from utils.amdsmi_interface import get_gpu_model, import_amdsmi_module
_ = import_amdsmi_module()
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
device_handles_mock.return_value = [12345]
@@ -8610,8 +8618,10 @@ def test_get_gpu_model():
assert model == "N/A"
def test_get_gpu_vbios_part_number():
from utils.amdsmi_interface import get_gpu_vbios_part_number
def test_amdsmi_get_gpu_vbios_part_number():
from utils.amdsmi_interface import get_gpu_vbios_part_number, import_amdsmi_module
_ = import_amdsmi_module()
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
device_handles_mock.return_value = [12345]
@@ -8630,8 +8640,10 @@ def test_get_gpu_vbios_part_number():
assert part_number == "N/A"
def test_get_gpu_compute_partition():
from utils.amdsmi_interface import get_gpu_compute_partition
def test_amdsmi_get_gpu_compute_partition():
from utils.amdsmi_interface import get_gpu_compute_partition, import_amdsmi_module
_ = import_amdsmi_module()
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
device_handles_mock.return_value = [12345]
@@ -8651,8 +8663,10 @@ def test_get_gpu_compute_partition():
assert partition == "N/A"
def test_get_gpu_memory_partition():
from utils.amdsmi_interface import get_gpu_memory_partition
def test_amdsmi_get_gpu_memory_partition():
from utils.amdsmi_interface import get_gpu_memory_partition, import_amdsmi_module
_ = import_amdsmi_module()
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
device_handles_mock.return_value = [12345]