[rocprofiler-compute] Only depend on amdsmi in profile phase (#2044)
* Only depend on amdsmi in profile phase * amdsmi interface tests should have common prefix for easier testing
This commit is contained in:
committed by
GitHub
orang tua
68c8e111ae
melakukan
3f2fbc18e9
@@ -34,18 +34,35 @@ from utils.logger import (
|
||||
console_warning,
|
||||
)
|
||||
|
||||
sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi")
|
||||
_amdsmi_module = None
|
||||
|
||||
try:
|
||||
import amdsmi
|
||||
except ImportError as e:
|
||||
console_warning(f"Unhandled import error: {e}")
|
||||
console_error("Failed to import the amdsmi Python library.")
|
||||
|
||||
# Ignore undefined name amdsmi since it's dynamically imported
|
||||
def import_amdsmi_module() -> "amdsmi":  # noqa: F821
    """
    Lazily load and return the amdsmi Python module.

    The import is deferred to call time so that amdsmi is only a
    profile-time dependency rather than an import-time one. The loaded
    module is stored in the module-level ``_amdsmi_module`` cache so that
    later calls skip the import entirely.
    """
    global _amdsmi_module

    # Fast path: an earlier call already imported and cached the module.
    if _amdsmi_module:
        return _amdsmi_module

    # Put the amd_smi share directory (under ROCM_PATH, defaulting to
    # /opt/rocm) at the front of the search path before importing.
    rocm_root = os.getenv("ROCM_PATH", "/opt/rocm")
    sys.path.insert(0, rocm_root + "/share/amd_smi")

    try:
        import amdsmi
    except ImportError as e:
        console_warning(f"Unhandled import error: {e}")
        console_error("Failed to import the amdsmi Python library.")
    else:
        # Only cache on a successful import.
        _amdsmi_module = amdsmi

    return _amdsmi_module
|
||||
|
||||
|
||||
@contextmanager
|
||||
def amdsmi_ctx() -> Iterator[None]:
|
||||
"""Context manager to initialize and shutdown amdsmi."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
amdsmi.amdsmi_init()
|
||||
yield
|
||||
@@ -58,8 +75,10 @@ def amdsmi_ctx() -> Iterator[None]:
|
||||
console_warning(f"amd-smi shutdown failed: {e}")
|
||||
|
||||
|
||||
def get_device_handle() -> "amdsmi.ProcessorHandle | None":
|
||||
# Ignore undefined name amdsmi since it's dynamically imported
|
||||
def get_device_handle() -> "amdsmi.ProcessorHandle | None": # noqa: F821
|
||||
"""Get the first AMD device handle."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
devices = amdsmi.amdsmi_get_processor_handles()
|
||||
if len(devices) == 0:
|
||||
@@ -74,6 +93,7 @@ def get_device_handle() -> "amdsmi.ProcessorHandle | None":
|
||||
|
||||
def get_mem_max_clock() -> float:
|
||||
"""Get the maximum memory clock of the device."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
return amdsmi.amdsmi_get_clock_info(
|
||||
get_device_handle(), amdsmi.AmdSmiClkType.GFX
|
||||
@@ -85,6 +105,7 @@ def get_mem_max_clock() -> float:
|
||||
|
||||
def get_gpu_model() -> str:
|
||||
"""Get the GPU model name."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
gpu_model_info = (
|
||||
# board -> product_name
|
||||
@@ -103,6 +124,7 @@ def get_gpu_model() -> str:
|
||||
|
||||
def get_gpu_vbios_part_number() -> str:
|
||||
"""Get the GPU VBIOS part number."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
vbios_part_number = amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle())[
|
||||
"part_number"
|
||||
@@ -116,6 +138,7 @@ def get_gpu_vbios_part_number() -> str:
|
||||
|
||||
def get_gpu_compute_partition() -> str:
|
||||
"""Get the GPU compute partition."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
compute_partition = amdsmi.amdsmi_get_gpu_compute_partition(get_device_handle())
|
||||
console_debug(f"GPU Compute Partition: {compute_partition}")
|
||||
@@ -127,6 +150,7 @@ def get_gpu_compute_partition() -> str:
|
||||
|
||||
def get_gpu_memory_partition() -> str:
|
||||
"""Get the GPU memory partition."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
memory_partition = amdsmi.amdsmi_get_gpu_memory_partition(get_device_handle())
|
||||
console_debug(f"GPU Memory Partition: {memory_partition}")
|
||||
@@ -138,6 +162,7 @@ def get_gpu_memory_partition() -> str:
|
||||
|
||||
def get_amdgpu_driver_version() -> str:
|
||||
"""Get the AMDGPU driver version."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
driver_info = amdsmi.amdsmi_get_gpu_driver_info(get_device_handle())
|
||||
driver_version = driver_info["driver_version"]
|
||||
@@ -150,6 +175,7 @@ def get_amdgpu_driver_version() -> str:
|
||||
|
||||
def get_gpu_vram_size() -> int:
|
||||
"""Get the GPU VRAM size in MB."""
|
||||
amdsmi = import_amdsmi_module()
|
||||
try:
|
||||
vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle())
|
||||
vram_size = str(int(vram_info["vram_size"]) * 1024) # MB -> KB
|
||||
|
||||
@@ -8552,7 +8552,9 @@ def test_list_metrics(binary_handler_analyze_rocprof_compute, capsys):
|
||||
|
||||
|
||||
def test_amdsmi_ctx():
|
||||
from utils.amdsmi_interface import amdsmi_ctx
|
||||
from utils.amdsmi_interface import amdsmi_ctx, import_amdsmi_module
|
||||
|
||||
_ = import_amdsmi_module()
|
||||
|
||||
with mock.patch("amdsmi.amdsmi_init") as amdsmi_init_mock:
|
||||
with mock.patch("amdsmi.amdsmi_shut_down") as amdsmi_shutdown_mock:
|
||||
@@ -8561,8 +8563,10 @@ def test_amdsmi_ctx():
|
||||
amdsmi_shutdown_mock.assert_called_once()
|
||||
|
||||
|
||||
def test_get_device_handle():
|
||||
from utils.amdsmi_interface import get_device_handle
|
||||
def test_amdsmi_get_device_handle():
|
||||
from utils.amdsmi_interface import get_device_handle, import_amdsmi_module
|
||||
|
||||
_ = import_amdsmi_module()
|
||||
|
||||
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
|
||||
device_handles_mock.return_value = [12345]
|
||||
@@ -8576,8 +8580,10 @@ def test_get_device_handle():
|
||||
assert handle is None
|
||||
|
||||
|
||||
def test_get_mem_max_clock():
|
||||
from utils.amdsmi_interface import get_mem_max_clock
|
||||
def test_amdsmi_get_mem_max_clock():
|
||||
from utils.amdsmi_interface import get_mem_max_clock, import_amdsmi_module
|
||||
|
||||
_ = import_amdsmi_module()
|
||||
|
||||
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
|
||||
device_handles_mock.return_value = [12345]
|
||||
@@ -8588,8 +8594,10 @@ def test_get_mem_max_clock():
|
||||
assert clk == 100
|
||||
|
||||
|
||||
def test_get_gpu_model():
|
||||
from utils.amdsmi_interface import get_gpu_model
|
||||
def test_amdsmi_get_gpu_model():
|
||||
from utils.amdsmi_interface import get_gpu_model, import_amdsmi_module
|
||||
|
||||
_ = import_amdsmi_module()
|
||||
|
||||
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
|
||||
device_handles_mock.return_value = [12345]
|
||||
@@ -8610,8 +8618,10 @@ def test_get_gpu_model():
|
||||
assert model == "N/A"
|
||||
|
||||
|
||||
def test_get_gpu_vbios_part_number():
|
||||
from utils.amdsmi_interface import get_gpu_vbios_part_number
|
||||
def test_amdsmi_get_gpu_vbios_part_number():
|
||||
from utils.amdsmi_interface import get_gpu_vbios_part_number, import_amdsmi_module
|
||||
|
||||
_ = import_amdsmi_module()
|
||||
|
||||
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
|
||||
device_handles_mock.return_value = [12345]
|
||||
@@ -8630,8 +8640,10 @@ def test_get_gpu_vbios_part_number():
|
||||
assert part_number == "N/A"
|
||||
|
||||
|
||||
def test_get_gpu_compute_partition():
|
||||
from utils.amdsmi_interface import get_gpu_compute_partition
|
||||
def test_amdsmi_get_gpu_compute_partition():
|
||||
from utils.amdsmi_interface import get_gpu_compute_partition, import_amdsmi_module
|
||||
|
||||
_ = import_amdsmi_module()
|
||||
|
||||
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
|
||||
device_handles_mock.return_value = [12345]
|
||||
@@ -8651,8 +8663,10 @@ def test_get_gpu_compute_partition():
|
||||
assert partition == "N/A"
|
||||
|
||||
|
||||
def test_get_gpu_memory_partition():
|
||||
from utils.amdsmi_interface import get_gpu_memory_partition
|
||||
def test_amdsmi_get_gpu_memory_partition():
|
||||
from utils.amdsmi_interface import get_gpu_memory_partition, import_amdsmi_module
|
||||
|
||||
_ = import_amdsmi_module()
|
||||
|
||||
with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock:
|
||||
device_handles_mock.return_value = [12345]
|
||||
|
||||
Reference in New Issue
Block a user