From 3f2fbc18e936c2cd39149fd0409dd439f9187647 Mon Sep 17 00:00:00 2001 From: vedithal-amd Date: Fri, 28 Nov 2025 11:32:00 -0500 Subject: [PATCH] [rocprofiler-compute] Only depend on amdsmi in profile phase (#2044) * Only depend on amdsmi in profile phase * amdsmi interface tests should have common prefix for easier testing --- .../src/utils/amdsmi_interface.py | 40 +++++++++++++++---- .../rocprofiler-compute/tests/test_utils.py | 40 +++++++++++++------ 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py index 3bbae989b9..fa915734d6 100644 --- a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py +++ b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py @@ -34,18 +34,35 @@ from utils.logger import ( console_warning, ) -sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi") +_amdsmi_module = None -try: - import amdsmi -except ImportError as e: - console_warning(f"Unhandled import error: {e}") - console_error("Failed to import the amdsmi Python library.") + +# Ignore undefined name amdsmi since it's dynamically imported +def import_amdsmi_module() -> "amdsmi": # noqa: F821 + """ + Dynamically import the amdsmi module because we only + want profile time dependency on amdsmi. + Uses global cache to avoid repeated imports. 
+ """ + global _amdsmi_module + + if not _amdsmi_module: + sys.path.insert(0, os.getenv("ROCM_PATH", "/opt/rocm") + "/share/amd_smi") + try: + import amdsmi + + _amdsmi_module = amdsmi + except ImportError as e: + console_warning(f"Unhandled import error: {e}") + console_error("Failed to import the amdsmi Python library.") + + return _amdsmi_module @contextmanager def amdsmi_ctx() -> Iterator[None]: """Context manager to initialize and shutdown amdsmi.""" + amdsmi = import_amdsmi_module() try: amdsmi.amdsmi_init() yield @@ -58,8 +75,10 @@ def amdsmi_ctx() -> Iterator[None]: console_warning(f"amd-smi shutdown failed: {e}") -def get_device_handle() -> "amdsmi.ProcessorHandle | None": +# Ignore undefined name amdsmi since it's dynamically imported +def get_device_handle() -> "amdsmi.ProcessorHandle | None": # noqa: F821 """Get the first AMD device handle.""" + amdsmi = import_amdsmi_module() try: devices = amdsmi.amdsmi_get_processor_handles() if len(devices) == 0: @@ -74,6 +93,7 @@ def get_device_handle() -> "amdsmi.ProcessorHandle | None": def get_mem_max_clock() -> float: """Get the maximum memory clock of the device.""" + amdsmi = import_amdsmi_module() try: return amdsmi.amdsmi_get_clock_info( get_device_handle(), amdsmi.AmdSmiClkType.GFX @@ -85,6 +105,7 @@ def get_mem_max_clock() -> float: def get_gpu_model() -> str: """Get the GPU model name.""" + amdsmi = import_amdsmi_module() try: gpu_model_info = ( # board -> product_name @@ -103,6 +124,7 @@ def get_gpu_model() -> str: def get_gpu_vbios_part_number() -> str: """Get the GPU VBIOS part number.""" + amdsmi = import_amdsmi_module() try: vbios_part_number = amdsmi.amdsmi_get_gpu_vbios_info(get_device_handle())[ "part_number" @@ -116,6 +138,7 @@ def get_gpu_vbios_part_number() -> str: def get_gpu_compute_partition() -> str: """Get the GPU compute partition.""" + amdsmi = import_amdsmi_module() try: compute_partition = amdsmi.amdsmi_get_gpu_compute_partition(get_device_handle()) console_debug(f"GPU Compute 
Partition: {compute_partition}") @@ -127,6 +150,7 @@ def get_gpu_compute_partition() -> str: def get_gpu_memory_partition() -> str: """Get the GPU memory partition.""" + amdsmi = import_amdsmi_module() try: memory_partition = amdsmi.amdsmi_get_gpu_memory_partition(get_device_handle()) console_debug(f"GPU Memory Partition: {memory_partition}") @@ -138,6 +162,7 @@ def get_gpu_memory_partition() -> str: def get_amdgpu_driver_version() -> str: """Get the AMDGPU driver version.""" + amdsmi = import_amdsmi_module() try: driver_info = amdsmi.amdsmi_get_gpu_driver_info(get_device_handle()) driver_version = driver_info["driver_version"] @@ -150,6 +175,7 @@ def get_amdgpu_driver_version() -> str: def get_gpu_vram_size() -> int: """Get the GPU VRAM size in MB.""" + amdsmi = import_amdsmi_module() try: vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle()) vram_size = str(int(vram_info["vram_size"]) * 1024) # MB -> KB diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py index 01b5c8bdd8..db06ac99ad 100644 --- a/projects/rocprofiler-compute/tests/test_utils.py +++ b/projects/rocprofiler-compute/tests/test_utils.py @@ -8552,7 +8552,9 @@ def test_list_metrics(binary_handler_analyze_rocprof_compute, capsys): def test_amdsmi_ctx(): - from utils.amdsmi_interface import amdsmi_ctx + from utils.amdsmi_interface import amdsmi_ctx, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_init") as amdsmi_init_mock: with mock.patch("amdsmi.amdsmi_shut_down") as amdsmi_shutdown_mock: @@ -8561,8 +8563,10 @@ def test_amdsmi_ctx(): amdsmi_shutdown_mock.assert_called_once() -def test_get_device_handle(): - from utils.amdsmi_interface import get_device_handle +def test_amdsmi_get_device_handle(): + from utils.amdsmi_interface import get_device_handle, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: 
device_handles_mock.return_value = [12345] @@ -8576,8 +8580,10 @@ def test_get_device_handle(): assert handle is None -def test_get_mem_max_clock(): - from utils.amdsmi_interface import get_mem_max_clock +def test_amdsmi_get_mem_max_clock(): + from utils.amdsmi_interface import get_mem_max_clock, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: device_handles_mock.return_value = [12345] @@ -8588,8 +8594,10 @@ def test_get_mem_max_clock(): assert clk == 100 -def test_get_gpu_model(): - from utils.amdsmi_interface import get_gpu_model +def test_amdsmi_get_gpu_model(): + from utils.amdsmi_interface import get_gpu_model, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: device_handles_mock.return_value = [12345] @@ -8610,8 +8618,10 @@ def test_get_gpu_model(): assert model == "N/A" -def test_get_gpu_vbios_part_number(): - from utils.amdsmi_interface import get_gpu_vbios_part_number +def test_amdsmi_get_gpu_vbios_part_number(): + from utils.amdsmi_interface import get_gpu_vbios_part_number, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: device_handles_mock.return_value = [12345] @@ -8630,8 +8640,10 @@ def test_get_gpu_vbios_part_number(): assert part_number == "N/A" -def test_get_gpu_compute_partition(): - from utils.amdsmi_interface import get_gpu_compute_partition +def test_amdsmi_get_gpu_compute_partition(): + from utils.amdsmi_interface import get_gpu_compute_partition, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: device_handles_mock.return_value = [12345] @@ -8651,8 +8663,10 @@ def test_get_gpu_compute_partition(): assert partition == "N/A" -def test_get_gpu_memory_partition(): - from utils.amdsmi_interface import get_gpu_memory_partition 
+def test_amdsmi_get_gpu_memory_partition(): + from utils.amdsmi_interface import get_gpu_memory_partition, import_amdsmi_module + + _ = import_amdsmi_module() with mock.patch("amdsmi.amdsmi_get_processor_handles") as device_handles_mock: device_handles_mock.return_value = [12345]