From 6540155c9d4f1c69db65adc0ef35fdff8a41b07b Mon Sep 17 00:00:00 2001 From: vedithal-amd Date: Fri, 21 Nov 2025 10:54:25 -0500 Subject: [PATCH] Bugfixes (#1971) * Implement AMDGPU driver info and GPU VRAM attributes in system info. section of analysis report. * Backward compatibility for rocprofiler-sdk avail module path migration * Fix roofline calculation where AI data points are N/A --- projects/rocprofiler-compute/CHANGELOG.md | 2 + .../src/rocprof_compute_soc/soc_base.py | 27 +++++++--- .../src/utils/amdsmi_interface.py | 24 +++++++++ .../src/utils/roofline_calc.py | 8 +-- .../rocprofiler-compute/src/utils/specs.py | 50 +++++++++---------- 5 files changed, 76 insertions(+), 35 deletions(-) diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index c58b948f1d..cbb9da0ac4 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -19,6 +19,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * kernel: Counters are collected in a round robin fashion for unique kernels. * kernel_launch_params: Counters are collected in a round robin fashion for unique kernels having the exact same launch parameters. +* Implement AMDGPU driver info and GPU VRAM attributes in system info. section of analysis report. + ### Changed * Default output format for the underlying ROCprofiler-SDK tool has been changed from ``csv`` to ``rocpd``. 
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py index ad96b19462..ab1e284b7b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py @@ -414,13 +414,28 @@ class OmniSoC_Base: os.environ["ROCPROFILER_METRICS_PATH"] = str( config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs" ) - sys.path.append( - str( - Path(args.rocprofiler_sdk_tool_path).parents[1] - / "python3/site-packages" - ) + + # Backward compatibility support for sdk avail module moved from + # /bin/rocprofv3_avail_module/avail.py to + # /lib/python3/site-packages/rocprofv3/avail.py + new_path = str( + Path(args.rocprofiler_sdk_tool_path).parents[1] / "python3/site-packages" ) - from rocprofv3 import avail + old_path = str(Path(args.rocprofiler_sdk_tool_path).parents[2] / "bin") + try: + sys.path.append(new_path) + from rocprofv3 import avail + except ImportError: + console_debug( + f"Could not import rocprofiler-sdk avail module from {new_path}, " + f"trying {old_path}" + ) + try: + sys.path.remove(new_path) + sys.path.append(old_path) + from rocprofv3_avail_module import avail + except ImportError: + console_error("Failed to import rocprofiler-sdk avail module.") avail.loadLibrary.libname = str( Path(args.rocprofiler_sdk_tool_path).parent / "librocprofv3-list-avail.so" diff --git a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py index 1675e3abe9..3bbae989b9 100644 --- a/projects/rocprofiler-compute/src/utils/amdsmi_interface.py +++ b/projects/rocprofiler-compute/src/utils/amdsmi_interface.py @@ -134,3 +134,27 @@ def get_gpu_memory_partition() -> str: except Exception as e: console_warning(f"Error getting GPU memory partition: {e}") return "N/A" + + +def get_amdgpu_driver_version() -> str: + """Get the AMDGPU driver version.""" + 
try: + driver_info = amdsmi.amdsmi_get_gpu_driver_info(get_device_handle()) + driver_version = driver_info["driver_version"] + console_debug(f"AMDGPU Driver Version: {driver_version}") + return driver_version + except Exception as e: + console_warning(f"Error getting AMDGPU driver version: {e}") + return "N/A" + + +def get_gpu_vram_size() -> str: + """Return the GPU VRAM size in KB as a string ("0" if the query fails).""" + try: + vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle()) + vram_size = str(int(vram_info["vram_size"]) * 1024) # MB -> KB + console_debug(f"GPU VRAM Size: {vram_size} KB") + return vram_size + except Exception as e: + console_warning(f"Error getting GPU VRAM size: {e}") + return "0" diff --git a/projects/rocprofiler-compute/src/utils/roofline_calc.py b/projects/rocprofiler-compute/src/utils/roofline_calc.py index 05eeb25ca2..8825ebbe82 100644 --- a/projects/rocprofiler-compute/src/utils/roofline_calc.py +++ b/projects/rocprofiler-compute/src/utils/roofline_calc.py @@ -412,13 +412,13 @@ def calc_ai_analyze( metric = row.get("Metric", "") value = row.get("Value", 0) if metric == "AI HBM": - ai_hbm = value if value and value != "" else 0 + ai_hbm = value if value and value not in ("", "N/A") else 0 elif metric == "AI L2": - ai_l2 = value if value and value != "" else 0 + ai_l2 = value if value and value not in ("", "N/A") else 0 elif metric == "AI L1": - ai_l1 = value if value and value != "" else 0 + ai_l1 = value if value and value not in ("", "N/A") else 0 elif metric == "Performance (GFLOPs)": - performance = value if value and value != "" else 0 + performance = value if value and value not in ("", "N/A") else 0 console_debug( "roofline", diff --git a/projects/rocprofiler-compute/src/utils/specs.py b/projects/rocprofiler-compute/src/utils/specs.py index 60bb102fc3..b9202deb73 100644 --- a/projects/rocprofiler-compute/src/utils/specs.py +++ b/projects/rocprofiler-compute/src/utils/specs.py @@ -43,9 +43,11 @@ import pandas as pd import config from utils.amdsmi_interface 
import ( amdsmi_ctx, + get_amdgpu_driver_version, get_gpu_compute_partition, get_gpu_memory_partition, get_gpu_vbios_part_number, + get_gpu_vram_size, ) from utils.logger import ( console_debug, @@ -182,25 +184,26 @@ def generate_machine_specs( soc_info = extract_soc_info() # Combine all specifications - specs = MachineSpecs( - version=specs_version, - timestamp=timestamp, - rocminfo_lines=soc_info["rocminfo_lines"], - hostname=socket.gethostname(), - cpu_model=machine_info["cpu_model"], - sbios=machine_info["sbios"], - linux_kernel_version=machine_info["linux_kernel_version"], - amd_gpu_kernel_version="", - cpu_memory=machine_info["cpu_memory"], - gpu_memory="", - linux_distro=machine_info["linux_distro"], - rocm_version=get_rocm_ver().strip(), - vbios=gpu_info["vbios"], - compute_partition=gpu_info["compute_partition"], - memory_partition=gpu_info["memory_partition"], - gpu_arch=soc_info["gpu_arch"], - gpu_chip_id=soc_info["gpu_chip_id"], - ) + with amdsmi_ctx(): + specs = MachineSpecs( + version=specs_version, + timestamp=timestamp, + rocminfo_lines=soc_info["rocminfo_lines"], + hostname=socket.gethostname(), + cpu_model=machine_info["cpu_model"], + sbios=machine_info["sbios"], + linux_kernel_version=machine_info["linux_kernel_version"], + amd_gpu_kernel_version=get_amdgpu_driver_version(), + cpu_memory=machine_info["cpu_memory"], + gpu_memory=get_gpu_vram_size(), + linux_distro=machine_info["linux_distro"], + rocm_version=get_rocm_ver().strip(), + vbios=gpu_info["vbios"], + compute_partition=gpu_info["compute_partition"], + memory_partition=gpu_info["memory_partition"], + gpu_arch=soc_info["gpu_arch"], + gpu_chip_id=soc_info["gpu_chip_id"], + ) # Load above SoC specs via module import try: @@ -436,10 +439,7 @@ class MachineSpecs: amd_gpu_kernel_version: Optional[str] = field( default=None, metadata={ - "doc": ( - "[RESERVED] The version of the AMDGPU driver installed on the machine. " - "Unimplemented." 
- ), + "doc": ("The version of the AMDGPU driver installed on the machine."), "name": "AMD GPU Kernel Version", "show_in_table": True, }, @@ -457,8 +457,8 @@ class MachineSpecs: default=None, metadata={ "doc": ( - "[RESERVED] The total amount of memory available to accelerators/GPUs " - "in the system. Unimplemented." + "The total amount of memory available to accelerators/GPUs " + "in the system." ), "unit": "KB", "name": "GPU Memory",