* Implement AMDGPU driver info and GPU VRAM attributes in the system info
  section of the analysis report.

* Backward compatibility for rocprofiler-sdk avail module path migration

* Fix roofline calculation where AI data points are N/A
This commit is contained in:
vedithal-amd
2025-11-21 10:54:25 -05:00
gecommit door GitHub
bovenliggende a2288eb50b
commit 6540155c9d
5 gewijzigde bestanden met toevoegingen van 76 en 35 verwijderingen
@@ -19,6 +19,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* kernel: Counters are collected in a round robin fashion for unique kernels.
* kernel_launch_params: Counters are collected in a round robin fashion for unique kernels having the exact same launch parameters.
* Implement AMDGPU driver info and GPU VRAM attributes in the system info section of the analysis report.
### Changed
* Default output format for the underlying ROCprofiler-SDK tool has been changed from ``csv`` to ``rocpd``.
@@ -414,13 +414,28 @@ class OmniSoC_Base:
os.environ["ROCPROFILER_METRICS_PATH"] = str(
config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs"
)
sys.path.append(
str(
Path(args.rocprofiler_sdk_tool_path).parents[1]
/ "python3/site-packages"
)
# Backward compatibility support for sdk avail module moved from
# <rocm_path>/bin/rocprofv3_avail_module/avail.py to
# <rocm_path>/lib/python3/site-packages/rocprofv3/avail.py
new_path = str(
Path(args.rocprofiler_sdk_tool_path).parents[1] / "python3/site-packages"
)
from rocprofv3 import avail
old_path = str(Path(args.rocprofiler_sdk_tool_path).parents[2] / "bin")
try:
sys.path.append(new_path)
from rocprofv3 import avail
except ImportError:
console_debug(
f"Could not import rocprofiler-sdk avail module from {new_path}, "
f"trying {old_path}"
)
try:
sys.path.remove(new_path)
sys.path.append(old_path)
from rocprofv3_avail_module import avail
except ImportError:
console_error("Failed to import rocprofiler-sdk avail module.")
avail.loadLibrary.libname = str(
Path(args.rocprofiler_sdk_tool_path).parent / "librocprofv3-list-avail.so"
@@ -134,3 +134,27 @@ def get_gpu_memory_partition() -> str:
except Exception as e:
console_warning(f"Error getting GPU memory partition: {e}")
return "N/A"
def get_amdgpu_driver_version() -> str:
    """Return the AMDGPU driver version reported by AMD SMI.

    Looks up the driver info for the handle returned by
    ``get_device_handle()`` and reads its ``"driver_version"`` entry.

    Returns:
        The driver version on success, or ``"N/A"`` if any step of the
        lookup (including the debug logging) raises; the failure is
        reported through ``console_warning``.
    """
    try:
        # Resolve the query function before obtaining the handle so the
        # evaluation order matches a direct call expression.
        query_driver_info = amdsmi.amdsmi_get_gpu_driver_info
        version = query_driver_info(get_device_handle())["driver_version"]
        console_debug(f"AMDGPU Driver Version: {version}")
    except Exception as err:
        console_warning(f"Error getting AMDGPU driver version: {err}")
        return "N/A"
    else:
        return version
def get_gpu_vram_size() -> "str | int":
    """Get the GPU VRAM size in KB.

    Reads the ``"vram_size"`` entry of the AMD SMI VRAM info for the handle
    returned by ``get_device_handle()`` and scales it by 1024. The inline
    conversion note treats the AMD SMI value as MB, so the result is in KB.

    Returns:
        The VRAM size in KB as a decimal string on success, or the integer
        ``0`` if the lookup fails; the failure is reported through
        ``console_warning``.
    """
    try:
        vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle())
        vram_size = str(int(vram_info["vram_size"]) * 1024)  # MB -> KB
        # The value was just converted, so it must be logged as KB, not MB.
        console_debug(f"GPU VRAM Size: {vram_size} KB")
        return vram_size
    except Exception as e:
        console_warning(f"Error getting GPU VRAM size: {e}")
        # Failure sentinel kept as the integer 0 so existing callers see the
        # same value as before.
        return 0
@@ -412,13 +412,13 @@ def calc_ai_analyze(
metric = row.get("Metric", "")
value = row.get("Value", 0)
if metric == "AI HBM":
ai_hbm = value if value and value != "" else 0
ai_hbm = value if value and value not in ("", "N/A") else 0
elif metric == "AI L2":
ai_l2 = value if value and value != "" else 0
ai_l2 = value if value and value not in ("", "N/A") else 0
elif metric == "AI L1":
ai_l1 = value if value and value != "" else 0
ai_l1 = value if value and value not in ("", "N/A") else 0
elif metric == "Performance (GFLOPs)":
performance = value if value and value != "" else 0
performance = value if value and value not in ("", "N/A") else 0
console_debug(
"roofline",
@@ -43,9 +43,11 @@ import pandas as pd
import config
from utils.amdsmi_interface import (
amdsmi_ctx,
get_amdgpu_driver_version,
get_gpu_compute_partition,
get_gpu_memory_partition,
get_gpu_vbios_part_number,
get_gpu_vram_size,
)
from utils.logger import (
console_debug,
@@ -182,25 +184,26 @@ def generate_machine_specs(
soc_info = extract_soc_info()
# Combine all specifications
specs = MachineSpecs(
version=specs_version,
timestamp=timestamp,
rocminfo_lines=soc_info["rocminfo_lines"],
hostname=socket.gethostname(),
cpu_model=machine_info["cpu_model"],
sbios=machine_info["sbios"],
linux_kernel_version=machine_info["linux_kernel_version"],
amd_gpu_kernel_version="",
cpu_memory=machine_info["cpu_memory"],
gpu_memory="",
linux_distro=machine_info["linux_distro"],
rocm_version=get_rocm_ver().strip(),
vbios=gpu_info["vbios"],
compute_partition=gpu_info["compute_partition"],
memory_partition=gpu_info["memory_partition"],
gpu_arch=soc_info["gpu_arch"],
gpu_chip_id=soc_info["gpu_chip_id"],
)
with amdsmi_ctx():
specs = MachineSpecs(
version=specs_version,
timestamp=timestamp,
rocminfo_lines=soc_info["rocminfo_lines"],
hostname=socket.gethostname(),
cpu_model=machine_info["cpu_model"],
sbios=machine_info["sbios"],
linux_kernel_version=machine_info["linux_kernel_version"],
amd_gpu_kernel_version=get_amdgpu_driver_version(),
cpu_memory=machine_info["cpu_memory"],
gpu_memory=get_gpu_vram_size(),
linux_distro=machine_info["linux_distro"],
rocm_version=get_rocm_ver().strip(),
vbios=gpu_info["vbios"],
compute_partition=gpu_info["compute_partition"],
memory_partition=gpu_info["memory_partition"],
gpu_arch=soc_info["gpu_arch"],
gpu_chip_id=soc_info["gpu_chip_id"],
)
# Load above SoC specs via module import
try:
@@ -436,10 +439,7 @@ class MachineSpecs:
amd_gpu_kernel_version: Optional[str] = field(
default=None,
metadata={
"doc": (
"[RESERVED] The version of the AMDGPU driver installed on the machine. "
"Unimplemented."
),
"doc": ("The version of the AMDGPU driver installed on the machine."),
"name": "AMD GPU Kernel Version",
"show_in_table": True,
},
@@ -457,8 +457,8 @@ class MachineSpecs:
default=None,
metadata={
"doc": (
"[RESERVED] The total amount of memory available to accelerators/GPUs "
"in the system. Unimplemented."
"The total amount of memory available to accelerators/GPUs "
"in the system."
),
"unit": "KB",
"name": "GPU Memory",