Bugfixes (#1971)
* Implement AMDGPU driver info and GPU VRAM attributes in the system info section of the analysis report.
* Add backward compatibility for the rocprofiler-sdk avail module path migration.
* Fix roofline calculation when AI data points are N/A.
This commit is contained in:
gecommit door
GitHub
bovenliggende
a2288eb50b
commit
6540155c9d
@@ -19,6 +19,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
|
||||
* kernel: Counters are collected in a round robin fashion for unique kernels.
|
||||
* kernel_launch_params: Counters are collected in a round robin fashion for unique kernels having the exact same launch parameters.
|
||||
|
||||
* Implement AMDGPU driver info and GPU VRAM attributes in the system info section of the analysis report.
|
||||
|
||||
### Changed
|
||||
|
||||
* Default output format for the underlying ROCprofiler-SDK tool has been changed from ``csv`` to ``rocpd``.
|
||||
|
||||
@@ -414,13 +414,28 @@ class OmniSoC_Base:
|
||||
os.environ["ROCPROFILER_METRICS_PATH"] = str(
|
||||
config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs"
|
||||
)
|
||||
sys.path.append(
|
||||
str(
|
||||
Path(args.rocprofiler_sdk_tool_path).parents[1]
|
||||
/ "python3/site-packages"
|
||||
)
|
||||
|
||||
# Backward compatibility support for sdk avail module moved from
|
||||
# <rocm_path>/bin/rocprofv3_avail_module/avail.py to
|
||||
# <rocm_path>/lib/python3/site-packages/rocprofv3/avail.py
|
||||
new_path = str(
|
||||
Path(args.rocprofiler_sdk_tool_path).parents[1] / "python3/site-packages"
|
||||
)
|
||||
from rocprofv3 import avail
|
||||
old_path = str(Path(args.rocprofiler_sdk_tool_path).parents[2] / "bin")
|
||||
try:
|
||||
sys.path.append(new_path)
|
||||
from rocprofv3 import avail
|
||||
except ImportError:
|
||||
console_debug(
|
||||
f"Could not import rocprofiler-sdk avail module from {new_path}, "
|
||||
f"trying {old_path}"
|
||||
)
|
||||
try:
|
||||
sys.path.remove(new_path)
|
||||
sys.path.append(old_path)
|
||||
from rocprofv3_avail_module import avail
|
||||
except ImportError:
|
||||
console_error("Failed to import rocprofiler-sdk avail module.")
|
||||
|
||||
avail.loadLibrary.libname = str(
|
||||
Path(args.rocprofiler_sdk_tool_path).parent / "librocprofv3-list-avail.so"
|
||||
|
||||
@@ -134,3 +134,27 @@ def get_gpu_memory_partition() -> str:
|
||||
except Exception as e:
|
||||
console_warning(f"Error getting GPU memory partition: {e}")
|
||||
return "N/A"
|
||||
|
||||
|
||||
def get_amdgpu_driver_version() -> str:
    """Return the AMDGPU driver version reported by amdsmi.

    Looks up the ``"driver_version"`` entry of the driver info for the handle
    returned by ``get_device_handle()`` and logs it at debug level.

    Returns:
        The driver version value on success, or ``"N/A"`` if any exception is
        raised along the way (a warning is logged in that case).
    """
    try:
        handle = get_device_handle()
        version = amdsmi.amdsmi_get_gpu_driver_info(handle)["driver_version"]
        console_debug(f"AMDGPU Driver Version: {version}")
    except Exception as err:
        # Best-effort query: report the problem and fall back to a placeholder.
        console_warning(f"Error getting AMDGPU driver version: {err}")
        return "N/A"
    return version
|
||||
|
||||
|
||||
def get_gpu_vram_size() -> "str | int":
    """Get the GPU VRAM size in KB.

    Reads ``vram_size`` from ``amdsmi.amdsmi_get_gpu_vram_info`` for the handle
    returned by ``get_device_handle()`` and scales it by 1024.

    Returns:
        The scaled size as a decimal string on success, or the integer ``0``
        if any exception is raised (a warning is logged in that case).
    """
    try:
        vram_info = amdsmi.amdsmi_get_gpu_vram_info(get_device_handle())
        # Presumably amdsmi reports vram_size in MB (per the conversion below);
        # the result is therefore in KB, not MB.
        vram_size = str(int(vram_info["vram_size"]) * 1024)  # MB -> KB
        console_debug(f"GPU VRAM Size: {vram_size} KB")
        return vram_size
    except Exception as e:
        console_warning(f"Error getting GPU VRAM size: {e}")
        # NOTE(review): the failure value is an int while the success value is
        # a str; kept as-is so existing callers see the same value. Consider
        # unifying the return type once all callers have been checked.
        return 0
|
||||
|
||||
@@ -412,13 +412,13 @@ def calc_ai_analyze(
|
||||
metric = row.get("Metric", "")
|
||||
value = row.get("Value", 0)
|
||||
if metric == "AI HBM":
|
||||
ai_hbm = value if value and value != "" else 0
|
||||
ai_hbm = value if value and value not in ("", "N/A") else 0
|
||||
elif metric == "AI L2":
|
||||
ai_l2 = value if value and value != "" else 0
|
||||
ai_l2 = value if value and value not in ("", "N/A") else 0
|
||||
elif metric == "AI L1":
|
||||
ai_l1 = value if value and value != "" else 0
|
||||
ai_l1 = value if value and value not in ("", "N/A") else 0
|
||||
elif metric == "Performance (GFLOPs)":
|
||||
performance = value if value and value != "" else 0
|
||||
performance = value if value and value not in ("", "N/A") else 0
|
||||
|
||||
console_debug(
|
||||
"roofline",
|
||||
|
||||
@@ -43,9 +43,11 @@ import pandas as pd
|
||||
import config
|
||||
from utils.amdsmi_interface import (
|
||||
amdsmi_ctx,
|
||||
get_amdgpu_driver_version,
|
||||
get_gpu_compute_partition,
|
||||
get_gpu_memory_partition,
|
||||
get_gpu_vbios_part_number,
|
||||
get_gpu_vram_size,
|
||||
)
|
||||
from utils.logger import (
|
||||
console_debug,
|
||||
@@ -182,25 +184,26 @@ def generate_machine_specs(
|
||||
soc_info = extract_soc_info()
|
||||
|
||||
# Combine all specifications
|
||||
specs = MachineSpecs(
|
||||
version=specs_version,
|
||||
timestamp=timestamp,
|
||||
rocminfo_lines=soc_info["rocminfo_lines"],
|
||||
hostname=socket.gethostname(),
|
||||
cpu_model=machine_info["cpu_model"],
|
||||
sbios=machine_info["sbios"],
|
||||
linux_kernel_version=machine_info["linux_kernel_version"],
|
||||
amd_gpu_kernel_version="",
|
||||
cpu_memory=machine_info["cpu_memory"],
|
||||
gpu_memory="",
|
||||
linux_distro=machine_info["linux_distro"],
|
||||
rocm_version=get_rocm_ver().strip(),
|
||||
vbios=gpu_info["vbios"],
|
||||
compute_partition=gpu_info["compute_partition"],
|
||||
memory_partition=gpu_info["memory_partition"],
|
||||
gpu_arch=soc_info["gpu_arch"],
|
||||
gpu_chip_id=soc_info["gpu_chip_id"],
|
||||
)
|
||||
with amdsmi_ctx():
|
||||
specs = MachineSpecs(
|
||||
version=specs_version,
|
||||
timestamp=timestamp,
|
||||
rocminfo_lines=soc_info["rocminfo_lines"],
|
||||
hostname=socket.gethostname(),
|
||||
cpu_model=machine_info["cpu_model"],
|
||||
sbios=machine_info["sbios"],
|
||||
linux_kernel_version=machine_info["linux_kernel_version"],
|
||||
amd_gpu_kernel_version=get_amdgpu_driver_version(),
|
||||
cpu_memory=machine_info["cpu_memory"],
|
||||
gpu_memory=get_gpu_vram_size(),
|
||||
linux_distro=machine_info["linux_distro"],
|
||||
rocm_version=get_rocm_ver().strip(),
|
||||
vbios=gpu_info["vbios"],
|
||||
compute_partition=gpu_info["compute_partition"],
|
||||
memory_partition=gpu_info["memory_partition"],
|
||||
gpu_arch=soc_info["gpu_arch"],
|
||||
gpu_chip_id=soc_info["gpu_chip_id"],
|
||||
)
|
||||
|
||||
# Load above SoC specs via module import
|
||||
try:
|
||||
@@ -436,10 +439,7 @@ class MachineSpecs:
|
||||
amd_gpu_kernel_version: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"doc": (
|
||||
"[RESERVED] The version of the AMDGPU driver installed on the machine. "
|
||||
"Unimplemented."
|
||||
),
|
||||
"doc": ("The version of the AMDGPU driver installed on the machine."),
|
||||
"name": "AMD GPU Kernel Version",
|
||||
"show_in_table": True,
|
||||
},
|
||||
@@ -457,8 +457,8 @@ class MachineSpecs:
|
||||
default=None,
|
||||
metadata={
|
||||
"doc": (
|
||||
"[RESERVED] The total amount of memory available to accelerators/GPUs "
|
||||
"in the system. Unimplemented."
|
||||
"The total amount of memory available to accelerators/GPUs "
|
||||
"in the system."
|
||||
),
|
||||
"unit": "KB",
|
||||
"name": "GPU Memory",
|
||||
|
||||
Verwijs in nieuw issue
Block a user