[rocprofiler-compute] Improve native tool discovery and partition detection (#2630)

* Improve native tool discovery and partition detection

- Enhanced native tool path resolution to support CMAKE_INSTALL_LIBDIR variations
  (lib, lib64, lib32, etc.) using glob pattern matching
- Extracted path variables to avoid duplication in error messages
- Improved error message clarity by showing exact paths searched for .so and .cpp files
- Simplified code path construction using consistent Path.resolve().parents[x] syntax

- Fixed redundant partition warnings on pre-MI300 GPUs by adding an architecture check
- Only query compute/memory partition on MI300+ series (gfx940+)
- Added proper type hints for gpu_arch parameter
- Moved gpu_info extraction after soc_info to ensure gpu_arch is available
- Improved code comments for MI300 series threshold

* Handle GPU arch as a hex string
Esse commit está contido em:
vedithal-amd
2026-01-16 10:36:19 -05:00
commit de GitHub
commit f64d8e0f43
3 arquivos alterados com 50 adições e 22 exclusões
+2
Ver Arquivo
@@ -49,6 +49,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* Fix issue where counter collection data was empty when profiling workloads that spawn multiple child processes
* Fix redundant "compute/memory partition not found" warnings on GPUs older than the MI300 series by skipping partition checks on those GPUs
### Removed
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
@@ -531,12 +531,15 @@ class RocProfCompute_Base:
and not args.attach_pid
):
# Use native counter collection tool
# Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations
# (lib, lib64, lib32, etc. depending on distribution)
native_tool_base_path = Path(sys.argv[0]).resolve().parents[2]
native_tool_glob_pattern = (
"lib*/rocprofiler-compute/librocprofiler-compute-tool.so"
)
try:
native_tool_path = str(
Path(sys.argv[0]).resolve().parents[2]
/ "lib"
/ "rocprofiler-compute"
/ "librocprofiler-compute-tool.so"
next(native_tool_base_path.glob(native_tool_glob_pattern))
)
except Exception as e:
console_debug(
@@ -552,6 +555,7 @@ class RocProfCompute_Base:
)
/ "librocprofiler-compute-tool.so"
)
native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib"
link_libraries = ("rocprofiler-sdk",)
build_command = (
# Create shared object
@@ -564,10 +568,10 @@ class RocProfCompute_Base:
# rocprofiler sdk library path
f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
# native tool source files (tool.cpp and helper.cpp)
f"{str(Path(__file__).parent.parent)}/"
"lib/rocprofiler_compute_tool.cpp "
f"{str(Path(__file__).parent.parent)}/"
"lib/helper.cpp "
f"{native_tool_cpp_path}/"
"rocprofiler_compute_tool.cpp "
f"{native_tool_cpp_path}/"
"helper.cpp "
# temporary shared object for native tool
f"-o {native_tool_path}"
)
@@ -575,7 +579,15 @@ class RocProfCompute_Base:
success, output = capture_subprocess_output(shlex.split(build_command))
console_debug(f"Build output: {output}")
if not success:
console_error("Failed to build native counter collection tool.")
console_error(
"Failed to use native counter collection tool.\n"
"Could not find pre-built .so file at: "
f"{native_tool_base_path / native_tool_glob_pattern}\n"
"Could not find source .cpp files in folder: "
f"{native_tool_cpp_path}\n"
"Please ensure the native tool library is installed "
"or source files are present."
)
if self.__profiler == "rocprofiler-sdk":
options = self.get_profiler_options(native_tool_path=native_tool_path)
+27 -13
Ver Arquivo
@@ -174,15 +174,15 @@ def generate_machine_specs(
##########################################
machine_info = extract_machine_info()
# FIXME: use device
# Load amd-smi data
gpu_info = extract_gpu_info()
##########################################
## B. SoC Specs
##########################################
soc_info = extract_soc_info()
# FIXME: use device
# Load amd-smi data
gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"])
# Combine all specifications
with amdsmi_ctx():
specs = MachineSpecs(
@@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]:
@demarcate
def extract_gpu_info() -> dict[str, Any]:
def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]:
# Partition is only supported on >= MI 300 series
# (gpu_arch should be gfx940 or higher for MI300+)
is_partition_supported = False
if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6:
try:
is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940
except ValueError:
pass # Invalid hex string, keep is_partition_supported as False
result: dict[str, Optional[str]] = {
"vbios": None,
"compute_partition": None,
@@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]:
with amdsmi_ctx():
result["vbios"] = get_gpu_vbios_part_number()
result["compute_partition"] = get_gpu_compute_partition()
result["memory_partition"] = get_gpu_memory_partition()
if is_partition_supported:
result["compute_partition"] = get_gpu_compute_partition()
result["memory_partition"] = get_gpu_memory_partition()
else:
result["compute_partition"] = "N/A"
result["memory_partition"] = "N/A"
# Apply defaults and warnings
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
console_warning("Cannot detect accelerator partition from amd-smi.")
console_warning("Applying default accelerator partition: SPX")
result["compute_partition"] = "SPX"
if is_partition_supported:
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
console_warning("Cannot detect accelerator partition from amd-smi.")
console_warning("Applying default accelerator partition: SPX")
result["compute_partition"] = "SPX"
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
console_warning("Cannot detect memory partition from amd-smi.")
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
console_warning("Cannot detect memory partition from amd-smi.")
console_debug(
f"vbios is {result['vbios']}, compute partition is "