[rocprofiler-compute] Improve native tool discovery and partition detection (#2630)
* Improve native tool discovery and partition detection
  - Enhanced native tool path resolution to support CMAKE_INSTALL_LIBDIR variations (lib, lib64, lib32, etc.) using glob pattern matching
  - Extracted path variables to avoid duplication in error messages
  - Improved error message clarity by showing exact paths searched for .so and .cpp files
  - Simplified code path construction using consistent Path.resolve().parents[x] syntax
  - Fixed redundant partition warnings on pre-MI300 GPUs by adding architecture check
  - Only query compute/memory partition on MI300+ series (gfx940+)
  - Added proper type hints for gpu_arch parameter
  - Moved gpu_info extraction after soc_info to ensure gpu_arch is available
  - Improved code comments for MI300 series threshold
* Handle gpu arch like a hex string
Tento commit je obsažen v:
@@ -49,6 +49,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
|
||||
|
||||
* Fix issue where counter collection data was empty when profiling workloads that spawn multiple child processes
|
||||
|
||||
* Fix redundant "compute/memory partition not found" warnings on pre-MI300 series GPUs by skipping partition checks on those GPUs
|
||||
|
||||
### Removed
|
||||
|
||||
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, because the MI300 series does not support the TCP_TCP_LATENCY_sum counter.
|
||||
|
||||
@@ -531,12 +531,15 @@ class RocProfCompute_Base:
|
||||
and not args.attach_pid
|
||||
):
|
||||
# Use native counter collection tool
|
||||
# Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations
|
||||
# (lib, lib64, lib32, etc. depending on distribution)
|
||||
native_tool_base_path = Path(sys.argv[0]).resolve().parents[2]
|
||||
native_tool_glob_pattern = (
|
||||
"lib*/rocprofiler-compute/librocprofiler-compute-tool.so"
|
||||
)
|
||||
try:
|
||||
native_tool_path = str(
|
||||
Path(sys.argv[0]).resolve().parents[2]
|
||||
/ "lib"
|
||||
/ "rocprofiler-compute"
|
||||
/ "librocprofiler-compute-tool.so"
|
||||
next(native_tool_base_path.glob(native_tool_glob_pattern))
|
||||
)
|
||||
except Exception as e:
|
||||
console_debug(
|
||||
@@ -552,6 +555,7 @@ class RocProfCompute_Base:
|
||||
)
|
||||
/ "librocprofiler-compute-tool.so"
|
||||
)
|
||||
native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib"
|
||||
link_libraries = ("rocprofiler-sdk",)
|
||||
build_command = (
|
||||
# Create shared object
|
||||
@@ -564,10 +568,10 @@ class RocProfCompute_Base:
|
||||
# rocprofiler sdk library path
|
||||
f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
|
||||
# native tool source files (tool.cpp and helper.cpp)
|
||||
f"{str(Path(__file__).parent.parent)}/"
|
||||
"lib/rocprofiler_compute_tool.cpp "
|
||||
f"{str(Path(__file__).parent.parent)}/"
|
||||
"lib/helper.cpp "
|
||||
f"{native_tool_cpp_path}/"
|
||||
"rocprofiler_compute_tool.cpp "
|
||||
f"{native_tool_cpp_path}/"
|
||||
"helper.cpp "
|
||||
# temporary shared object for native tool
|
||||
f"-o {native_tool_path}"
|
||||
)
|
||||
@@ -575,7 +579,15 @@ class RocProfCompute_Base:
|
||||
success, output = capture_subprocess_output(shlex.split(build_command))
|
||||
console_debug(f"Build output: {output}")
|
||||
if not success:
|
||||
console_error("Failed to build native counter collection tool.")
|
||||
console_error(
|
||||
"Failed to use native counter collection tool.\n"
|
||||
"Could not find pre-built .so file at: "
|
||||
f"{native_tool_base_path / native_tool_glob_pattern}\n"
|
||||
"Could not find source .cpp files in folder: "
|
||||
f"{native_tool_cpp_path}\n"
|
||||
"Please ensure the native tool library is installed "
|
||||
"or source files are present."
|
||||
)
|
||||
|
||||
if self.__profiler == "rocprofiler-sdk":
|
||||
options = self.get_profiler_options(native_tool_path=native_tool_path)
|
||||
|
||||
@@ -174,15 +174,15 @@ def generate_machine_specs(
|
||||
##########################################
|
||||
machine_info = extract_machine_info()
|
||||
|
||||
# FIXME: use device
|
||||
# Load amd-smi data
|
||||
gpu_info = extract_gpu_info()
|
||||
|
||||
##########################################
|
||||
## B. SoC Specs
|
||||
##########################################
|
||||
soc_info = extract_soc_info()
|
||||
|
||||
# FIXME: use device
|
||||
# Load amd-smi data
|
||||
gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"])
|
||||
|
||||
# Combine all specifications
|
||||
with amdsmi_ctx():
|
||||
specs = MachineSpecs(
|
||||
@@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]:
|
||||
|
||||
|
||||
@demarcate
|
||||
def extract_gpu_info() -> dict[str, Any]:
|
||||
def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]:
|
||||
# Partition is only supported on >= MI 300 series
|
||||
# (gpu_arch should be gfx940 or higher for MI300+)
|
||||
is_partition_supported = False
|
||||
if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6:
|
||||
try:
|
||||
is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940
|
||||
except ValueError:
|
||||
pass # Invalid hex string, keep is_partition_supported as False
|
||||
|
||||
result: dict[str, Optional[str]] = {
|
||||
"vbios": None,
|
||||
"compute_partition": None,
|
||||
@@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]:
|
||||
|
||||
with amdsmi_ctx():
|
||||
result["vbios"] = get_gpu_vbios_part_number()
|
||||
result["compute_partition"] = get_gpu_compute_partition()
|
||||
result["memory_partition"] = get_gpu_memory_partition()
|
||||
if is_partition_supported:
|
||||
result["compute_partition"] = get_gpu_compute_partition()
|
||||
result["memory_partition"] = get_gpu_memory_partition()
|
||||
else:
|
||||
result["compute_partition"] = "N/A"
|
||||
result["memory_partition"] = "N/A"
|
||||
|
||||
# Apply defaults and warnings
|
||||
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
|
||||
console_warning("Cannot detect accelerator partition from amd-smi.")
|
||||
console_warning("Applying default accelerator partition: SPX")
|
||||
result["compute_partition"] = "SPX"
|
||||
if is_partition_supported:
|
||||
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
|
||||
console_warning("Cannot detect accelerator partition from amd-smi.")
|
||||
console_warning("Applying default accelerator partition: SPX")
|
||||
result["compute_partition"] = "SPX"
|
||||
|
||||
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
|
||||
console_warning("Cannot detect memory partition from amd-smi.")
|
||||
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
|
||||
console_warning("Cannot detect memory partition from amd-smi.")
|
||||
|
||||
console_debug(
|
||||
f"vbios is {result['vbios']}, compute partition is "
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele