[rocprofiler-compute] Improve native tool discovery and partition detection (#2630)
* Improve native tool discovery and partition detection
  - Enhanced native tool path resolution to support CMAKE_INSTALL_LIBDIR variations (lib, lib64, lib32, etc.) using glob pattern matching
  - Extracted path variables to avoid duplication in error messages
  - Improved error message clarity by showing exact paths searched for .so and .cpp files
  - Simplified code path construction using consistent Path.resolve().parents[x] syntax
  - Fixed redundant partition warnings on pre-MI300 GPUs by adding architecture check
  - Only query compute/memory partition on MI300+ series (gfx940+)
  - Added proper type hints for gpu_arch parameter
  - Moved gpu_info extraction after soc_info to ensure gpu_arch is available
  - Improved code comments for MI300 series threshold
* Handle gpu arch like a hex string
Šī revīzija ir iekļauta:
revīziju iesūtīja
GitHub
vecāks
e6236417f7
revīzija
f64d8e0f43
@@ -49,6 +49,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
|
|||||||
|
|
||||||
* Fix issue where counter collection data was empty when profiling workloads that spawn multiple child processes
|
* Fix issue where counter collection data was empty when profiling workloads that spawn multiple child processes
|
||||||
|
|
||||||
|
* Fix redundant warnings about undetected compute/memory partitions on GPUs older than the AMD Instinct MI300 series by skipping the partition checks on those GPUs
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
|
|
||||||
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
|
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
|
||||||
|
|||||||
@@ -531,12 +531,15 @@ class RocProfCompute_Base:
|
|||||||
and not args.attach_pid
|
and not args.attach_pid
|
||||||
):
|
):
|
||||||
# Use native counter collection tool
|
# Use native counter collection tool
|
||||||
|
# Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations
|
||||||
|
# (lib, lib64, lib32, etc. depending on distribution)
|
||||||
|
native_tool_base_path = Path(sys.argv[0]).resolve().parents[2]
|
||||||
|
native_tool_glob_pattern = (
|
||||||
|
"lib*/rocprofiler-compute/librocprofiler-compute-tool.so"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
native_tool_path = str(
|
native_tool_path = str(
|
||||||
Path(sys.argv[0]).resolve().parents[2]
|
next(native_tool_base_path.glob(native_tool_glob_pattern))
|
||||||
/ "lib"
|
|
||||||
/ "rocprofiler-compute"
|
|
||||||
/ "librocprofiler-compute-tool.so"
|
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
console_debug(
|
console_debug(
|
||||||
@@ -552,6 +555,7 @@ class RocProfCompute_Base:
|
|||||||
)
|
)
|
||||||
/ "librocprofiler-compute-tool.so"
|
/ "librocprofiler-compute-tool.so"
|
||||||
)
|
)
|
||||||
|
native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib"
|
||||||
link_libraries = ("rocprofiler-sdk",)
|
link_libraries = ("rocprofiler-sdk",)
|
||||||
build_command = (
|
build_command = (
|
||||||
# Create shared object
|
# Create shared object
|
||||||
@@ -564,10 +568,10 @@ class RocProfCompute_Base:
|
|||||||
# rocprofiler sdk library path
|
# rocprofiler sdk library path
|
||||||
f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
|
f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
|
||||||
# native tool source files (tool.cpp and helper.cpp)
|
# native tool source files (tool.cpp and helper.cpp)
|
||||||
f"{str(Path(__file__).parent.parent)}/"
|
f"{native_tool_cpp_path}/"
|
||||||
"lib/rocprofiler_compute_tool.cpp "
|
"rocprofiler_compute_tool.cpp "
|
||||||
f"{str(Path(__file__).parent.parent)}/"
|
f"{native_tool_cpp_path}/"
|
||||||
"lib/helper.cpp "
|
"helper.cpp "
|
||||||
# temporary shared object for native tool
|
# temporary shared object for native tool
|
||||||
f"-o {native_tool_path}"
|
f"-o {native_tool_path}"
|
||||||
)
|
)
|
||||||
@@ -575,7 +579,15 @@ class RocProfCompute_Base:
|
|||||||
success, output = capture_subprocess_output(shlex.split(build_command))
|
success, output = capture_subprocess_output(shlex.split(build_command))
|
||||||
console_debug(f"Build output: {output}")
|
console_debug(f"Build output: {output}")
|
||||||
if not success:
|
if not success:
|
||||||
console_error("Failed to build native counter collection tool.")
|
console_error(
|
||||||
|
"Failed to use native counter collection tool.\n"
|
||||||
|
"Could not find pre-built .so file at: "
|
||||||
|
f"{native_tool_base_path / native_tool_glob_pattern}\n"
|
||||||
|
"Could not find source .cpp files in folder: "
|
||||||
|
f"{native_tool_cpp_path}\n"
|
||||||
|
"Please ensure the native tool library is installed "
|
||||||
|
"or source files are present."
|
||||||
|
)
|
||||||
|
|
||||||
if self.__profiler == "rocprofiler-sdk":
|
if self.__profiler == "rocprofiler-sdk":
|
||||||
options = self.get_profiler_options(native_tool_path=native_tool_path)
|
options = self.get_profiler_options(native_tool_path=native_tool_path)
|
||||||
|
|||||||
@@ -174,15 +174,15 @@ def generate_machine_specs(
|
|||||||
##########################################
|
##########################################
|
||||||
machine_info = extract_machine_info()
|
machine_info = extract_machine_info()
|
||||||
|
|
||||||
# FIXME: use device
|
|
||||||
# Load amd-smi data
|
|
||||||
gpu_info = extract_gpu_info()
|
|
||||||
|
|
||||||
##########################################
|
##########################################
|
||||||
## B. SoC Specs
|
## B. SoC Specs
|
||||||
##########################################
|
##########################################
|
||||||
soc_info = extract_soc_info()
|
soc_info = extract_soc_info()
|
||||||
|
|
||||||
|
# FIXME: use device
|
||||||
|
# Load amd-smi data
|
||||||
|
gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"])
|
||||||
|
|
||||||
# Combine all specifications
|
# Combine all specifications
|
||||||
with amdsmi_ctx():
|
with amdsmi_ctx():
|
||||||
specs = MachineSpecs(
|
specs = MachineSpecs(
|
||||||
@@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]:
|
|||||||
|
|
||||||
|
|
||||||
@demarcate
|
@demarcate
|
||||||
def extract_gpu_info() -> dict[str, Any]:
|
def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]:
|
||||||
|
# Partition is only supported on >= MI 300 series
|
||||||
|
# (gpu_arch should be gfx940 or higher for MI300+)
|
||||||
|
is_partition_supported = False
|
||||||
|
if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6:
|
||||||
|
try:
|
||||||
|
is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940
|
||||||
|
except ValueError:
|
||||||
|
pass # Invalid hex string, keep is_partition_supported as False
|
||||||
|
|
||||||
result: dict[str, Optional[str]] = {
|
result: dict[str, Optional[str]] = {
|
||||||
"vbios": None,
|
"vbios": None,
|
||||||
"compute_partition": None,
|
"compute_partition": None,
|
||||||
@@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]:
|
|||||||
|
|
||||||
with amdsmi_ctx():
|
with amdsmi_ctx():
|
||||||
result["vbios"] = get_gpu_vbios_part_number()
|
result["vbios"] = get_gpu_vbios_part_number()
|
||||||
result["compute_partition"] = get_gpu_compute_partition()
|
if is_partition_supported:
|
||||||
result["memory_partition"] = get_gpu_memory_partition()
|
result["compute_partition"] = get_gpu_compute_partition()
|
||||||
|
result["memory_partition"] = get_gpu_memory_partition()
|
||||||
|
else:
|
||||||
|
result["compute_partition"] = "N/A"
|
||||||
|
result["memory_partition"] = "N/A"
|
||||||
|
|
||||||
# Apply defaults and warnings
|
# Apply defaults and warnings
|
||||||
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
|
if is_partition_supported:
|
||||||
console_warning("Cannot detect accelerator partition from amd-smi.")
|
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
|
||||||
console_warning("Applying default accelerator partition: SPX")
|
console_warning("Cannot detect accelerator partition from amd-smi.")
|
||||||
result["compute_partition"] = "SPX"
|
console_warning("Applying default accelerator partition: SPX")
|
||||||
|
result["compute_partition"] = "SPX"
|
||||||
|
|
||||||
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
|
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
|
||||||
console_warning("Cannot detect memory partition from amd-smi.")
|
console_warning("Cannot detect memory partition from amd-smi.")
|
||||||
|
|
||||||
console_debug(
|
console_debug(
|
||||||
f"vbios is {result['vbios']}, compute partition is "
|
f"vbios is {result['vbios']}, compute partition is "
|
||||||
|
|||||||
Atsaukties uz šo jaunā problēmā
Block a user