diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 5b4c63c36a..d541344ae7 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -49,6 +49,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Fix issue where counter collection data was empty when profiling workload which spawn multiple child processes +* Fix redundant warnings for compute/memory partition not found on GPUs older than the MI300 series by skipping partition checks + ### Removed * Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter. diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py index 9bae23bf14..a79d188c75 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py @@ -531,12 +531,15 @@ class RocProfCompute_Base: and not args.attach_pid ): # Use native counter collection tool + # Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations + # (lib, lib64, lib32, etc. 
depending on distribution) + native_tool_base_path = Path(sys.argv[0]).resolve().parents[2] + native_tool_glob_pattern = ( + "lib*/rocprofiler-compute/librocprofiler-compute-tool.so" + ) try: native_tool_path = str( - Path(sys.argv[0]).resolve().parents[2] - / "lib" - / "rocprofiler-compute" - / "librocprofiler-compute-tool.so" + next(native_tool_base_path.glob(native_tool_glob_pattern)) ) except Exception as e: console_debug( @@ -552,6 +555,7 @@ class RocProfCompute_Base: ) / "librocprofiler-compute-tool.so" ) + native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib" link_libraries = ("rocprofiler-sdk",) build_command = ( # Create shared object @@ -564,10 +568,10 @@ class RocProfCompute_Base: # rocprofiler sdk library path f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} " # native tool source files (tool.cpp and helper.cpp) - f"{str(Path(__file__).parent.parent)}/" - "lib/rocprofiler_compute_tool.cpp " - f"{str(Path(__file__).parent.parent)}/" - "lib/helper.cpp " + f"{native_tool_cpp_path}/" + "rocprofiler_compute_tool.cpp " + f"{native_tool_cpp_path}/" + "helper.cpp " # temporary shared object for native tool f"-o {native_tool_path}" ) @@ -575,7 +579,15 @@ class RocProfCompute_Base: success, output = capture_subprocess_output(shlex.split(build_command)) console_debug(f"Build output: {output}") if not success: - console_error("Failed to build native counter collection tool.") + console_error( + "Failed to use native counter collection tool.\n" + "Could not find pre-built .so file at: " + f"{native_tool_base_path / native_tool_glob_pattern}\n" + "Could not find source .cpp files in folder: " + f"{native_tool_cpp_path}\n" + "Please ensure the native tool library is installed " + "or source files are present." 
+ ) if self.__profiler == "rocprofiler-sdk": options = self.get_profiler_options(native_tool_path=native_tool_path) diff --git a/projects/rocprofiler-compute/src/utils/specs.py b/projects/rocprofiler-compute/src/utils/specs.py index b9202deb73..e55335686f 100644 --- a/projects/rocprofiler-compute/src/utils/specs.py +++ b/projects/rocprofiler-compute/src/utils/specs.py @@ -174,15 +174,15 @@ def generate_machine_specs( ########################################## machine_info = extract_machine_info() - # FIXME: use device - # Load amd-smi data - gpu_info = extract_gpu_info() - ########################################## ## B. SoC Specs ########################################## soc_info = extract_soc_info() + # FIXME: use device + # Load amd-smi data + gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"]) + # Combine all specifications with amdsmi_ctx(): specs = MachineSpecs( @@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]: @demarcate -def extract_gpu_info() -> dict[str, Any]: +def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]: + # Partition is only supported on >= MI 300 series + # (gpu_arch should be gfx940 or higher for MI300+) + is_partition_supported = False + if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6: + try: + is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940 + except ValueError: + pass # Invalid hex string, keep is_partition_supported as False + result: dict[str, Optional[str]] = { "vbios": None, "compute_partition": None, @@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]: with amdsmi_ctx(): result["vbios"] = get_gpu_vbios_part_number() - result["compute_partition"] = get_gpu_compute_partition() - result["memory_partition"] = get_gpu_memory_partition() + if is_partition_supported: + result["compute_partition"] = get_gpu_compute_partition() + result["memory_partition"] = get_gpu_memory_partition() + else: + result["compute_partition"] = "N/A" + result["memory_partition"] = "N/A" # 
Apply defaults and warnings - if result["compute_partition"] == "N/A" or not result["compute_partition"]: - console_warning("Cannot detect accelerator partition from amd-smi.") - console_warning("Applying default accelerator partition: SPX") - result["compute_partition"] = "SPX" + if is_partition_supported: + if result["compute_partition"] == "N/A" or not result["compute_partition"]: + console_warning("Cannot detect accelerator partition from amd-smi.") + console_warning("Applying default accelerator partition: SPX") + result["compute_partition"] = "SPX" - if result["memory_partition"] == "N/A" or not result["memory_partition"]: - console_warning("Cannot detect memory partition from amd-smi.") + if result["memory_partition"] == "N/A" or not result["memory_partition"]: + console_warning("Cannot detect memory partition from amd-smi.") console_debug( f"vbios is {result['vbios']}, compute partition is "