[rocprofiler-compute] Add exception handling for native tool path search (#2159)

* Add exception handling for native tool path search
* Fix formatting in roofline benchmark code
* Fix detection of .so files
* Include HIP code and native tool code in standalone binary
* Add fallback path for ROCM_PATH
2025-12-04 10:29:49 -05:00
@@ -657,6 +657,8 @@ add_custom_target(
        --onefile-tempdir-spec=/{PRODUCT}/rocprof_compute_standalonebinary_{PID}
        --include-data-files=${PROJECT_SOURCE_DIR}/VERSION*=./ --enable-plugin=no-qt
        --include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp
+        --include-data-files=src/lib/helper.cpp=lib/helper.cpp
+        --include-data-files=src/lib/helper.hpp=lib/helper.hpp
        --include-package=dash_svg --include-package-data=dash_svg
        --include-package=dash_bootstrap_components
        --include-package-data=dash_bootstrap_components --include-package=plotly
@@ -667,7 +669,8 @@ add_custom_target(
        --include-package-data=rocprof_compute_profile
        --include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui
        --include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc
-        --include-package=utils --include-package-data=utils src/rocprof-compute
+        --include-package=utils --include-package-data=utils
+        --include-package=hip --include-package-data=hip src/rocprof-compute
    # Remove library rpath from executable
    COMMAND patchelf --remove-rpath rocprof-compute.bin
    # Move to build directory
@@ -26,6 +26,8 @@ CMD ["/bin/bash", "-c", "\
        --enable-plugin=no-qt \
        --include-data-files=VERSION*=./ \
        --include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp \
+        --include-data-files=src/lib/helper.cpp=lib/helper.cpp \
+        --include-data-files=src/lib/helper.hpp=lib/helper.hpp \
        --include-package=dash_svg --include-package-data=dash_svg \
        --include-package=dash_bootstrap_components \
        --include-package-data=dash_bootstrap_components \
@@ -38,6 +40,7 @@ CMD ["/bin/bash", "-c", "\
        --include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui \
        --include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc \
        --include-package=utils --include-package-data=utils \
+        --include-package=hip --include-package-data=hip \
        src/rocprof-compute \
    && patchelf --remove-rpath rocprof-compute.bin \
 "]
@@ -37,8 +37,9 @@ from ctypes import (
    c_uint8,
    c_void_p,
 )
+import os

-_lib = ctypes.CDLL("libamdhip64.so")
+_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libamdhip64.so")


 # Mirrors struct hipUUID_t
@@ -299,7 +300,6 @@ class HIPModule:


 def hipGetDeviceCount() -> int:
-
    device_count = c_int()
    status = _lib.hipGetDeviceCount(byref(device_count))

@@ -310,7 +310,6 @@ def hipGetDeviceCount() -> int:


 def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties:
-
    props = HIPDeviceProperties()
    res = _lib.hipGetDevicePropertiesR0600(byref(props), device_id)

@@ -321,7 +320,6 @@ def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties:


 def hipMalloc(size: int) -> HIPDeviceMemory:
-
    buf_size = c_size_t(size)
    ptr = c_void_p()

@@ -334,7 +332,6 @@ def hipMalloc(size: int) -> HIPDeviceMemory:


 def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None:
-
    res = _lib.hipMemcpyHtoD(dst.ptr, src, size)

    if res != 0:
@@ -342,7 +339,6 @@ def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None:


 def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None:
-
    res = _lib.hipMemcpyDtoH(dst, src.ptr, size)

    if res != 0:
@@ -350,7 +346,6 @@ def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None:


 def hipSetDevice(id: int) -> None:
-
    status = _lib.hipSetDevice(id)

    if status != 0:
@@ -358,7 +353,6 @@ def hipSetDevice(id: int) -> None:


 def hipDeviceSynchronize() -> None:
-
    res = _lib.hipDeviceSynchronize()

    if res != 0:
@@ -366,7 +360,6 @@ def hipDeviceSynchronize() -> None:


 def hipModuleLoadData(code: POINTER) -> HIPModule:
-
    module = c_void_p()
    res = _lib.hipModuleLoadData(byref(module), code)

@@ -377,7 +370,6 @@ def hipModuleLoadData(code: POINTER) -> HIPModule:


 def hipModuleGetFunction(module: POINTER, name: str) -> POINTER:
-
    name_bytes = name.encode("utf-8")
    func = c_void_p()

@@ -402,7 +394,6 @@ def hipModuleLaunchKernel(
    kernel_params: POINTER,
    extra: POINTER = None,
 ) -> None:
-
    res = _lib.hipModuleLaunchKernel(
        func,
        grid_dim_x,
@@ -422,7 +413,6 @@ def hipModuleLaunchKernel(


 def hipEventCreate() -> HIPEvent:
-
    handle = c_void_p()

    res = _lib.hipEventCreate(byref(handle))
@@ -434,7 +424,6 @@ def hipEventCreate() -> HIPEvent:


 def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None:
-
    res = _lib.hipEventRecord(event.handle, stream)

    if res != 0:
@@ -442,7 +431,6 @@ def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None:


 def hipEventElapsedTime(start: HIPEvent, stop: HIPEvent) -> float:
-
    ms = c_float()

    res = _lib.hipEventElapsedTime(byref(ms), start.handle, stop.handle)
@@ -33,8 +33,9 @@ from ctypes import (
    c_size_t,
    c_void_p,
 )
+import os

-_lib = ctypes.CDLL("libhiprtc.so")
+_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libhiprtc.so")


 _lib.hiprtcCreateProgram.restype = c_int
@@ -112,7 +113,6 @@ class HIPRTCProgram:

 # TODO: Handle headers
 def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram:
-
    src_bytes = src.encode("utf-8")
    name_bytes = name.encode("utf-8")

@@ -128,7 +128,6 @@ def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram:

 # TODO: Handle compile options
 def hiprtcCompileProgram(prog: HIPRTCProgram) -> None:
-
    res = _lib.hiprtcCompileProgram(prog.handle, 0, None)

    if res != 0:
@@ -136,7 +135,6 @@ def hiprtcCompileProgram(prog: HIPRTCProgram) -> None:


 def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int:
-
    size = c_size_t(0)

    res = _lib.hiprtcGetProgramLogSize(prog.handle, byref(size))
@@ -148,7 +146,6 @@ def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int:


 def hiprtcGetProgramLog(prog: HIPRTCProgram) -> str:
-
    size = hiprtcGetProgramLogSize(prog)
    buf = (ctypes.c_char * size)()

@@ -171,7 +168,6 @@ def hiprtcGetCodeSize(prog: HIPRTCProgram) -> int:


 def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER:
-
    size = hiprtcGetCodeSize(prog)
    buf = (c_char * size)()
    res = _lib.hiprtcGetCode(prog.handle, buf)
@@ -183,7 +179,6 @@ def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER:


 def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str:
-
    expr_bytes = name_expression.encode("utf-8")
    name_bytes = c_char_p()

@@ -196,7 +191,6 @@ def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str:


 def hiprtcAddNameExpression(prog: HIPRTCProgram, name_expression: str) -> None:
-
    expr_bytes = name_expression.encode("utf-8")

    res = _lib.hiprtcAddNameExpression(prog.handle, expr_bytes)
@@ -429,13 +429,20 @@ class RocProfCompute_Base:
            and not args.attach_pid
        ):
            # Use native counter collection tool
-            native_tool_path = str(
-                Path(sys.argv[0]).resolve().parents[2]
-                / "lib"
-                / "rocprofiler-compute"
-                / "librocprofiler-compute-tool.so"
-            )
-            if not Path(native_tool_path).is_file():
+            try:
+                native_tool_path = str(
+                    Path(sys.argv[0]).resolve().parents[2]
+                    / "lib"
+                    / "rocprofiler-compute"
+                    / "librocprofiler-compute-tool.so"
+                )
+            except Exception as e:
+                console_debug(
+                    f"Could not find pre-built native tool: {e}. "
+                    "Building native tool now."
+                )
+                native_tool_path = None
+            if not (native_tool_path and Path(native_tool_path).is_file()):
                # Build native counter collection tool if not exists
                native_tool_path = str(
                    Path(
@@ -170,7 +170,6 @@ DEFAULT_DATASET_SIZE = 512 * 1024 * 1024


 def show_progress(pct: float) -> None:
-
    bar_char = "|"
    bar_size = 60

@@ -182,7 +181,6 @@ def show_progress(pct: float) -> None:

 # Returns a named tuple with the mean, std deviation and confidence
 def calc_stats(samples: list) -> Stats:
-
    mean = sum(samples) / len(samples)

    stdev = 0.0
@@ -213,7 +211,6 @@ class Program:
        self.module = hip.hipModuleLoadData(self.code)

    def get_kernel(self, kernel_name: str) -> POINTER:
-
        # TODO: Why doesn't hiprtcGetLoweredName work with non-template functions?
        if "<" in kernel_name:
            kernel_name = hiprtc.hiprtcGetLoweredName(self.prog, kernel_name)
@@ -230,7 +227,6 @@ def launch_kernel(
    stream: POINTER,
    args: list[Any] = [],
 ) -> None:
-
    # Convert to native types
    args_converted = []
    for arg in args:
@@ -262,7 +258,6 @@ def launch_kernel(

 # Retrieve the gfx architecture
 def get_gfx_arch(device: int) -> str:
-
    arch_str = hip.hipGetDeviceProperties(device).gcnArchName

    # Parse out only gfx
@@ -280,7 +275,6 @@ def run_get_samples(
    stream: POINTER,
    args: list[Any] = [],
 ) -> list[float]:
-
    event_start = hip.hipEventCreate()
    event_stop = hip.hipEventCreate()

@@ -405,8 +399,12 @@ def hbm_bw_benchmark(device: int) -> PerfMetrics:
    perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence)

    event_ms = total_bytes / mean / 1e6
+
    print(
-        f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:.1f} GB/sec"
+        f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
+        f"workgroups:{workgroups}, experiments:{num_experiments}, "
+        f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
+        f"mean:{mean:.1f} GB/sec, stdev:{stdev:.1f} GB/sec"
    )

    return perf_metrics
@@ -463,7 +461,10 @@ def cache_bw_bench(device: int, type: str, iters: int) -> PerfMetrics:
    event_ms = total_bytes / mean / 1e6

    print(
-        f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec"
+        f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
+        f"workgroups:{workgroups}, experiments:{num_experiments}, "
+        f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
+        f"mean:{mean:.1f} GB/sec, stdev:{stdev:.1f} GB/sec"
    )

    return perf_metrics
@@ -511,7 +512,6 @@ extern "C" __global__ void LDS_bw(int numIter, float *dummy)


 def lds_bw_benchmark(device: int) -> PerfMetrics:
-
    num_experiments = DEFAULT_NUM_EXPERIMENTS
    workgroup_size = DEFAULT_WORKGROUP_SIZE

@@ -553,7 +553,10 @@ def lds_bw_benchmark(device: int) -> PerfMetrics:
    event_ms = total_bytes / mean / 1e6

    print(
-        f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec"
+        f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
+        f"workgroups:{workgroups}, experiments:{num_experiments}, "
+        f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
+        f"mean:{mean:.1f} GB/sec, stdev:{stdev:.1f} GB/sec"
    )

    return perf_metrics
@@ -635,7 +638,10 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
    event_ms = total_flops / mean / 1e6

    print(
-        f"Peak {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, {unit}:{total_flops}, duration:{event_ms:.1f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
+        f"Peak {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, "
+        f"workgroups:{workgroups}, experiments:{num_experiments}, "
+        f"{unit}:{total_flops}, duration:{event_ms:.1f} ms, "
+        f"mean:{mean:.1f} {rate}, stdev:{stdev:.1f} GFLOPS"
    )

    return perf_metrics
@@ -862,30 +867,81 @@ template<int datatype> __global__ void mfma_f8f6f4(int iter, float *dummy)
        case FP8_E4M3: // fp8 x fp8
            for(int i = 0; i < iter; ++i)
            {
-                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 0, 0, 0, 0, 0, 0);
-        	}
+                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
+                    a,
+                    a,
+                    result,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0
+                );
+            }
+            break;
        case BF8_E5M2: // bf8 x bf8
            for(int i = 0; i < iter; ++i)
            {
-                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 1, 1, 0, 0, 0, 0);
+                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
+                    a,
+                    a,
+                    result,
+                    1,
+                    1,
+                    0,
+                    0,
+                    0,
+                    0
+                );
            }
            break;
        case FP6_E2M3: // fp6 x fp6
            for(int i = 0; i < iter; ++i)
            {
-                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 2, 2, 0, 0, 0, 0);
+                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
+                    a,
+                    a,
+                    result,
+                    2,
+                    2,
+                    0,
+                    0,
+                    0,
+                    0
+                );
            }
            break;
        case BF6_E3M2: // bf6 x bf6
            for(int i = 0; i < iter; ++i)
            {
-                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 3, 3, 0, 0, 0, 0);
+                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
+                    a,
+                    a,
+                    result,
+                    3,
+                    3,
+                    0,
+                    0,
+                    0,
+                    0
+                );
            }
            break;
        case FP4_E2M1: // fp4 x fp4
            for(int i = 0; i < iter; ++i)
            {
-                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 4, 4, 0, 0, 0, 0);
+                result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
+                    a,
+                    a,
+                    result,
+                    4,
+                    4,
+                    0,
+                    0,
+                    0,
+                    0
+                );
            }
            break;
    }
@@ -954,79 +1009,69 @@ def mfma_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
    event_ms = total_flops / mean / 1e6

    print(
-        f"Peak MFMA {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{experiments}, {unit}:{total_flops}, duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
+        f"Peak MFMA {unit}s ({type}), GPU ID: {device}, "
+        f"workgroupSize:{workgroup_size}, workgroups:{workgroups}, "
+        f"experiments:{experiments}, {unit}:{total_flops}, "
+        f"duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, "
+        f"stdev:{stdev:.1f} GFLOPS"
    )

    return perf_metrics


 def mfma_f32_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "F32", "FLOP", "GFLOPS")


 def mfma_f16_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "F16", "FLOP", "GFLOPS")


 def mfma_bf16_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "BF16", "FLOP", "GFLOPS")


 def mfma_f64_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "F64", "FLOP", "GFLOPS")


 def mfma_f8_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "F8", "FLOP", "GFLOPS")


 def mfma_i8_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "I8", "IOP", "GOPS")


 def mfma_f4_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "F4", "FLOP", "GFLOPS")


 def mfma_f6_bench(device: int) -> PerfMetrics:
-
    return mfma_bench(device, "F6", "FLOP", "GFLOPS")


 def fp16_benchmark(device: int) -> PerfMetrics:
-
    return flops_bench(device, "FP16", "FLOP", "GFLOPS")


 def fp32_benchmark(device: int) -> PerfMetrics:
-
    return flops_bench(device, "FP32", "FLOP", "GFLOPS")


 def fp64_benchmark(device: int) -> PerfMetrics:
-
    return flops_bench(device, "FP64", "FLOP", "GFLOPS")


 def int8_benchmark(device: int) -> PerfMetrics:
-
    return flops_bench(device, "INT8", "IOP", "GOPS")


 def int32_benchmark(device: int) -> PerfMetrics:
-
    return flops_bench(device, "INT32", "IOP", "GOPS")


 def int64_benchmark(device: int) -> PerfMetrics:
-
    return flops_bench(device, "INT64", "IOP", "GOPS")


@@ -1055,7 +1100,6 @@ tests = {

-# Run the roofine tests on the specified device
+# Run the roofline tests on the specified device
 def run_benchmark(device: int) -> dict[PerfMetrics]:
-
    metrics_dict = {}

    arch = get_gfx_arch(device)
@@ -1079,7 +1123,6 @@ def run_benchmark(device: int) -> dict[PerfMetrics]:
 # Returns a dictionary mapping device ID to dictionary of
 # metrics
 def run_on_devices(devices: list[int]) -> dict[dict[PerfMetrics]]:
-
    metrics = {}
    for d in devices:
        metrics[d] = run_benchmark(d)