[rocprofiler-compute] Add exception handling for native tool path search (#2159)

* Add exception handling for native tool path search

* Fix formatting in roofline benchmark code

* Fix detection of .so files

* include hip code and native tool code in standalone binary

* add fallback path for ROCM_PATH
Этот коммит содержится в:
vedithal-amd
2025-12-04 10:29:49 -05:00
коммит произвёл GitHub
родитель 4ff89b6fd1
Коммит d8a8a3ef30
6 изменённых файлов: 102 добавлений и 64 удалений
+4 -1
Просмотреть файл
@@ -657,6 +657,8 @@ add_custom_target(
--onefile-tempdir-spec=/{PRODUCT}/rocprof_compute_standalonebinary_{PID}
--include-data-files=${PROJECT_SOURCE_DIR}/VERSION*=./ --enable-plugin=no-qt
--include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp
--include-data-files=src/lib/helper.cpp=lib/helper.cpp
--include-data-files=src/lib/helper.hpp=lib/helper.hpp
--include-package=dash_svg --include-package-data=dash_svg
--include-package=dash_bootstrap_components
--include-package-data=dash_bootstrap_components --include-package=plotly
@@ -667,7 +669,8 @@ add_custom_target(
--include-package-data=rocprof_compute_profile
--include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui
--include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc
--include-package=utils --include-package-data=utils src/rocprof-compute
--include-package=utils --include-package-data=utils
--include-package=hip --include-package-data=hip src/rocprof-compute
# Remove library rpath from executable
COMMAND patchelf --remove-rpath rocprof-compute.bin
# Move to build directory
+3
Просмотреть файл
@@ -26,6 +26,8 @@ CMD ["/bin/bash", "-c", "\
--enable-plugin=no-qt \
--include-data-files=VERSION*=./ \
--include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp \
--include-data-files=src/lib/helper.cpp=lib/helper.cpp \
--include-data-files=src/lib/helper.hpp=lib/helper.hpp \
--include-package=dash_svg --include-package-data=dash_svg \
--include-package=dash_bootstrap_components \
--include-package-data=dash_bootstrap_components \
@@ -38,6 +40,7 @@ CMD ["/bin/bash", "-c", "\
--include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui \
--include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc \
--include-package=utils --include-package-data=utils \
--include-package=hip --include-package-data=hip \
src/rocprof-compute \
&& patchelf --remove-rpath rocprof-compute.bin \
"]
+2 -14
Просмотреть файл
@@ -37,8 +37,9 @@ from ctypes import (
c_uint8,
c_void_p,
)
import os
_lib = ctypes.CDLL("libamdhip64.so")
_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libamdhip64.so")
# Mirrors struct hipUUID_t
@@ -299,7 +300,6 @@ class HIPModule:
def hipGetDeviceCount() -> int:
device_count = c_int()
status = _lib.hipGetDeviceCount(byref(device_count))
@@ -310,7 +310,6 @@ def hipGetDeviceCount() -> int:
def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties:
props = HIPDeviceProperties()
res = _lib.hipGetDevicePropertiesR0600(byref(props), device_id)
@@ -321,7 +320,6 @@ def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties:
def hipMalloc(size: int) -> HIPDeviceMemory:
buf_size = c_size_t(size)
ptr = c_void_p()
@@ -334,7 +332,6 @@ def hipMalloc(size: int) -> HIPDeviceMemory:
def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None:
res = _lib.hipMemcpyHtoD(dst.ptr, src, size)
if res != 0:
@@ -342,7 +339,6 @@ def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None:
def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None:
res = _lib.hipMemcpyDtoH(dst, src.ptr, size)
if res != 0:
@@ -350,7 +346,6 @@ def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None:
def hipSetDevice(id: int) -> None:
status = _lib.hipSetDevice(id)
if status != 0:
@@ -358,7 +353,6 @@ def hipSetDevice(id: int) -> None:
def hipDeviceSynchronize() -> None:
res = _lib.hipDeviceSynchronize()
if res != 0:
@@ -366,7 +360,6 @@ def hipDeviceSynchronize() -> None:
def hipModuleLoadData(code: POINTER) -> HIPModule:
module = c_void_p()
res = _lib.hipModuleLoadData(byref(module), code)
@@ -377,7 +370,6 @@ def hipModuleLoadData(code: POINTER) -> HIPModule:
def hipModuleGetFunction(module: POINTER, name: str) -> POINTER:
name_bytes = name.encode("utf-8")
func = c_void_p()
@@ -402,7 +394,6 @@ def hipModuleLaunchKernel(
kernel_params: POINTER,
extra: POINTER = None,
) -> None:
res = _lib.hipModuleLaunchKernel(
func,
grid_dim_x,
@@ -422,7 +413,6 @@ def hipModuleLaunchKernel(
def hipEventCreate() -> HIPEvent:
handle = c_void_p()
res = _lib.hipEventCreate(byref(handle))
@@ -434,7 +424,6 @@ def hipEventCreate() -> HIPEvent:
def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None:
res = _lib.hipEventRecord(event.handle, stream)
if res != 0:
@@ -442,7 +431,6 @@ def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None:
def hipEventElapsedTime(start: HIPEvent, stop: HIPEvent) -> float:
ms = c_float()
res = _lib.hipEventElapsedTime(byref(ms), start.handle, stop.handle)
+2 -8
Просмотреть файл
@@ -33,8 +33,9 @@ from ctypes import (
c_size_t,
c_void_p,
)
import os
_lib = ctypes.CDLL("libhiprtc.so")
_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libhiprtc.so")
_lib.hiprtcCreateProgram.restype = c_int
@@ -112,7 +113,6 @@ class HIPRTCProgram:
# TODO: Handle headers
def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram:
src_bytes = src.encode("utf-8")
name_bytes = name.encode("utf-8")
@@ -128,7 +128,6 @@ def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram:
# TODO: Handle compile options
def hiprtcCompileProgram(prog: HIPRTCProgram) -> None:
res = _lib.hiprtcCompileProgram(prog.handle, 0, None)
if res != 0:
@@ -136,7 +135,6 @@ def hiprtcCompileProgram(prog: HIPRTCProgram) -> None:
def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int:
size = c_size_t(0)
res = _lib.hiprtcGetProgramLogSize(prog.handle, byref(size))
@@ -148,7 +146,6 @@ def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int:
def hiprtcGetProgramLog(prog: HIPRTCProgram) -> str:
size = hiprtcGetProgramLogSize(prog)
buf = (ctypes.c_char * size)()
@@ -171,7 +168,6 @@ def hiprtcGetCodeSize(prog: HIPRTCProgram) -> int:
def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER:
size = hiprtcGetCodeSize(prog)
buf = (c_char * size)()
res = _lib.hiprtcGetCode(prog.handle, buf)
@@ -183,7 +179,6 @@ def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER:
def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str:
expr_bytes = name_expression.encode("utf-8")
name_bytes = c_char_p()
@@ -196,7 +191,6 @@ def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str:
def hiprtcAddNameExpression(prog: HIPRTCProgram, name_expression: str) -> None:
expr_bytes = name_expression.encode("utf-8")
res = _lib.hiprtcAddNameExpression(prog.handle, expr_bytes)
+14 -7
Просмотреть файл
@@ -429,13 +429,20 @@ class RocProfCompute_Base:
and not args.attach_pid
):
# Use native counter collection tool
native_tool_path = str(
Path(sys.argv[0]).resolve().parents[2]
/ "lib"
/ "rocprofiler-compute"
/ "librocprofiler-compute-tool.so"
)
if not Path(native_tool_path).is_file():
try:
native_tool_path = str(
Path(sys.argv[0]).resolve().parents[2]
/ "lib"
/ "rocprofiler-compute"
/ "librocprofiler-compute-tool.so"
)
except Exception as e:
console_debug(
f"Could not find pre-built native tool: {e}. "
"Building native tool now."
)
native_tool_path = None
if not (native_tool_path and Path(native_tool_path).is_file()):
# Build native counter collection tool if not exists
native_tool_path = str(
Path(
+77 -34
Просмотреть файл
@@ -170,7 +170,6 @@ DEFAULT_DATASET_SIZE = 512 * 1024 * 1024
def show_progress(pct: float) -> None:
bar_char = "|"
bar_size = 60
@@ -182,7 +181,6 @@ def show_progress(pct: float) -> None:
# Returns a named tuple with the mean, std deviation and confidence
def calc_stats(samples: list) -> Stats:
mean = sum(samples) / len(samples)
stdev = 0.0
@@ -213,7 +211,6 @@ class Program:
self.module = hip.hipModuleLoadData(self.code)
def get_kernel(self, kernel_name: str) -> POINTER:
# TODO: Why doesn't hiprtcGetLoweredName work with non-template functions?
if "<" in kernel_name:
kernel_name = hiprtc.hiprtcGetLoweredName(self.prog, kernel_name)
@@ -230,7 +227,6 @@ def launch_kernel(
stream: POINTER,
args: list[Any] = [],
) -> None:
# Convert to native types
args_converted = []
for arg in args:
@@ -262,7 +258,6 @@ def launch_kernel(
# Retrieve the gfx architecture
def get_gfx_arch(device: int) -> str:
arch_str = hip.hipGetDeviceProperties(device).gcnArchName
# Parse out only gfx
@@ -280,7 +275,6 @@ def run_get_samples(
stream: POINTER,
args: list[Any] = [],
) -> list[float]:
event_start = hip.hipEventCreate()
event_stop = hip.hipEventCreate()
@@ -405,8 +399,12 @@ def hbm_bw_benchmark(device: int) -> PerfMetrics:
perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence)
event_ms = total_bytes / mean / 1e6
print(
f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:.1f} GB/sec"
f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
f"workgroups:{workgroups}, experiments:{num_experiments}, "
f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
f"mean:{mean:.1f} GB/sec, stdev:{stdev:.1f} GB/sec"
)
return perf_metrics
@@ -463,7 +461,10 @@ def cache_bw_bench(device: int, type: str, iters: int) -> PerfMetrics:
event_ms = total_bytes / mean / 1e6
print(
f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec"
f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
f"workgroups:{workgroups}, experiments:{num_experiments}, "
f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
f"mean:{mean:.1f} GB/sec, stdev:{stdev:1f} GB/sec"
)
return perf_metrics
@@ -511,7 +512,6 @@ extern "C" __global__ void LDS_bw(int numIter, float *dummy)
def lds_bw_benchmark(device: int) -> PerfMetrics:
num_experiments = DEFAULT_NUM_EXPERIMENTS
workgroup_size = DEFAULT_WORKGROUP_SIZE
@@ -553,7 +553,10 @@ def lds_bw_benchmark(device: int) -> PerfMetrics:
event_ms = total_bytes / mean / 1e6
print(
f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec"
f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
f"workgroups:{workgroups}, experiments:{num_experiments}, "
f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
f"mean:{mean:.1f} GB/sec, stdev:{stdev:1f} GB/sec"
)
return perf_metrics
@@ -635,7 +638,9 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
event_ms = total_flops / mean / 1e6
print(
f"Peak {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, {unit}:{total_flops}, duration:{event_ms:.1f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
f"workgroups:{workgroups}, experiments:{num_experiments}, "
f"{unit}:{total_flops}, duration:{event_ms:.1f} ms, "
f"mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
)
return perf_metrics
@@ -862,30 +867,80 @@ template<int datatype> __global__ void mfma_f8f6f4(int iter, float *dummy)
case FP8_E4M3: // fp8 x fp8
for(int i = 0; i < iter; ++i)
{
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 0, 0, 0, 0, 0, 0);
}
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
a,
a,
result,
0,
0,
0,
0,
0,
0
);
}
case BF8_E5M2: // bf8 x bf8
for(int i = 0; i < iter; ++i)
{
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 1, 1, 0, 0, 0, 0);
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
a,
a,
result,
1,
1,
0,
0,
0,
0
);
}
break;
case FP6_E2M3: // fp6 x fp6
for(int i = 0; i < iter; ++i)
{
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 2, 2, 0, 0, 0, 0);
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
a,
a,
result,
2,
2,
0,
0,
0,
0
);
}
break;
case BF6_E3M2: // bf6 x bf6
for(int i = 0; i < iter; ++i)
{
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 3, 3, 0, 0, 0, 0);
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
a,
a,
result,
3,
3,
0,
0,
0,
0
);
}
break;
case FP4_E2M1: // fp4 x fp4
for(int i = 0; i < iter; ++i)
{
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 4, 4, 0, 0, 0, 0);
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
a,
a,
result,
4,
4,
0,
0,
0,
0
);
}
break;
}
@@ -954,79 +1009,69 @@ def mfma_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
event_ms = total_flops / mean / 1e6
print(
f"Peak MFMA {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{experiments}, {unit}:{total_flops}, duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
f"Peak MFMA {unit}s ({type}), GPU ID: {device}, "
f"workgroupSize:{workgroup_size}, workgroups:{workgroups}, "
f"experiments:{experiments}, {unit}:{total_flops}, "
f"duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, "
f"stdev:{stdev:.1f} GFLOPS"
)
return perf_metrics
def mfma_f32_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "F32", "FLOP", "GFLOPS")
def mfma_f16_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "F16", "FLOP", "GFLOPS")
def mfma_bf16_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "BF16", "FLOP", "GFLOPS")
def mfma_f64_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "F64", "FLOP", "GFLOPS")
def mfma_f8_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "F8", "FLOP", "GFLOPS")
def mfma_i8_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "I8", "IOP", "GOPS")
def mfma_f4_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "F4", "FLOP", "GFLOPS")
def mfma_f6_bench(device: int) -> PerfMetrics:
return mfma_bench(device, "F6", "FLOP", "GFLOPS")
def fp16_benchmark(device: int) -> PerfMetrics:
return flops_bench(device, "FP16", "FLOP", "GFLOPS")
def fp32_benchmark(device: int) -> PerfMetrics:
return flops_bench(device, "FP32", "FLOP", "GFLOPS")
def fp64_benchmark(device: int) -> PerfMetrics:
return flops_bench(device, "FP64", "FLOP", "GFLOPS")
def int8_benchmark(device: int) -> PerfMetrics:
return flops_bench(device, "INT8", "IOP", "GOPS")
def int32_benchmark(device: int) -> PerfMetrics:
return flops_bench(device, "INT32", "IOP", "GOPS")
def int64_benchmark(device: int) -> PerfMetrics:
return flops_bench(device, "INT64", "IOP", "GOPS")
@@ -1055,7 +1100,6 @@ tests = {
# Run the roofine tests on the specified device
def run_benchmark(device: int) -> dict[PerfMetrics]:
metrics_dict = {}
arch = get_gfx_arch(device)
@@ -1079,7 +1123,6 @@ def run_benchmark(device: int) -> dict[PerfMetrics]:
# Returns a dictionary mapping device ID to dictionary of
# metrics
def run_on_devices(devices: list[int]) -> dict[dict[PerfMetrics]]:
metrics = {}
for d in devices:
metrics[d] = run_benchmark(d)