diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index 4dc5614090..df97f4a212 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -657,6 +657,8 @@ add_custom_target( --onefile-tempdir-spec=/{PRODUCT}/rocprof_compute_standalonebinary_{PID} --include-data-files=${PROJECT_SOURCE_DIR}/VERSION*=./ --enable-plugin=no-qt --include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp + --include-data-files=src/lib/helper.cpp=lib/helper.cpp + --include-data-files=src/lib/helper.hpp=lib/helper.hpp --include-package=dash_svg --include-package-data=dash_svg --include-package=dash_bootstrap_components --include-package-data=dash_bootstrap_components --include-package=plotly @@ -667,7 +669,8 @@ add_custom_target( --include-package-data=rocprof_compute_profile --include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui --include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc - --include-package=utils --include-package-data=utils src/rocprof-compute + --include-package=utils --include-package-data=utils + --include-package=hip --include-package-data=hip src/rocprof-compute # Remove library rpath from executable COMMAND patchelf --remove-rpath rocprof-compute.bin # Move to build directory diff --git a/projects/rocprofiler-compute/docker/Dockerfile.standalone b/projects/rocprofiler-compute/docker/Dockerfile.standalone index 29df2b0453..034264ca30 100644 --- a/projects/rocprofiler-compute/docker/Dockerfile.standalone +++ b/projects/rocprofiler-compute/docker/Dockerfile.standalone @@ -26,6 +26,8 @@ CMD ["/bin/bash", "-c", "\ --enable-plugin=no-qt \ --include-data-files=VERSION*=./ \ --include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp \ + --include-data-files=src/lib/helper.cpp=lib/helper.cpp \ + --include-data-files=src/lib/helper.hpp=lib/helper.hpp \ --include-package=dash_svg --include-package-data=dash_svg \ --include-package=dash_bootstrap_components \ --include-package-data=dash_bootstrap_components \ @@ -38,6 +40,7 @@ CMD ["/bin/bash", "-c", "\ --include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui \ --include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc \ --include-package=utils --include-package-data=utils \ + --include-package=hip --include-package-data=hip \ src/rocprof-compute \ && patchelf --remove-rpath rocprof-compute.bin \ "] diff --git a/projects/rocprofiler-compute/src/hip/hip.py b/projects/rocprofiler-compute/src/hip/hip.py index 04af444127..823a4844cb 100644 --- a/projects/rocprofiler-compute/src/hip/hip.py +++ b/projects/rocprofiler-compute/src/hip/hip.py @@ -37,8 +37,9 @@ from ctypes import ( c_uint8, c_void_p, ) +import os -_lib = ctypes.CDLL("libamdhip64.so") +_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libamdhip64.so") # Mirrors struct hipUUID_t @@ -299,7 +300,6 @@ class HIPModule: def hipGetDeviceCount() -> int: - device_count = c_int() status = _lib.hipGetDeviceCount(byref(device_count)) @@ -310,7 +310,6 @@ def hipGetDeviceCount() -> int: def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties: - props = HIPDeviceProperties() res = _lib.hipGetDevicePropertiesR0600(byref(props), device_id) @@ -321,7 +320,6 @@ def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties: def hipMalloc(size: int) -> HIPDeviceMemory: - buf_size = c_size_t(size) ptr = c_void_p() @@ -334,7 +332,6 @@ def hipMalloc(size: int) -> HIPDeviceMemory: def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None: - res = _lib.hipMemcpyHtoD(dst.ptr, src, size) if res != 0: @@ -342,7 +339,6 @@ def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None: def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None: - res = _lib.hipMemcpyDtoH(dst, src.ptr, size) if res != 0: @@ -350,7 +346,6 @@ def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None: def hipSetDevice(id: int) -> None: - status = _lib.hipSetDevice(id) if status != 0: @@ -358,7 +353,6 @@ def hipSetDevice(id: int) -> None: def hipDeviceSynchronize() -> None: - res = _lib.hipDeviceSynchronize() if res != 0: @@ -366,7 +360,6 @@ def hipDeviceSynchronize() -> None: def hipModuleLoadData(code: POINTER) -> HIPModule: - module = c_void_p() res = _lib.hipModuleLoadData(byref(module), code) @@ -377,7 +370,6 @@ def hipModuleLoadData(code: POINTER) -> HIPModule: def hipModuleGetFunction(module: POINTER, name: str) -> POINTER: - name_bytes = name.encode("utf-8") func = c_void_p() @@ -402,7 +394,6 @@ def hipModuleLaunchKernel( kernel_params: POINTER, extra: POINTER = None, ) -> None: - res = _lib.hipModuleLaunchKernel( func, grid_dim_x, @@ -422,7 +413,6 @@ def hipModuleLaunchKernel( def hipEventCreate() -> HIPEvent: - handle = c_void_p() res = _lib.hipEventCreate(byref(handle)) @@ -434,7 +424,6 @@ def hipEventCreate() -> HIPEvent: def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None: - res = _lib.hipEventRecord(event.handle, stream) if res != 0: @@ -442,7 +431,6 @@ def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None: def hipEventElapsedTime(start: HIPEvent, stop: HIPEvent) -> float: - ms = c_float() res = _lib.hipEventElapsedTime(byref(ms), start.handle, stop.handle) diff --git a/projects/rocprofiler-compute/src/hip/hiprtc.py b/projects/rocprofiler-compute/src/hip/hiprtc.py index a0e33136eb..823bd4f0f1 100644 --- a/projects/rocprofiler-compute/src/hip/hiprtc.py +++ b/projects/rocprofiler-compute/src/hip/hiprtc.py @@ -33,8 +33,9 @@ from ctypes import ( c_size_t, c_void_p, ) +import os -_lib = ctypes.CDLL("libhiprtc.so") +_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libhiprtc.so") _lib.hiprtcCreateProgram.restype = c_int @@ -112,7 +113,6 @@ class HIPRTCProgram: # TODO: Handle headers def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram: - src_bytes = src.encode("utf-8") name_bytes = name.encode("utf-8") @@ -128,7 +128,6 @@ def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram: # TODO: Handle compile options def hiprtcCompileProgram(prog: HIPRTCProgram) -> None: - res = _lib.hiprtcCompileProgram(prog.handle, 0, None) if res != 0: @@ -136,7 +135,6 @@ def hiprtcCompileProgram(prog: HIPRTCProgram) -> None: def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int: - size = c_size_t(0) res = _lib.hiprtcGetProgramLogSize(prog.handle, byref(size)) @@ -148,7 +146,6 @@ def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int: def hiprtcGetProgramLog(prog: HIPRTCProgram) -> str: - size = hiprtcGetProgramLogSize(prog) buf = (ctypes.c_char * size)() @@ -171,7 +168,6 @@ def hiprtcGetCodeSize(prog: HIPRTCProgram) -> int: def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER: - size = hiprtcGetCodeSize(prog) buf = (c_char * size)() res = _lib.hiprtcGetCode(prog.handle, buf) @@ -183,7 +179,6 @@ def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER: def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str: - expr_bytes = name_expression.encode("utf-8") name_bytes = c_char_p() @@ -196,7 +191,6 @@ def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str: def hiprtcAddNameExpression(prog: HIPRTCProgram, name_expression: str) -> None: - expr_bytes = name_expression.encode("utf-8") res = _lib.hiprtcAddNameExpression(prog.handle, expr_bytes) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py index ad8fd3ed33..52ff39065b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py @@ -429,13 +429,20 @@ class RocProfCompute_Base: and not args.attach_pid ): # Use native counter collection tool - native_tool_path = str( - Path(sys.argv[0]).resolve().parents[2] - / "lib" - / "rocprofiler-compute" - / "librocprofiler-compute-tool.so" - ) - if not Path(native_tool_path).is_file(): + try: + native_tool_path = str( + Path(sys.argv[0]).resolve().parents[2] + / "lib" + / "rocprofiler-compute" + / "librocprofiler-compute-tool.so" + ) + except Exception as e: + console_debug( + f"Could not find pre-built native tool: {e}. " + "Building native tool now." + ) + native_tool_path = None + if not (native_tool_path and Path(native_tool_path).is_file()): # Build native counter collection tool if not exists native_tool_path = str( Path( diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py index edf7bfee24..9349ddc9ab 100644 --- a/projects/rocprofiler-compute/src/utils/benchmark.py +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -170,7 +170,6 @@ DEFAULT_DATASET_SIZE = 512 * 1024 * 1024 def show_progress(pct: float) -> None: - bar_char = "|" bar_size = 60 @@ -182,7 +181,6 @@ def show_progress(pct: float) -> None: # Returns a named tuple with the mean, std deviation and confidence def calc_stats(samples: list) -> Stats: - mean = sum(samples) / len(samples) stdev = 0.0 @@ -213,7 +211,6 @@ class Program: self.module = hip.hipModuleLoadData(self.code) def get_kernel(self, kernel_name: str) -> POINTER: - # TODO: Why doesn't hiprtcGetLoweredName work with non-template functions? if "<" in kernel_name: kernel_name = hiprtc.hiprtcGetLoweredName(self.prog, kernel_name) @@ -230,7 +227,6 @@ def launch_kernel( stream: POINTER, args: list[Any] = [], ) -> None: - # Convert to native types args_converted = [] for arg in args: @@ -262,7 +258,6 @@ def launch_kernel( # Retrieve the gfx architecture def get_gfx_arch(device: int) -> str: - arch_str = hip.hipGetDeviceProperties(device).gcnArchName # Parse out only gfx @@ -280,7 +275,6 @@ def run_get_samples( stream: POINTER, args: list[Any] = [], ) -> list[float]: - event_start = hip.hipEventCreate() event_stop = hip.hipEventCreate() @@ -405,8 +399,12 @@ def hbm_bw_benchmark(device: int) -> PerfMetrics: perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence) event_ms = total_bytes / mean / 1e6 + print( - f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:.1f} GB/sec" + f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, " + f"workgroups:{workgroups}, experiments:{num_experiments}, " + f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, " + f"mean:{mean:.1f} GB/sec, stdev:{stdev:.1f} GB/sec" ) return perf_metrics @@ -463,7 +461,10 @@ def cache_bw_bench(device: int, type: str, iters: int) -> PerfMetrics: event_ms = total_bytes / mean / 1e6 print( - f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec" + f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, " + f"workgroups:{workgroups}, experiments:{num_experiments}, " + f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, " + f"mean:{mean:.1f} GB/sec, stdev:{stdev:1f} GB/sec" ) return perf_metrics @@ -511,7 +512,6 @@ extern "C" __global__ void LDS_bw(int numIter, float *dummy) def lds_bw_benchmark(device: int) -> PerfMetrics: - num_experiments = DEFAULT_NUM_EXPERIMENTS workgroup_size = DEFAULT_WORKGROUP_SIZE @@ -553,7 +553,10 @@ def lds_bw_benchmark(device: int) -> PerfMetrics: event_ms = total_bytes / mean / 1e6 print( - f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec" + f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, " + f"workgroups:{workgroups}, experiments:{num_experiments}, " + f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, " + f"mean:{mean:.1f} GB/sec, stdev:{stdev:1f} GB/sec" ) return perf_metrics @@ -635,7 +638,9 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: event_ms = total_flops / mean / 1e6 print( - f"Peak {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, {unit}:{total_flops}, duration:{event_ms:.1f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS" + f"workgroups:{workgroups}, experiments:{num_experiments}, " + f"{unit}:{total_flops}, duration:{event_ms:.1f} ms, " + f"mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS" ) return perf_metrics @@ -862,30 +867,80 @@ template __global__ void mfma_f8f6f4(int iter, float *dummy) case FP8_E4M3: // fp8 x fp8 for(int i = 0; i < iter; ++i) { - result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 0, 0, 0, 0, 0, 0); - } + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a, + a, + result, + 0, + 0, + 0, + 0, + 0, + 0 + ); + } case BF8_E5M2: // bf8 x bf8 for(int i = 0; i < iter; ++i) { - result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 1, 1, 0, 0, 0, 0); + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a, + a, + result, + 1, + 1, + 0, + 0, + 0, + 0 + ); } break; case FP6_E2M3: // fp6 x fp6 for(int i = 0; i < iter; ++i) { - result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 2, 2, 0, 0, 0, 0); + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a, + a, + result, + 2, + 2, + 0, + 0, + 0, + 0 + ); } break; case BF6_E3M2: // bf6 x bf6 for(int i = 0; i < iter; ++i) { - result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 3, 3, 0, 0, 0, 0); + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a, + a, + result, + 3, + 3, + 0, + 0, + 0, + 0 + ); } break; case FP4_E2M1: // fp4 x fp4 for(int i = 0; i < iter; ++i) { - result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 4, 4, 0, 0, 0, 0); + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4( + a, + a, + result, + 4, + 4, + 0, + 0, + 0, + 0 + ); } break; } @@ -954,79 +1009,69 @@ def mfma_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: event_ms = total_flops / mean / 1e6 print( - f"Peak MFMA {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{experiments}, {unit}:{total_flops}, duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS" + f"Peak MFMA {unit}s ({type}), GPU ID: {device}, " + f"workgroupSize:{workgroup_size}, workgroups:{workgroups}, " + f"experiments:{experiments}, {unit}:{total_flops}, " + f"duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, " + f"stdev:{stdev:.1f} GFLOPS" ) return perf_metrics def mfma_f32_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "F32", "FLOP", "GFLOPS") def mfma_f16_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "F16", "FLOP", "GFLOPS") def mfma_bf16_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "BF16", "FLOP", "GFLOPS") def mfma_f64_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "F64", "FLOP", "GFLOPS") def mfma_f8_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "F8", "FLOP", "GFLOPS") def mfma_i8_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "I8", "IOP", "GOPS") def mfma_f4_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "F4", "FLOP", "GFLOPS") def mfma_f6_bench(device: int) -> PerfMetrics: - return mfma_bench(device, "F6", "FLOP", "GFLOPS") def fp16_benchmark(device: int) -> PerfMetrics: - return flops_bench(device, "FP16", "FLOP", "GFLOPS") def fp32_benchmark(device: int) -> PerfMetrics: - return flops_bench(device, "FP32", "FLOP", "GFLOPS") def fp64_benchmark(device: int) -> PerfMetrics: - return flops_bench(device, "FP64", "FLOP", "GFLOPS") def int8_benchmark(device: int) -> PerfMetrics: - return flops_bench(device, "INT8", "IOP", "GOPS") def int32_benchmark(device: int) -> PerfMetrics: - return flops_bench(device, "INT32", "IOP", "GOPS") def int64_benchmark(device: int) -> PerfMetrics: - return flops_bench(device, "INT64", "IOP", "GOPS") @@ -1055,7 +1100,6 @@ tests = { # Run the roofine tests on the specified device def run_benchmark(device: int) -> dict[PerfMetrics]: - metrics_dict = {} arch = get_gfx_arch(device) @@ -1079,7 +1123,6 @@ def run_benchmark(device: int) -> dict[PerfMetrics]: # Returns a dictionary mapping device ID to dictionary of # metrics def run_on_devices(devices: list[int]) -> dict[dict[PerfMetrics]]: - metrics = {} for d in devices: metrics[d] = run_benchmark(d)