diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index ba52f44495..81ef31a2c5 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -601,7 +601,8 @@ add_custom_target( # Create VERSION.sha file COMMAND git -C ${PROJECT_SOURCE_DIR} rev-parse HEAD > VERSION.sha # Build standalone binary - # NOTE: --no-deployment-flag=self-execution is used to avoid self-execution and fork + # NOTE: --no-deployment-flag=self-execution is used to avoid self-execution + # and fork # bombs as explained in # https://nuitka.net/user-documentation/common-issue-solutions.html#fork-bombs-self-execution COMMAND diff --git a/projects/rocprofiler-compute/sample/mat_mul_max.hip b/projects/rocprofiler-compute/sample/mat_mul_max.hip index 3f32e4776b..87a30198b8 100644 --- a/projects/rocprofiler-compute/sample/mat_mul_max.hip +++ b/projects/rocprofiler-compute/sample/mat_mul_max.hip @@ -6,6 +6,18 @@ #define TILE_SIZE 32 // Maximum block size: 32 x 32 = 1024 threads/block #define N 4096 // Matrix size: 4096 x 4096 (~67M elements) +// Helper macro for HIP error checking: evaluates `call` exactly once and, on any result other than hipSuccess, prints the HIP error string with file and line to std::cerr and exits with EXIT_FAILURE. The status local has a macro-specific name so it cannot capture an identifier used inside `call`. +#define HIP_CHECK(call) \ + do { \ + const hipError_t hip_check_status_ = (call); \ + if (hip_check_status_ != hipSuccess) { \ + std::cerr << "HIP error: " << hipGetErrorString(hip_check_status_) \ + << " at " << __FILE__ << ":" << __LINE__ \ + << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while(0) + __global__ void matMulKernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int width) { __shared__ float tileA[TILE_SIZE][TILE_SIZE]; __shared__ float tileB[TILE_SIZE][TILE_SIZE]; @@ -15,6 +27,7 @@ __global__ void matMulKernel(const float* __restrict__ A, const float* __restric float sum = 0.0f; + // Loop over tiles of input matrices for (int t = 0; t < width / TILE_SIZE; ++t) { tileA[threadIdx.y][threadIdx.x] = A[row * width + t * TILE_SIZE + threadIdx.x]; tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * width + 
col]; @@ -43,12 +56,12 @@ int main() { h_B[i] = static_cast<float>((i + 1) % 100) * 0.01f; } - hipMalloc(&d_A, size); - hipMalloc(&d_B, size); - hipMalloc(&d_C, size); + HIP_CHECK(hipMalloc(&d_A, size)); + HIP_CHECK(hipMalloc(&d_B, size)); + HIP_CHECK(hipMalloc(&d_C, size)); - hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice); - hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice); + HIP_CHECK(hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice)); dim3 blockDim(TILE_SIZE, TILE_SIZE); // 32 x 32 = 1024 threads dim3 gridDim(N / TILE_SIZE, N / TILE_SIZE); // 128 x 128 = 16,384 thread blocks @@ -59,15 +72,16 @@ int main() { auto start = std::chrono::high_resolution_clock::now(); matMulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N); - hipDeviceSynchronize(); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> elapsed = end - start; std::cout << "Execution time: " << elapsed.count() << " seconds\n"; - hipFree(d_A); - hipFree(d_B); - hipFree(d_C); + HIP_CHECK(hipFree(d_A)); + HIP_CHECK(hipFree(d_B)); + HIP_CHECK(hipFree(d_C)); delete[] h_A; delete[] h_B; diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py index 4462b26e89..1674c02033 100755 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -1174,7 +1174,10 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows console_error("analysis", f"{dispatch_id} is an invalid dispatch id.") - if isinstance(workload.filter_dispatch_ids[0], str) and ">" in workload.filter_dispatch_ids[0]: + if ( + isinstance(workload.filter_dispatch_ids[0], str) + and ">" in workload.filter_dispatch_ids[0] + ): dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0]) df = df[ 
df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"] diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 07c7e247e2..00f3ba01bf 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -1817,7 +1817,7 @@ def test_pc_sampling_host_trap(binary_handler_profile_rocprof_compute): "--pc-sampling-method", "host_trap", "--pc-sampling-interval", - "1048576", + "256", ] workload_dir = test_utils.get_output_dir() _ = binary_handler_profile_rocprof_compute(