change interval for host_trap in unit test to adapt to single kernel (#1064)

2025-09-19 17:21:02 -04:00
commit 775ac73d25
@@ -601,7 +601,8 @@ add_custom_target(
    # Create VERSION.sha file
    COMMAND git -C ${PROJECT_SOURCE_DIR} rev-parse HEAD > VERSION.sha
    # Build standalone binary
-    # NOTE: --no-deployment-flag=self-execution is used to avoid self-execution and fork
+    # NOTE: --no-deployment-flag=self-execution is used to avoid self-execution
+    # and fork
    # bombs as explained in
    # https://nuitka.net/user-documentation/common-issue-solutions.html#fork-bombs-self-execution
    COMMAND
@@ -6,6 +6,18 @@
 #define TILE_SIZE 32     // Maximum block size: 32 x 32 = 1024 threads/block
 #define N 4096           // Matrix size: 4096 x 4096 (~67M elements)

+// Helper macro for HIP error checking
+#define HIP_CHECK(call)                                              \
+    do {                                                            \
+        hipError_t err = call;                                      \
+        if (err != hipSuccess) {                                    \
+            std::cerr << "HIP error: " << hipGetErrorString(err)    \
+                      << " at " << __FILE__ << ":" << __LINE__      \
+                      << std::endl;                                 \
+            std::exit(EXIT_FAILURE);                                \
+        }                                                           \
+    } while(0)
+
 __global__ void matMulKernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int width) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];
@@ -15,6 +27,7 @@ __global__ void matMulKernel(const float* __restrict__ A, const float* __restric

    float sum = 0.0f;

+    // Loop over tiles of input matrices
    for (int t = 0; t < width / TILE_SIZE; ++t) {
        tileA[threadIdx.y][threadIdx.x] = A[row * width + t * TILE_SIZE + threadIdx.x];
        tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * width + col];
@@ -43,12 +56,12 @@ int main() {
        h_B[i] = static_cast<float>((i + 1) % 100) * 0.01f;
    }

-    hipMalloc(&d_A, size);
-    hipMalloc(&d_B, size);
-    hipMalloc(&d_C, size);
+    HIP_CHECK(hipMalloc(&d_A, size));
+    HIP_CHECK(hipMalloc(&d_B, size));
+    HIP_CHECK(hipMalloc(&d_C, size));

-    hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);
-    hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);
+    HIP_CHECK(hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice));

    dim3 blockDim(TILE_SIZE, TILE_SIZE);  // 32 x 32 = 1024 threads
    dim3 gridDim(N / TILE_SIZE, N / TILE_SIZE);  // 128 x 128 = 16,384 thread blocks
@@ -59,15 +72,16 @@ int main() {
    auto start = std::chrono::high_resolution_clock::now();

    matMulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
-    hipDeviceSynchronize();
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Execution time: " << elapsed.count() << " seconds\n";

-    hipFree(d_A);
-    hipFree(d_B);
-    hipFree(d_C);
+    HIP_CHECK(hipFree(d_A));
+    HIP_CHECK(hipFree(d_B));
+    HIP_CHECK(hipFree(d_C));
    delete[] h_A;
    delete[] h_B;

@@ -1174,7 +1174,10 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
        if int(dispatch_id) >= len(df):  # subtract 2 bc of the two header rows
            console_error("analysis", f"{dispatch_id} is an invalid dispatch id.")

-    if isinstance(workload.filter_dispatch_ids[0], str) and ">" in workload.filter_dispatch_ids[0]:
+    if (
+        isinstance(workload.filter_dispatch_ids[0], str)
+        and ">" in workload.filter_dispatch_ids[0]
+    ):
        dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0])
        df = df[
            df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"]
@@ -1817,7 +1817,7 @@ def test_pc_sampling_host_trap(binary_handler_profile_rocprof_compute):
        "--pc-sampling-method",
        "host_trap",
        "--pc-sampling-interval",
-        "1048576",
+        "256",
    ]
    workload_dir = test_utils.get_output_dir()
    _ = binary_handler_profile_rocprof_compute(