Change the host_trap PC-sampling interval in the unit test to suit a single-kernel workload (#1064)

Este commit está contenido en:
ywang103-amd
2025-09-19 17:21:02 -04:00
cometido por GitHub
padre ec4d4b8a0d
commit 775ac73d25
Se han modificado 4 ficheros con 30 adiciones y 12 borrados
+2 -1
Ver fichero
@@ -601,7 +601,8 @@ add_custom_target(
# Create VERSION.sha file
COMMAND git -C ${PROJECT_SOURCE_DIR} rev-parse HEAD > VERSION.sha
# Build standalone binary
# NOTE: --no-deployment-flag=self-execution is used to avoid self-execution and fork
# NOTE: --no-deployment-flag=self-execution is used to avoid self-execution
# and fork
# bombs as explained in
# https://nuitka.net/user-documentation/common-issue-solutions.html#fork-bombs-self-execution
COMMAND
@@ -6,6 +6,18 @@
#define TILE_SIZE 32 // Maximum block size: 32 x 32 = 1024 threads/block
#define N 4096 // Matrix size: 4096 x 4096 (~67M elements)
// Helper macro for HIP error checking.
//
// Evaluates `call` exactly once. If the result is not hipSuccess, the HIP
// error string plus the file and line of the call site are written to
// std::cerr and the process terminates with EXIT_FAILURE.
//
// The argument is parenthesized and the local uses a trailing-underscore name
// so that an invocation such as HIP_CHECK(err) does not expand into a
// self-initialization of a shadowing variable.
// The do { ... } while (0) wrapper makes the macro behave as a single
// statement (safe inside an unbraced if/else).
#define HIP_CHECK(call)                                                      \
    do {                                                                     \
        const hipError_t hip_check_err_ = (call);                            \
        if (hip_check_err_ != hipSuccess) {                                  \
            std::cerr << "HIP error: " << hipGetErrorString(hip_check_err_)  \
                      << " at " << __FILE__ << ":" << __LINE__               \
                      << std::endl;                                          \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)
__global__ void matMulKernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int width) {
__shared__ float tileA[TILE_SIZE][TILE_SIZE];
__shared__ float tileB[TILE_SIZE][TILE_SIZE];
@@ -15,6 +27,7 @@ __global__ void matMulKernel(const float* __restrict__ A, const float* __restric
float sum = 0.0f;
// Loop over tiles of input matrices
for (int t = 0; t < width / TILE_SIZE; ++t) {
tileA[threadIdx.y][threadIdx.x] = A[row * width + t * TILE_SIZE + threadIdx.x];
tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * width + col];
@@ -43,12 +56,12 @@ int main() {
h_B[i] = static_cast<float>((i + 1) % 100) * 0.01f;
}
hipMalloc(&d_A, size);
hipMalloc(&d_B, size);
hipMalloc(&d_C, size);
HIP_CHECK(hipMalloc(&d_A, size));
HIP_CHECK(hipMalloc(&d_B, size));
HIP_CHECK(hipMalloc(&d_C, size));
hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);
hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);
HIP_CHECK(hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice));
dim3 blockDim(TILE_SIZE, TILE_SIZE); // 32 x 32 = 1024 threads
dim3 gridDim(N / TILE_SIZE, N / TILE_SIZE); // 128 x 128 = 16,384 thread blocks
@@ -59,15 +72,16 @@ int main() {
auto start = std::chrono::high_resolution_clock::now();
matMulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
hipDeviceSynchronize();
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipDeviceSynchronize());
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << "Execution time: " << elapsed.count() << " seconds\n";
hipFree(d_A);
hipFree(d_B);
hipFree(d_C);
HIP_CHECK(hipFree(d_A));
HIP_CHECK(hipFree(d_B));
HIP_CHECK(hipFree(d_C));
delete[] h_A;
delete[] h_B;
@@ -1174,7 +1174,10 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows
console_error("analysis", f"{dispatch_id} is an invalid dispatch id.")
if isinstance(workload.filter_dispatch_ids[0], str) and ">" in workload.filter_dispatch_ids[0]:
if (
isinstance(workload.filter_dispatch_ids[0], str)
and ">" in workload.filter_dispatch_ids[0]
):
dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0])
df = df[
df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"]
@@ -1817,7 +1817,7 @@ def test_pc_sampling_host_trap(binary_handler_profile_rocprof_compute):
"--pc-sampling-method",
"host_trap",
"--pc-sampling-interval",
"1048576",
"256",
]
workload_dir = test_utils.get_output_dir()
_ = binary_handler_profile_rocprof_compute(