From 1c7293e6d06470795822697c80cec2c565f93018 Mon Sep 17 00:00:00 2001 From: habajpai-amd Date: Fri, 12 Sep 2025 17:32:30 +0530 Subject: [PATCH] Add bounds checks in transpose_a for both load and store so edge tiles don't read/write past MxN (#950) --- .../examples/transpose/transpose.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/projects/rocprofiler-systems/examples/transpose/transpose.cpp b/projects/rocprofiler-systems/examples/transpose/transpose.cpp index 31f8c21067..d68d9b9486 100644 --- a/projects/rocprofiler-systems/examples/transpose/transpose.cpp +++ b/projects/rocprofiler-systems/examples/transpose/transpose.cpp @@ -85,13 +85,18 @@ transpose_a(int* in, int* out, int M, int N) { __shared__ int tile[TILE_DIM][TILE_DIM]; - int idx = (blockIdx.y * blockDim.y + threadIdx.y) * M + blockIdx.x * blockDim.x + - threadIdx.x; - tile[threadIdx.y][threadIdx.x] = in[idx]; + int tx = threadIdx.x, ty = threadIdx.y; + int x = blockIdx.x * TILE_DIM + tx; + int y = blockIdx.y * TILE_DIM + ty; + + int v = 0; + if(x < M && y < N) v = in[y * M + x]; // guarded load + tile[ty][tx] = v; __syncthreads(); - idx = (blockIdx.x * blockDim.x + threadIdx.y) * N + blockIdx.y * blockDim.y + - threadIdx.x; - out[idx] = tile[threadIdx.x][threadIdx.y]; + + int xt = blockIdx.y * TILE_DIM + tx; + int yt = blockIdx.x * TILE_DIM + ty; + if(xt < N && yt < M) out[yt * N + xt] = tile[tx][ty]; // guarded store } namespace