/* ############################################################################## # MIT License # # Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ############################################################################## */ #include #include #include #include #define TILE_SIZE 32 // Maximum block size: 32 x 32 = 1024 threads/block #define N 4096 // Matrix size: 4096 x 4096 (~67M elements) // Helper macro for HIP error checking #define HIP_CHECK(call) \ do { \ hipError_t err = call; \ if (err != hipSuccess) { \ std::cerr << "HIP error: " << hipGetErrorString(err) \ << " at " << __FILE__ << ":" << __LINE__ \ << std::endl; \ std::exit(EXIT_FAILURE); \ } \ } while(0) __global__ void matMulKernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int width) { __shared__ float tileA[TILE_SIZE][TILE_SIZE]; __shared__ float tileB[TILE_SIZE][TILE_SIZE]; int row = blockIdx.y * TILE_SIZE + threadIdx.y; int col = blockIdx.x * TILE_SIZE + threadIdx.x; float sum = 0.0f; // Loop over tiles of input matrices for (int t = 0; t < width / TILE_SIZE; ++t) { tileA[threadIdx.y][threadIdx.x] = A[row * width + t * TILE_SIZE + threadIdx.x]; tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * width + col]; __syncthreads(); #pragma unroll for (int i = 0; i < TILE_SIZE; ++i) sum += tileA[threadIdx.y][i] * tileB[i][threadIdx.x]; __syncthreads(); } C[row * width + col] = sum; } int main() { size_t size = N * N * sizeof(float); float *h_A = new float[N * N]; float *h_B = new float[N * N]; float *d_A, *d_B, *d_C; // Initialize matrices with dummy values for (int i = 0; i < N * N; ++i) { h_A[i] = static_cast(i % 100) * 0.01f; h_B[i] = static_cast((i + 1) % 100) * 0.01f; } HIP_CHECK(hipMalloc(&d_A, size)); HIP_CHECK(hipMalloc(&d_B, size)); HIP_CHECK(hipMalloc(&d_C, size)); HIP_CHECK(hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice)); dim3 blockDim(TILE_SIZE, TILE_SIZE); // 32 x 32 = 1024 threads dim3 gridDim(N / TILE_SIZE, N / TILE_SIZE); // 128 x 128 = 16,384 thread blocks std::cout << "Launching kernel with grid: (" << gridDim.x << ", " << gridDim.y << ") and block: (" << blockDim.x << ", " << blockDim.y << ")\n"; auto start = std::chrono::high_resolution_clock::now(); matMulKernel<<>>(d_A, d_B, d_C, N); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed = end - start; std::cout << "Execution time: " << elapsed.count() << " seconds\n"; HIP_CHECK(hipFree(d_A)); HIP_CHECK(hipFree(d_B)); HIP_CHECK(hipFree(d_C)); delete[] h_A; delete[] h_B; return 0; }