// MIT License // // Copyright (c) 2024-2025 ROCm Developer Tools // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include #include namespace { #define M 8192 #define N 8192 #define K 8192 #define TileSize 16 #define BLOCK_SIZE_X 16 #define BLOCK_SIZE_Y 16 #define GRID_SIZE_X (M + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X #define GRID_SIZE_Y (N + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y #define WAVES_PER_BLOCK_MI200_PLUS (BLOCK_SIZE_X * BLOCK_SIZE_Y) / 64 #define HIP_API_CALL(CALL) \ { \ hipError_t error_ = (CALL); \ if(error_ != hipSuccess) \ { \ fprintf(stderr, \ "%s:%d :: HIP error : %s\n", \ __FILE__, \ __LINE__, \ hipGetErrorString(error_)); \ throw std::runtime_error("hip_api_call"); \ } \ } } // namespace namespace { void check_hip_error(void); } // namespace __global__ void matrix_multiply(float* A, float* B, float* Out, int /*m*/, int n, int k) { int gid_x = blockDim.x * blockIdx.x + threadIdx.x; int gid_y = blockDim.y * blockIdx.y + threadIdx.y; if(gid_x < N && gid_y < M) { float sum = 0; for(int i = 0; i < k; ++i) { sum += A[gid_y * k + i] * B[i * n + gid_x]; } Out[gid_y * n + gid_x] = sum; } } #if 1 __global__ void matrix_multiply_tile(float* A, float* B, float* Out, int m, int n, int k) { __shared__ float subTileM[TileSize][TileSize]; __shared__ float subTileN[TileSize][TileSize]; int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; int row = by * TileSize + ty; int col = bx * TileSize + tx; float sum = 0; for(int i = 0; i < ((k - 1) / TileSize + 1); i++) { int curr_l = row * k + i * TileSize + tx; int curr_r = (i * TileSize + ty) * n + col; if(i * TileSize + tx < k && row < m) { subTileM[ty][tx] = A[curr_l]; } else { subTileM[ty][tx] = 0.0; } if(i * TileSize + ty < k && col < n) { subTileN[ty][tx] = B[curr_r]; } else { subTileN[ty][tx] = 0.0; } __syncthreads(); for(int j = 0; j < TileSize; j++) { if(j + TileSize * i < k) { sum += subTileM[ty][j] * subTileN[j][tx]; } } __syncthreads(); } if(row < m && col < n) { Out[row * n + col] = sum; } } #endif void run_hip_app() { std::vector A(M * K); std::vector B(K * N); std::vector Out(M * N); // Randomly initialize the matrices for(int i = 0; i < M * K; ++i) { A[i] = (float) rand() / (float) RAND_MAX; } for(int i = 0; i < K * N; ++i) { B[i] = (float) rand() / (float) RAND_MAX; } // Allocate GPU Memory float *d_A, *d_B, *d_Out; HIP_API_CALL(hipMalloc(&d_A, sizeof(float) * M * K)); HIP_API_CALL(hipMalloc(&d_B, sizeof(float) * K * N)); HIP_API_CALL(hipMalloc(&d_Out, sizeof(float) * M * N)); // Copy data to GPU HIP_API_CALL(hipMemcpy(d_A, A.data(), sizeof(float) * M * K, hipMemcpyHostToDevice)); HIP_API_CALL(hipMemcpy(d_B, B.data(), sizeof(float) * K * N, hipMemcpyHostToDevice)); // Run the kernel dim3 block_size(BLOCK_SIZE_X, BLOCK_SIZE_Y); dim3 grid_size((M + block_size.x - 1) / block_size.x, (N + block_size.y - 1) / block_size.y); matrix_multiply<<>>(d_A, d_B, d_Out, M, N, K); check_hip_error(); matrix_multiply_tile<<>>(d_A, d_B, d_Out, M, N, K); check_hip_error(); // Copy data back to CPU HIP_API_CALL(hipMemcpy(Out.data(), d_Out, sizeof(float) * M * N, hipMemcpyDeviceToHost)); // Free GPU Memory HIP_API_CALL(hipFree(d_A)); HIP_API_CALL(hipFree(d_B)); HIP_API_CALL(hipFree(d_Out)); } #define DEVICE_ID 0 int main(int /*argc*/, char** /*argv*/) { int deviceId = DEVICE_ID; auto status = hipSetDevice(deviceId); assert(status == hipSuccess); HIP_API_CALL(status); int currDeviceId = -1; status = hipGetDevice(&currDeviceId); HIP_API_CALL(status); assert(status == hipSuccess); assert(deviceId == currDeviceId); for(int i = 0; i < 1; i++) { std::cout << "<<< MatMul starts" << std::endl; run_hip_app(); std::cout << ">>> MatMul ends" << std::endl; } return 0; } namespace { void check_hip_error(void) { hipError_t err = hipGetLastError(); if(err != hipSuccess) { std::cerr << "Error: " << hipGetErrorString(err) << std::endl; throw std::runtime_error("hip_api_call"); } } } // namespace