edb51fc861
* Update LICENSE
* Update conf.py
* Update copyright year
* [fix] Update copyright year
* Update copyright year "ROCm Developer Tools"
* Add license headers to c++ files
* Add license to *.py
* Update licenses in rocdecode sources
---------
Co-authored-by: srawat <120587655+SwRaw@users.noreply.github.com>
Co-authored-by: Mythreya <mythreya.kuricheti@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
[ROCm/rocprofiler-sdk commit: 97b7a6315d]
225 lignes
6.7 KiB
C++
225 lignes
6.7 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2024-2025 ROCm Developer Tools
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
#include <hip/hip_runtime.h>
|
|
|
|
#include <stdio.h>
|
|
#include <cassert>
|
|
#include <iostream>
|
|
#include <random>
|
|
|
|
namespace
|
|
{
|
|
#define M 8192
|
|
#define N 8192
|
|
#define K 8192
|
|
#define TileSize 16
|
|
#define BLOCK_SIZE_X 16
|
|
#define BLOCK_SIZE_Y 16
|
|
#define GRID_SIZE_X (M + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X
|
|
#define GRID_SIZE_Y (N + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y
|
|
#define WAVES_PER_BLOCK_MI200_PLUS (BLOCK_SIZE_X * BLOCK_SIZE_Y) / 64
|
|
|
|
#define HIP_API_CALL(CALL) \
|
|
{ \
|
|
hipError_t error_ = (CALL); \
|
|
if(error_ != hipSuccess) \
|
|
{ \
|
|
fprintf(stderr, \
|
|
"%s:%d :: HIP error : %s\n", \
|
|
__FILE__, \
|
|
__LINE__, \
|
|
hipGetErrorString(error_)); \
|
|
throw std::runtime_error("hip_api_call"); \
|
|
} \
|
|
}
|
|
} // namespace
|
|
|
|
namespace
|
|
{
|
|
void
|
|
check_hip_error(void);
|
|
} // namespace
|
|
|
|
__global__ void
|
|
matrix_multiply(float* A, float* B, float* Out, int /*m*/, int n, int k)
|
|
{
|
|
int gid_x = blockDim.x * blockIdx.x + threadIdx.x;
|
|
int gid_y = blockDim.y * blockIdx.y + threadIdx.y;
|
|
|
|
if(gid_x < N && gid_y < M)
|
|
{
|
|
float sum = 0;
|
|
for(int i = 0; i < k; ++i)
|
|
{
|
|
sum += A[gid_y * k + i] * B[i * n + gid_x];
|
|
}
|
|
|
|
Out[gid_y * n + gid_x] = sum;
|
|
}
|
|
}
|
|
|
|
#if 1
|
|
__global__ void
|
|
matrix_multiply_tile(float* A, float* B, float* Out, int m, int n, int k)
|
|
{
|
|
__shared__ float subTileM[TileSize][TileSize];
|
|
__shared__ float subTileN[TileSize][TileSize];
|
|
|
|
int bx = blockIdx.x;
|
|
int by = blockIdx.y;
|
|
int tx = threadIdx.x;
|
|
int ty = threadIdx.y;
|
|
|
|
int row = by * TileSize + ty;
|
|
int col = bx * TileSize + tx;
|
|
|
|
float sum = 0;
|
|
for(int i = 0; i < ((k - 1) / TileSize + 1); i++)
|
|
{
|
|
int curr_l = row * k + i * TileSize + tx;
|
|
int curr_r = (i * TileSize + ty) * n + col;
|
|
|
|
if(i * TileSize + tx < k && row < m)
|
|
{
|
|
subTileM[ty][tx] = A[curr_l];
|
|
}
|
|
else
|
|
{
|
|
subTileM[ty][tx] = 0.0;
|
|
}
|
|
|
|
if(i * TileSize + ty < k && col < n)
|
|
{
|
|
subTileN[ty][tx] = B[curr_r];
|
|
}
|
|
else
|
|
{
|
|
subTileN[ty][tx] = 0.0;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
for(int j = 0; j < TileSize; j++)
|
|
{
|
|
if(j + TileSize * i < k)
|
|
{
|
|
sum += subTileM[ty][j] * subTileN[j][tx];
|
|
}
|
|
}
|
|
|
|
__syncthreads();
|
|
}
|
|
|
|
if(row < m && col < n)
|
|
{
|
|
Out[row * n + col] = sum;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
void
|
|
run_hip_app()
|
|
{
|
|
std::vector<float> A(M * K);
|
|
std::vector<float> B(K * N);
|
|
std::vector<float> Out(M * N);
|
|
|
|
// Randomly initialize the matrices
|
|
for(int i = 0; i < M * K; ++i)
|
|
{
|
|
A[i] = (float) rand() / (float) RAND_MAX;
|
|
}
|
|
|
|
for(int i = 0; i < K * N; ++i)
|
|
{
|
|
B[i] = (float) rand() / (float) RAND_MAX;
|
|
}
|
|
|
|
// Allocate GPU Memory
|
|
float *d_A, *d_B, *d_Out;
|
|
HIP_API_CALL(hipMalloc(&d_A, sizeof(float) * M * K));
|
|
HIP_API_CALL(hipMalloc(&d_B, sizeof(float) * K * N));
|
|
HIP_API_CALL(hipMalloc(&d_Out, sizeof(float) * M * N));
|
|
|
|
// Copy data to GPU
|
|
HIP_API_CALL(hipMemcpy(d_A, A.data(), sizeof(float) * M * K, hipMemcpyHostToDevice));
|
|
HIP_API_CALL(hipMemcpy(d_B, B.data(), sizeof(float) * K * N, hipMemcpyHostToDevice));
|
|
|
|
// Run the kernel
|
|
dim3 block_size(BLOCK_SIZE_X, BLOCK_SIZE_Y);
|
|
dim3 grid_size((M + block_size.x - 1) / block_size.x, (N + block_size.y - 1) / block_size.y);
|
|
matrix_multiply<<<grid_size, block_size>>>(d_A, d_B, d_Out, M, N, K);
|
|
check_hip_error();
|
|
matrix_multiply_tile<<<grid_size, block_size>>>(d_A, d_B, d_Out, M, N, K);
|
|
check_hip_error();
|
|
|
|
// Copy data back to CPU
|
|
HIP_API_CALL(hipMemcpy(Out.data(), d_Out, sizeof(float) * M * N, hipMemcpyDeviceToHost));
|
|
|
|
// Free GPU Memory
|
|
HIP_API_CALL(hipFree(d_A));
|
|
HIP_API_CALL(hipFree(d_B));
|
|
HIP_API_CALL(hipFree(d_Out));
|
|
}
|
|
|
|
#define DEVICE_ID 0
|
|
|
|
int
|
|
main(int /*argc*/, char** /*argv*/)
|
|
{
|
|
int deviceId = DEVICE_ID;
|
|
|
|
auto status = hipSetDevice(deviceId);
|
|
assert(status == hipSuccess);
|
|
HIP_API_CALL(status);
|
|
|
|
int currDeviceId = -1;
|
|
status = hipGetDevice(&currDeviceId);
|
|
HIP_API_CALL(status);
|
|
assert(status == hipSuccess);
|
|
assert(deviceId == currDeviceId);
|
|
|
|
for(int i = 0; i < 1; i++)
|
|
{
|
|
std::cout << "<<< MatMul starts" << std::endl;
|
|
run_hip_app();
|
|
std::cout << ">>> MatMul ends" << std::endl;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
namespace
|
|
{
|
|
void
|
|
check_hip_error(void)
|
|
{
|
|
hipError_t err = hipGetLastError();
|
|
if(err != hipSuccess)
|
|
{
|
|
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
|
|
throw std::runtime_error("hip_api_call");
|
|
}
|
|
}
|
|
} // namespace
|