Dateien
rocm-systems/tests/pc_sampling/main.cpp
T
Rawat, Swati 97b7a6315d update copyright date to 2025 (#102)
* Update LICENSE

* Update conf.py

* Update copyright year

* [fix] Update copyright year

* Update copyright year "ROCm Developer Tools"

* Add license headers to c++ files

* Add license to *.py

* Update licenses in rocdecode sources

---------

Co-authored-by: srawat <120587655+SwRaw@users.noreply.github.com>
Co-authored-by: Mythreya <mythreya.kuricheti@amd.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
2025-01-22 19:11:20 -06:00

225 Zeilen
6.7 KiB
C++

// MIT License
//
// Copyright (c) 2024-2025 ROCm Developer Tools
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <cassert>
#include <iostream>
#include <random>
namespace
{
#define M 8192
#define N 8192
#define K 8192
#define TileSize 16
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
#define GRID_SIZE_X (M + BLOCK_SIZE_X - 1) / BLOCK_SIZE_X
#define GRID_SIZE_Y (N + BLOCK_SIZE_Y - 1) / BLOCK_SIZE_Y
#define WAVES_PER_BLOCK_MI200_PLUS (BLOCK_SIZE_X * BLOCK_SIZE_Y) / 64
#define HIP_API_CALL(CALL) \
{ \
hipError_t error_ = (CALL); \
if(error_ != hipSuccess) \
{ \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
__FILE__, \
__LINE__, \
hipGetErrorString(error_)); \
throw std::runtime_error("hip_api_call"); \
} \
}
} // namespace
namespace
{
void
check_hip_error(void);
} // namespace
__global__ void
matrix_multiply(float* A, float* B, float* Out, int /*m*/, int n, int k)
{
int gid_x = blockDim.x * blockIdx.x + threadIdx.x;
int gid_y = blockDim.y * blockIdx.y + threadIdx.y;
if(gid_x < N && gid_y < M)
{
float sum = 0;
for(int i = 0; i < k; ++i)
{
sum += A[gid_y * k + i] * B[i * n + gid_x];
}
Out[gid_y * n + gid_x] = sum;
}
}
#if 1
__global__ void
matrix_multiply_tile(float* A, float* B, float* Out, int m, int n, int k)
{
__shared__ float subTileM[TileSize][TileSize];
__shared__ float subTileN[TileSize][TileSize];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = by * TileSize + ty;
int col = bx * TileSize + tx;
float sum = 0;
for(int i = 0; i < ((k - 1) / TileSize + 1); i++)
{
int curr_l = row * k + i * TileSize + tx;
int curr_r = (i * TileSize + ty) * n + col;
if(i * TileSize + tx < k && row < m)
{
subTileM[ty][tx] = A[curr_l];
}
else
{
subTileM[ty][tx] = 0.0;
}
if(i * TileSize + ty < k && col < n)
{
subTileN[ty][tx] = B[curr_r];
}
else
{
subTileN[ty][tx] = 0.0;
}
__syncthreads();
for(int j = 0; j < TileSize; j++)
{
if(j + TileSize * i < k)
{
sum += subTileM[ty][j] * subTileN[j][tx];
}
}
__syncthreads();
}
if(row < m && col < n)
{
Out[row * n + col] = sum;
}
}
#endif
void
run_hip_app()
{
std::vector<float> A(M * K);
std::vector<float> B(K * N);
std::vector<float> Out(M * N);
// Randomly initialize the matrices
for(int i = 0; i < M * K; ++i)
{
A[i] = (float) rand() / (float) RAND_MAX;
}
for(int i = 0; i < K * N; ++i)
{
B[i] = (float) rand() / (float) RAND_MAX;
}
// Allocate GPU Memory
float *d_A, *d_B, *d_Out;
HIP_API_CALL(hipMalloc(&d_A, sizeof(float) * M * K));
HIP_API_CALL(hipMalloc(&d_B, sizeof(float) * K * N));
HIP_API_CALL(hipMalloc(&d_Out, sizeof(float) * M * N));
// Copy data to GPU
HIP_API_CALL(hipMemcpy(d_A, A.data(), sizeof(float) * M * K, hipMemcpyHostToDevice));
HIP_API_CALL(hipMemcpy(d_B, B.data(), sizeof(float) * K * N, hipMemcpyHostToDevice));
// Run the kernel
dim3 block_size(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 grid_size((M + block_size.x - 1) / block_size.x, (N + block_size.y - 1) / block_size.y);
matrix_multiply<<<grid_size, block_size>>>(d_A, d_B, d_Out, M, N, K);
check_hip_error();
matrix_multiply_tile<<<grid_size, block_size>>>(d_A, d_B, d_Out, M, N, K);
check_hip_error();
// Copy data back to CPU
HIP_API_CALL(hipMemcpy(Out.data(), d_Out, sizeof(float) * M * N, hipMemcpyDeviceToHost));
// Free GPU Memory
HIP_API_CALL(hipFree(d_A));
HIP_API_CALL(hipFree(d_B));
HIP_API_CALL(hipFree(d_Out));
}
#define DEVICE_ID 0
int
main(int /*argc*/, char** /*argv*/)
{
int deviceId = DEVICE_ID;
auto status = hipSetDevice(deviceId);
assert(status == hipSuccess);
HIP_API_CALL(status);
int currDeviceId = -1;
status = hipGetDevice(&currDeviceId);
HIP_API_CALL(status);
assert(status == hipSuccess);
assert(deviceId == currDeviceId);
for(int i = 0; i < 1; i++)
{
std::cout << "<<< MatMul starts" << std::endl;
run_hip_app();
std::cout << ">>> MatMul ends" << std::endl;
}
return 0;
}
namespace
{
void
check_hip_error(void)
{
hipError_t err = hipGetLastError();
if(err != hipSuccess)
{
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
throw std::runtime_error("hip_api_call");
}
}
} // namespace