6d9d880d31
* Added laplace solver in samples * Add laplace eqn in CMake * Added counter accuracy test * Add iteration CLI arg for laplace eq * Unnest profile method * Missing counter warning * Updated insufficient kernel warning * Added reference for laplace equation * variable name change * Added comments for data comparison * Included scipy as test requirement * Added line number for ref * split stochastic and deterministic tests * Added order cli option for laplace_eqn * Install laplace eqn * Missing counter warning * Warn about missing kernels during analysis * Update tests * Split iteration multiplexing ctests * Updated warning * Incorporated copilot's suggestions
217 line
7.7 KiB
Plaintext
217 line
7.7 KiB
Plaintext
// MIT License
|
|
//
|
|
// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
// Reference: Finite Difference Methods for Ordinary and Partial Differential
|
|
// Equations by Randall J. LeVeque, Page 69-71
|
|
|
|
#include <algorithm>
|
|
#include <hip/hip_runtime.h>
|
|
#include <iostream>
|
|
#include <vector>
|
|
|
|
// Helper macro for HIP error checking
|
|
#define HIP_CHECK(call) \
|
|
do { \
|
|
hipError_t err = call; \
|
|
if (err != hipSuccess) { \
|
|
std::cerr << "HIP error: " << hipGetErrorString(err) << " at " \
|
|
<< __FILE__ << ":" << __LINE__ << std::endl; \
|
|
std::exit(EXIT_FAILURE); \
|
|
} \
|
|
} while (0)
|
|
|
|
void initialize(float *U, int N) {
|
|
for (int i = 0; i < N; ++i) {
|
|
for (int j = 0; j < N; ++j) {
|
|
float x = 2.0f * (float)i / (float)N - 1.0f;
|
|
float y = 2.0f * (float)j / (float)N - 1.0f;
|
|
U[i * N + j] = 1.0f / (1.0f + x * x + y * y); // Initial guess
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void jacobi_iteration(float *__restrict__ U_new,
|
|
const float *__restrict__ U_old, int N) {
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int j = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
int idx = j + i * N; // Flattened index for 2D grid
|
|
|
|
extern __shared__ float U[];
|
|
int s_i = threadIdx.x + 1;
|
|
int s_j = threadIdx.y + 1;
|
|
|
|
int tile_height = blockDim.y + 2;
|
|
int s_idx = s_j + s_i * tile_height;
|
|
|
|
// Load data into shared memory with halo regions
|
|
if (i < N && j < N) {
|
|
U[s_idx] = U_old[idx];
|
|
|
|
// Load halo regions
|
|
if (threadIdx.x == 0) {
|
|
U[s_j] = (i > 0) ? U_old[(i - 1) * N + j] : U_old[(N - 1) * N + j];
|
|
}
|
|
|
|
if (threadIdx.x == blockDim.x - 1) {
|
|
U[s_j + (blockDim.x + 1) * tile_height] =
|
|
(i < N - 1) ? U_old[(i + 1) * N + j] : U_old[j];
|
|
}
|
|
|
|
if (threadIdx.y == 0) {
|
|
U[s_i * tile_height] =
|
|
(j > 0) ? U_old[i * N + (j - 1)] : U_old[i * N + (N - 1)];
|
|
}
|
|
|
|
if (threadIdx.y == blockDim.y - 1) {
|
|
U[s_i * tile_height + blockDim.y + 1] =
|
|
(j < N - 1) ? U_old[i * N + (j + 1)] : U_old[i * N];
|
|
}
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
if (i < N && j < N) {
|
|
int east = s_idx - tile_height;
|
|
int west = s_idx + tile_height;
|
|
int north = s_idx + 1;
|
|
int south = s_idx - 1;
|
|
U_new[idx] = 0.25f * (U[east] + U[west] + U[north] + U[south]);
|
|
}
|
|
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
int iter = 1000;
|
|
std::vector<int> launch_order;
|
|
|
|
for (int i = 0; i < argc; i++) {
|
|
if (std::string(argv[i]) == "-i" && i + 1 < argc) {
|
|
iter = std::atoi(argv[i + 1]);
|
|
} else if (std::string(argv[i]) == "-o" && i + 1 < argc) {
|
|
int n_order = std::atoi(argv[i + 1]);
|
|
if ((i + n_order + 1) >= argc) {
|
|
std::cerr << "Insufficient arguments for -o option" << std::endl;
|
|
return EXIT_FAILURE;
|
|
} else {
|
|
for (int j = 0; j < n_order; j++) {
|
|
int kernel_id = std::atoi(argv[i + 2 + j]);
|
|
if (kernel_id < 0 || kernel_id > 2) {
|
|
std::cerr << "Invalid kernel ID: " << kernel_id << std::endl;
|
|
return EXIT_FAILURE;
|
|
}
|
|
launch_order.push_back(kernel_id);
|
|
}
|
|
i += n_order;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (launch_order.empty()) {
|
|
launch_order = {0, 1, 2};
|
|
}
|
|
|
|
// Problem size
|
|
const int n_small = 1024;
|
|
const int n_mid = 2048;
|
|
const int n_large = 4096;
|
|
|
|
// Define block and grid sizes
|
|
dim3 blockSize_small(16, 16);
|
|
dim3 gridSize_small((n_small + blockSize_small.x - 1) / blockSize_small.x,
|
|
(n_small + blockSize_small.y - 1) / blockSize_small.y);
|
|
dim3 blockSize_mid(24, 24);
|
|
dim3 gridSize_mid((n_mid + blockSize_mid.x - 1) / blockSize_mid.x,
|
|
(n_mid + blockSize_mid.y - 1) / blockSize_mid.y);
|
|
dim3 blockSize_large(32, 32);
|
|
dim3 gridSize_large((n_large + blockSize_large.x - 1) / blockSize_large.x,
|
|
(n_large + blockSize_large.y - 1) / blockSize_large.y);
|
|
|
|
// Host memory pointers
|
|
float *h_U_small = new float[n_small * n_small];
|
|
float *h_U_mid = new float[n_mid * n_mid];
|
|
float *h_U_large = new float[n_large * n_large];
|
|
|
|
// Initialize host arrays
|
|
initialize(h_U_small, n_small);
|
|
initialize(h_U_mid, n_mid);
|
|
initialize(h_U_large, n_large);
|
|
|
|
// Device memory pointers
|
|
float *d_U_small_new, *d_U_small_old;
|
|
float *d_U_mid_new, *d_U_mid_old;
|
|
float *d_U_large_new, *d_U_large_old;
|
|
|
|
// Allocate device memory
|
|
HIP_CHECK(hipMalloc(&d_U_small_new, n_small * n_small * sizeof(float)));
|
|
HIP_CHECK(hipMalloc(&d_U_small_old, n_small * n_small * sizeof(float)));
|
|
|
|
HIP_CHECK(hipMalloc(&d_U_mid_new, n_mid * n_mid * sizeof(float)));
|
|
HIP_CHECK(hipMalloc(&d_U_mid_old, n_mid * n_mid * sizeof(float)));
|
|
|
|
HIP_CHECK(hipMalloc(&d_U_large_new, n_large * n_large * sizeof(float)));
|
|
HIP_CHECK(hipMalloc(&d_U_large_old, n_large * n_large * sizeof(float)));
|
|
|
|
// Copy host arrays to device
|
|
HIP_CHECK(hipMemcpy(d_U_small_old, h_U_small,
|
|
n_small * n_small * sizeof(float),
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipMemcpy(d_U_mid_old, h_U_mid, n_mid * n_mid * sizeof(float),
|
|
hipMemcpyHostToDevice));
|
|
HIP_CHECK(hipMemcpy(d_U_large_old, h_U_large,
|
|
n_large * n_large * sizeof(float),
|
|
hipMemcpyHostToDevice));
|
|
|
|
std::vector<dim3> grid_sizes = {gridSize_small, gridSize_mid, gridSize_large};
|
|
std::vector<dim3> block_sizes = {blockSize_small, blockSize_mid,
|
|
blockSize_large};
|
|
std::vector<float *> d_U_news = {d_U_small_new, d_U_mid_new, d_U_large_new};
|
|
std::vector<float *> d_U_olds = {d_U_small_old, d_U_mid_old, d_U_large_old};
|
|
std::vector<int> n_sizes = {n_small, n_mid, n_large};
|
|
|
|
// Perform Jacobi iterations
|
|
for (int it = 0; it < iter; ++it) {
|
|
int i = launch_order[it % launch_order.size()];
|
|
jacobi_iteration<<<grid_sizes[i], block_sizes[i],
|
|
(block_sizes[i].x + 2) * (block_sizes[i].y + 2) *
|
|
sizeof(float)>>>(d_U_news[i], d_U_olds[i],
|
|
n_sizes[i]);
|
|
|
|
HIP_CHECK(hipDeviceSynchronize());
|
|
|
|
// Swap pointers
|
|
std::swap(d_U_news[i], d_U_olds[i]);
|
|
}
|
|
|
|
// Free device memory
|
|
HIP_CHECK(hipFree(d_U_small_new));
|
|
HIP_CHECK(hipFree(d_U_small_old));
|
|
HIP_CHECK(hipFree(d_U_mid_new));
|
|
HIP_CHECK(hipFree(d_U_mid_old));
|
|
HIP_CHECK(hipFree(d_U_large_new));
|
|
HIP_CHECK(hipFree(d_U_large_old));
|
|
|
|
delete[] h_U_small;
|
|
delete[] h_U_mid;
|
|
delete[] h_U_large;
|
|
|
|
return 0;
|
|
} |