SWDEV-546217 Complete hip-test Port to Catch2 Framework [Stream and Compute Folder] (#559)
* SWDEV-546498 hipPerfDeviceConcurrency
* SWDEV-546500 hipPerfStreamConcurrency
* SWDEV-546502 hipPerfStreamCreateCopyDestroy.c
* SWDEV-546479 hipPerfDotProduct
* SWDEV-546482 hipPerfMandelbrot
[ROCm/hip-tests commit: 9fdc9a98b7]
Этот коммит содержится в:
@@ -18,10 +18,10 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup hipPerfDotProduct hipPerfDotProduct
|
||||
* @{
|
||||
* @ingroup perfComputeTest
|
||||
*/
|
||||
* @addtogroup hipPerfDotProduct hipPerfDotProduct
|
||||
* @{
|
||||
* @ingroup perfComputeTest
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <vector>
|
||||
@@ -31,11 +31,9 @@
|
||||
using namespace std;
|
||||
|
||||
template <unsigned int BLOCKSIZE>
|
||||
__launch_bounds__(BLOCKSIZE)
|
||||
__global__ void vectors_not_equal(int n,
|
||||
const double* __restrict__ x,
|
||||
const double* __restrict__ y,
|
||||
double* __restrict__ workspace) {
|
||||
__launch_bounds__(BLOCKSIZE) __global__
|
||||
void vectors_not_equal(int n, const double* __restrict__ x, const double* __restrict__ y,
|
||||
double* __restrict__ workspace) {
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
double sum = 0.0;
|
||||
@@ -93,9 +91,8 @@ __global__ void vectors_not_equal(int n,
|
||||
}
|
||||
|
||||
template <unsigned int BLOCKSIZE>
|
||||
__launch_bounds__(BLOCKSIZE)
|
||||
__global__ void vectors_equal(int n, const double* __restrict__ x,
|
||||
double* __restrict__ workspace) {
|
||||
__launch_bounds__(BLOCKSIZE) __global__
|
||||
void vectors_equal(int n, const double* __restrict__ x, double* __restrict__ workspace) {
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
double sum = 0.0;
|
||||
@@ -129,7 +126,7 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 8) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -149,12 +146,11 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
workspace[blockIdx.x] = sdata[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int BLOCKSIZE>
|
||||
__launch_bounds__(BLOCKSIZE)
|
||||
__global__ void dot_reduction(double* __restrict__ workspace) {
|
||||
__launch_bounds__(BLOCKSIZE) __global__ void dot_reduction(double* __restrict__ workspace) {
|
||||
__shared__ double sdata[BLOCKSIZE];
|
||||
|
||||
sdata[threadIdx.x] = workspace[threadIdx.x];
|
||||
@@ -187,7 +183,8 @@ __global__ void dot_reduction(double* __restrict__ workspace) {
|
||||
|
||||
if (threadIdx.x < 4) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
|
||||
} __syncthreads();
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 2) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
|
||||
@@ -203,8 +200,7 @@ __global__ void dot_reduction(double* __restrict__ workspace) {
|
||||
}
|
||||
}
|
||||
|
||||
void computeDotProduct(int n, const double* x, const double* y, double& result,
|
||||
double* workspace) {
|
||||
void computeDotProduct(int n, const double* x, const double* y, double& result, double* workspace) {
|
||||
dim3 blocks(DOT_DIM);
|
||||
dim3 threadsPerBlock(DOT_DIM);
|
||||
|
||||
@@ -225,16 +221,16 @@ void computeDotProduct(int n, const double* x, const double* y, double& result,
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the device kernel results comparing it with the host results.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/compute/hipPerfDotProduct.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the device kernel results comparing it with the host results.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/compute/hipPerfDotProduct.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Perf_hipPerfDotProduct") {
|
||||
int nGpu = 0;
|
||||
@@ -252,120 +248,120 @@ TEST_CASE("Perf_hipPerfDotProduct") {
|
||||
for (unsigned int testCase = 0; testCase < 3; testCase++) {
|
||||
vector<int> vectorSize = {200, 300, 50};
|
||||
switch (testCase) {
|
||||
case 0:
|
||||
nx = vectorSize[0];
|
||||
ny = vectorSize[0];
|
||||
nz = vectorSize[0];
|
||||
break;
|
||||
case 0:
|
||||
nx = vectorSize[0];
|
||||
ny = vectorSize[0];
|
||||
nz = vectorSize[0];
|
||||
break;
|
||||
|
||||
case 1:
|
||||
nx = vectorSize[1];
|
||||
ny = vectorSize[1];
|
||||
nz = vectorSize[1];
|
||||
break;
|
||||
case 1:
|
||||
nx = vectorSize[1];
|
||||
ny = vectorSize[1];
|
||||
nz = vectorSize[1];
|
||||
break;
|
||||
|
||||
case 2:
|
||||
nx = vectorSize[0];
|
||||
ny = vectorSize[1];
|
||||
nz = vectorSize[2];
|
||||
break;
|
||||
case 2:
|
||||
nx = vectorSize[0];
|
||||
ny = vectorSize[1];
|
||||
nz = vectorSize[2];
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
int trials = 200;
|
||||
int size = nx * ny * nz;
|
||||
int trials = 200;
|
||||
int size = nx * ny * nz;
|
||||
|
||||
vector<double> hx(size);
|
||||
vector<double> hy(size);
|
||||
double hresult_xy = 0.0;
|
||||
double hresult_xx = 0.0;
|
||||
vector<double> hx(size);
|
||||
vector<double> hy(size);
|
||||
double hresult_xy = 0.0;
|
||||
double hresult_xx = 0.0;
|
||||
|
||||
srand(time(NULL));
|
||||
srand(time(NULL));
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
|
||||
hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
|
||||
hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
|
||||
|
||||
hresult_xy += hx[i] * hy[i];
|
||||
hresult_xx += hx[i] * hx[i];
|
||||
}
|
||||
hresult_xy += hx[i] * hy[i];
|
||||
hresult_xx += hx[i] * hx[i];
|
||||
}
|
||||
|
||||
double* dx;
|
||||
double* dy;
|
||||
double* workspace;
|
||||
double dresult;
|
||||
double* dx;
|
||||
double* dy;
|
||||
double* workspace;
|
||||
double dresult;
|
||||
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));
|
||||
|
||||
HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
|
||||
|
||||
// Warm up
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
|
||||
// Timed run for <x,y>
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (int i = 0; i < trials; ++i) {
|
||||
// Warm up
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
}
|
||||
|
||||
float time = 0;
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
time = all_kernel_time.count();
|
||||
// Timed run for <x,y>
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
time /= trials;
|
||||
for (int i = 0; i < trials; ++i) {
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
}
|
||||
|
||||
double bw = sizeof(double) * size * 2.0 / 1e9;
|
||||
double gf = 2.0 * size / 1e9;
|
||||
float time = 0;
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
time = all_kernel_time.count();
|
||||
|
||||
cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
|
||||
<< gf/(time / 1e3) << " GFlop/s" << endl;
|
||||
time /= trials;
|
||||
|
||||
// Verify the device kernel results comparing it with the host results
|
||||
REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8));
|
||||
double bw = sizeof(double) * size * 2.0 / 1e9;
|
||||
double gf = 2.0 * size / 1e9;
|
||||
|
||||
// Warm up
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
CONSOLE_PRINT("\nVector Size: %d\n[ddot] <x,y> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", size,
|
||||
time, bw / (time / 1e3), gf / (time / 1e3));
|
||||
|
||||
// Timed run for <x,x>
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
all_start = std::chrono::steady_clock::now();
|
||||
// Verify the device kernel results comparing it with the host results
|
||||
REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8));
|
||||
|
||||
for (int i = 0; i < trials; ++i) {
|
||||
// Warm up
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
}
|
||||
|
||||
all_end = std::chrono::steady_clock::now();
|
||||
all_kernel_time = all_end - all_start;
|
||||
time = all_kernel_time.count();
|
||||
// Timed run for <x,x>
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
all_start = std::chrono::steady_clock::now();
|
||||
|
||||
time /= trials;
|
||||
bw = sizeof(double) * size / 1e9;
|
||||
for (int i = 0; i < trials; ++i) {
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
}
|
||||
|
||||
cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
|
||||
<< gf/(time / 1e3) << " GFlop/s" << endl;
|
||||
all_end = std::chrono::steady_clock::now();
|
||||
all_kernel_time = all_end - all_start;
|
||||
time = all_kernel_time.count();
|
||||
|
||||
// Verify the device kernel results comparing it with the host results
|
||||
REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
|
||||
time /= trials;
|
||||
bw = sizeof(double) * size / 1e9;
|
||||
|
||||
HIP_CHECK(hipFree(dx));
|
||||
HIP_CHECK(hipFree(dy));
|
||||
HIP_CHECK(hipFree(workspace));
|
||||
CONSOLE_PRINT("[ddot] <x,y> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", time, bw / (time / 1e3),
|
||||
gf / (time / 1e3));
|
||||
|
||||
// Verify the device kernel results comparing it with the host results
|
||||
REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
|
||||
|
||||
HIP_CHECK(hipFree(dx));
|
||||
HIP_CHECK(hipFree(dy));
|
||||
HIP_CHECK(hipFree(workspace));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* End doxygen group perfComputeTest.
|
||||
* @}
|
||||
*/
|
||||
* End doxygen group perfComputeTest.
|
||||
* @}
|
||||
*/
|
||||
|
||||
@@ -18,10 +18,10 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup hipPerfMandelbrot hipPerfMandelbrot
|
||||
* @{
|
||||
* @ingroup perfComputeTest
|
||||
*/
|
||||
* @addtogroup hipPerfMandelbrot hipPerfMandelbrot
|
||||
* @{
|
||||
* @ingroup perfComputeTest
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_vector_types.h>
|
||||
@@ -45,36 +45,35 @@ coordRec coords[] = {
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos,
|
||||
T xStep, T yStep, uint maxIter) {
|
||||
__global__ void float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = static_cast<float>(xPos + xStep*i);
|
||||
float y0 = static_cast<float>(yPos + yStep*j);
|
||||
float x0 = static_cast<float>(xPos + xStep * i);
|
||||
float y0 = static_cast<float>(yPos + yStep * j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
}
|
||||
out[tid] = iter;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
|
||||
__global__ void float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = static_cast<float>(xPos + xStep*static_cast<float>(i));
|
||||
float y0 = static_cast<float>(yPos + yStep*static_cast<float>(j));
|
||||
float x0 = static_cast<float>(xPos + xStep * static_cast<float>(i));
|
||||
float y0 = static_cast<float>(yPos + yStep * static_cast<float>(j));
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
@@ -84,72 +83,71 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
float tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
float savx = x;
|
||||
float savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16) {
|
||||
for (iter = 0; (iter < maxIter); iter += 16) {
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16) {
|
||||
for (iter = 0; stay && (iter < maxIter); iter += 16) {
|
||||
#endif
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
ccount += stay * 16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
if (!stay) break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
@@ -158,10 +156,10 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
@@ -172,36 +170,36 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
__global__ void double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = static_cast<double>(xPos + xStep*i);
|
||||
double y0 = static_cast<double>(yPos + yStep*j);
|
||||
double x0 = static_cast<double>(xPos + xStep * i);
|
||||
double y0 = static_cast<double>(yPos + yStep * j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y, y,fma(x, x, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
}
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
__global__ void double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = static_cast<double>(xPos + xStep*static_cast<double>(i));
|
||||
double y0 = static_cast<double>(yPos + yStep*static_cast<double>(j));
|
||||
double x0 = static_cast<double>(xPos + xStep * static_cast<double>(i));
|
||||
double y0 = static_cast<double>(yPos + yStep * static_cast<double>(j));
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
@@ -211,13 +209,13 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
double tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
double savx = x;
|
||||
double savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16)
|
||||
for (iter = 0; (iter < maxIter); iter += 16)
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16)
|
||||
for (iter = 0; stay && (iter < maxIter); iter += 16)
|
||||
#endif
|
||||
{
|
||||
x = savx;
|
||||
@@ -225,141 +223,131 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
tmp = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * x, y, y0);
|
||||
x = fma(-y, y, fma(tmp, tmp, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
stay = (x * x + y * y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
ccount += stay * 16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
if (!stay) break;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
// Handle remainder
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y,y, fma(x, x, x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
}
|
||||
while (stay && iter);
|
||||
}
|
||||
out[tid] = (uint)ccount;
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
} while (stay && iter);
|
||||
}
|
||||
out[tid] = (uint)ccount;
|
||||
};
|
||||
|
||||
// Expected results for each kernel run at each coord
|
||||
unsigned long long expectedIters[] = {
|
||||
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull,
|
||||
120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull,
|
||||
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull,
|
||||
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull,
|
||||
120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull,
|
||||
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
|
||||
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull,
|
||||
203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull,
|
||||
203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull,
|
||||
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull,
|
||||
203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
|
||||
|
||||
class hipPerfMandelBrot {
|
||||
public:
|
||||
hipPerfMandelBrot();
|
||||
~hipPerfMandelBrot();
|
||||
|
||||
void setNumKernels(unsigned int num) {
|
||||
numKernels = num;
|
||||
}
|
||||
void setNumKernels(unsigned int num) { numKernels = num; }
|
||||
|
||||
unsigned int getNumKernels() {
|
||||
return numKernels;
|
||||
}
|
||||
unsigned int getNumKernels() { return numKernels; }
|
||||
|
||||
void setNumStreams(unsigned int num) {
|
||||
numStreams = num;
|
||||
}
|
||||
unsigned int getNumStreams() {
|
||||
return numStreams;
|
||||
}
|
||||
void setNumStreams(unsigned int num) { numStreams = num; }
|
||||
unsigned int getNumStreams() { return numStreams; }
|
||||
|
||||
void open(int deviceID);
|
||||
bool run(unsigned int testCase);
|
||||
void printResults(void);
|
||||
|
||||
// array of funtion pointers
|
||||
typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
typedef void (hipPerfMandelBrot::*funPtr)(uint* out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter,
|
||||
hipStream_t* streams, int blocks, int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
// Wrappers
|
||||
void float_mad(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
void float_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
|
||||
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
void float_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
|
||||
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
void double_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
|
||||
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
void double_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
|
||||
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
|
||||
int kernelCnt);
|
||||
|
||||
hipStream_t streams[2];
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
void setData(void* ptr, unsigned int value);
|
||||
void checkData(uint* ptr);
|
||||
|
||||
unsigned int numKernels;
|
||||
unsigned int numStreams;
|
||||
@@ -387,9 +375,9 @@ void hipPerfMandelBrot::open(int deviceId) {
|
||||
HIP_CHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
|
||||
<< std::endl;
|
||||
|
||||
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
|
||||
props.name, props.multiProcessorCount, deviceId);
|
||||
|
||||
numCUs = props.multiProcessorCount;
|
||||
}
|
||||
@@ -397,52 +385,52 @@ void hipPerfMandelBrot::open(int deviceId) {
|
||||
void hipPerfMandelBrot::printResults() {
|
||||
int numStreams = getNumStreams();
|
||||
|
||||
std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
|
||||
<< numStreams << " streams (s)" << std::endl;
|
||||
CONSOLE_PRINT("Measured perf for kernels in GFLOPS on %d streams (s)", numStreams);
|
||||
|
||||
std::map<std::string, std::vector<double>>:: iterator itr;
|
||||
std::map<std::string, std::vector<double>>::iterator itr;
|
||||
for (itr = results.begin(); itr != results.end(); itr++) {
|
||||
std::cout << "\n" << std::setw(20) << itr->first << " ";
|
||||
for (auto i : results[itr->first]) {
|
||||
std::cout << std::setw(10) << i << " ";
|
||||
}
|
||||
}
|
||||
CONSOLE_PRINT("\n%s ", itr->first.c_str());
|
||||
for (auto i : results[itr->first]) {
|
||||
CONSOLE_PRINT("%10f ", i);
|
||||
}
|
||||
}
|
||||
results.clear();
|
||||
std::cout << std::endl;
|
||||
CONSOLE_PRINT("\n");
|
||||
}
|
||||
|
||||
// Wrappers for the kernel launches
|
||||
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
void hipPerfMandelBrot::float_mad(uint* out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt) {
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep,
|
||||
maxIter);
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
|
||||
}
|
||||
|
||||
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
void hipPerfMandelBrot::float_mandel_unroll(uint* out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter,
|
||||
hipStream_t* streams, int blocks, int threads_per_block,
|
||||
int kernelCnt) {
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
|
||||
}
|
||||
|
||||
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
void hipPerfMandelBrot::double_mad(uint* out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt) {
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
|
||||
}
|
||||
|
||||
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
void hipPerfMandelBrot::double_mandel_unroll(uint* out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter,
|
||||
hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt) {
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
|
||||
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
|
||||
}
|
||||
|
||||
bool hipPerfMandelBrot::run(unsigned int testCase) {
|
||||
@@ -450,18 +438,18 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
|
||||
coordIdx = testCase % numCoords;
|
||||
|
||||
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
|
||||
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
|
||||
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
|
||||
|
||||
// Maximum iteration count
|
||||
maxIter = 32768;
|
||||
|
||||
uint ** hPtr = new uint *[numKernels];
|
||||
uint ** dPtr = new uint *[numKernels];
|
||||
uint** hPtr = new uint*[numKernels];
|
||||
uint** dPtr = new uint*[numKernels];
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
for (uint i = 0; i < numStreams; i++) {
|
||||
@@ -470,15 +458,15 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
|
||||
|
||||
// Allocate memory on the host and device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]), bufSize, hipHostMallocDefault));
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))
|
||||
}
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
int threads = (bufSize/sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
int threads = (bufSize / sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads / threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
@@ -489,90 +477,88 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
|
||||
HIP_CHECK(hipStreamSynchronize(0));
|
||||
|
||||
int kernelIdx;
|
||||
if(testCase == 0 || testCase == 5 || testCase == 10) {
|
||||
if (testCase == 0 || testCase == 5 || testCase == 10) {
|
||||
kernelIdx = 0;
|
||||
} else if(testCase == 1 || testCase == 6 || testCase == 11) {
|
||||
} else if (testCase == 1 || testCase == 6 || testCase == 11) {
|
||||
kernelIdx = 1;
|
||||
} else if(testCase == 2 || testCase == 7 || testCase == 12) {
|
||||
} else if (testCase == 2 || testCase == 7 || testCase == 12) {
|
||||
kernelIdx = 2;
|
||||
} else if(testCase == 3 || testCase == 8 || testCase == 13){
|
||||
} else if (testCase == 3 || testCase == 8 || testCase == 13) {
|
||||
kernelIdx = 3;
|
||||
}
|
||||
double totalTime = 0.0;
|
||||
for (unsigned int k = 0; k < numLoops; k++) {
|
||||
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
|
||||
testCase == 5 || testCase == 6 || testCase == 7 ||
|
||||
testCase == 10 || testCase == 11 || testCase == 12)) {
|
||||
float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
|
||||
float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
|
||||
float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
if ((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 ||
|
||||
testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) {
|
||||
float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
|
||||
float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
|
||||
float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIP_CHECK(hipStreamSynchronize(0));
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIP_CHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
} else {
|
||||
double xStep = coords[coordIdx].width / static_cast<double>(width_);
|
||||
double yStep = -coords[coordIdx].width / static_cast<double>(width_);
|
||||
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
|
||||
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
} else {
|
||||
double xStep = coords[coordIdx].width / static_cast<double>(width_);
|
||||
double yStep = -coords[coordIdx].width / static_cast<double>(width_);
|
||||
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
|
||||
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIP_CHECK(hipStreamSynchronize(0));
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIP_CHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
}
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
}
|
||||
}
|
||||
|
||||
// Copy data back from device to the host
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
HIP_CHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
checkData(hPtr[i]);
|
||||
int j =0;
|
||||
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
|
||||
j++;
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
checkData(hPtr[i]);
|
||||
int j = 0;
|
||||
while ((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
|
||||
j++;
|
||||
}
|
||||
|
||||
if(j==30) {
|
||||
std::cout << "Incorrect iteration count detected. ";
|
||||
}
|
||||
if (j == 30) {
|
||||
CONSOLE_PRINT("Incorrect iteration count detected. ");
|
||||
}
|
||||
}
|
||||
|
||||
// Compute GFLOPS. There are 7 FLOPs per iteration
|
||||
double perf = (static_cast<double>(totalIters*numKernels) * 7 * static_cast<double>(1e-09)) /
|
||||
(totalTime / (double)numLoops);
|
||||
double perf = (static_cast<double>(totalIters * numKernels) * 7 * static_cast<double>(1e-09)) /
|
||||
(totalTime / (double)numLoops);
|
||||
|
||||
|
||||
std::vector<std::string> kernelName = {"float", "float_unroll",
|
||||
"double", "double_unroll"};
|
||||
std::vector<std::string> kernelName = {"float", "float_unroll", "double", "double_unroll"};
|
||||
|
||||
// Print results except for Warm-up kernel
|
||||
if (testCase != 100) {
|
||||
results[kernelName[testCase % 4]].push_back(perf);
|
||||
}
|
||||
results[kernelName[testCase % 4]].push_back(perf);
|
||||
}
|
||||
|
||||
for(uint i = 0 ; i < numStreams; i++) {
|
||||
for (uint i = 0; i < numStreams; i++) {
|
||||
HIP_CHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
|
||||
@@ -581,19 +567,19 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
|
||||
HIP_CHECK(hipHostFree(hPtr[i]));
|
||||
HIP_CHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
delete [] hPtr;
|
||||
delete [] dPtr;
|
||||
delete[] hPtr;
|
||||
delete[] dPtr;
|
||||
return true;
|
||||
}
|
||||
|
||||
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
void hipPerfMandelBrot::setData(void* ptr, unsigned int value) {
|
||||
unsigned int* ptr2 = (unsigned int*)ptr;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
ptr2[i] = value;
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void hipPerfMandelBrot::checkData(uint *ptr) {
|
||||
void hipPerfMandelBrot::checkData(uint* ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
@@ -601,30 +587,30 @@ void hipPerfMandelBrot::checkData(uint *ptr) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the warm-up kernel default stream executes serially.
|
||||
* - verify by running all kernels - sync.
|
||||
* - verify by running all kernels - async.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/compute/hipPerfMandelbrot.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the warm-up kernel default stream executes serially.
|
||||
* - verify by running all kernels - sync.
|
||||
* - verify by running all kernels - async.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/compute/hipPerfMandelbrot.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Perf_hipPerfMandelbrot") {
|
||||
hipPerfMandelBrot mandelbrotCompute;
|
||||
int deviceId = 0;
|
||||
mandelbrotCompute.open(deviceId);
|
||||
#if HT_AMD
|
||||
#if HT_AMD
|
||||
SECTION("warm-up kernel default stream executes serially") {
|
||||
mandelbrotCompute.setNumStreams(1);
|
||||
mandelbrotCompute.setNumKernels(1);
|
||||
REQUIRE(true == mandelbrotCompute.run(100/*Random number*/));
|
||||
REQUIRE(true == mandelbrotCompute.run(100 /*Random number*/));
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
SECTION("run all - sync") {
|
||||
int i = 0;
|
||||
do {
|
||||
@@ -632,7 +618,7 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
|
||||
mandelbrotCompute.setNumKernels(1);
|
||||
REQUIRE(true == mandelbrotCompute.run(i));
|
||||
i++;
|
||||
}while(i < 12);
|
||||
} while (i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
}
|
||||
|
||||
@@ -643,12 +629,12 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
|
||||
mandelbrotCompute.setNumKernels(2);
|
||||
REQUIRE(true == mandelbrotCompute.run(i));
|
||||
i++;
|
||||
}while(i < 12);
|
||||
} while (i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* End doxygen group perfComputeTest.
|
||||
* @}
|
||||
*/
|
||||
* End doxygen group perfComputeTest.
|
||||
* @}
|
||||
*/
|
||||
|
||||
@@ -18,12 +18,12 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
|
||||
* @{
|
||||
* @ingroup perfStreamTest
|
||||
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
|
||||
* Create an asynchronous stream.
|
||||
*/
|
||||
* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
|
||||
* @{
|
||||
* @ingroup perfStreamTest
|
||||
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
|
||||
* Create an asynchronous stream.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
@@ -34,28 +34,28 @@ typedef struct {
|
||||
} coordRec;
|
||||
|
||||
static coordRec coords[] = {
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
};
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
__global__ void mandelbrot(uint *out, uint width, float xPos,
|
||||
float yPos, float xStep, float yStep, uint maxIter) {
|
||||
__global__ void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
|
||||
uint maxIter) {
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = static_cast<float>(xPos + xStep*i);
|
||||
float y0 = static_cast<float>(yPos + yStep*j);
|
||||
float x0 = static_cast<float>(xPos + xStep * i);
|
||||
float y0 = static_cast<float>(yPos + yStep * j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y, y, fma(x, x, x0));
|
||||
y = fma(2.0f*tmp, y, y0);
|
||||
y = fma(2.0f * tmp, y, y0);
|
||||
}
|
||||
out[tid] = iter;
|
||||
};
|
||||
@@ -65,20 +65,16 @@ class hipPerfDeviceConcurrency {
|
||||
hipPerfDeviceConcurrency();
|
||||
~hipPerfDeviceConcurrency();
|
||||
|
||||
void setNumGpus(unsigned int num) {
|
||||
numDevices = num;
|
||||
}
|
||||
unsigned int getNumGpus() {
|
||||
return numDevices;
|
||||
}
|
||||
void setNumGpus(unsigned int num) { numDevices = num; }
|
||||
unsigned int getNumGpus() { return numDevices; }
|
||||
|
||||
void open(void);
|
||||
void close(void);
|
||||
bool run(unsigned int testCase, int numGpus);
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
void setData(void* ptr, unsigned int value);
|
||||
void checkData(uint* ptr);
|
||||
|
||||
unsigned int numDevices;
|
||||
unsigned int width_;
|
||||
@@ -100,17 +96,16 @@ void hipPerfDeviceConcurrency::open(void) {
|
||||
}
|
||||
}
|
||||
|
||||
void hipPerfDeviceConcurrency::close() {
|
||||
}
|
||||
void hipPerfDeviceConcurrency::close() {}
|
||||
|
||||
bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
static int deviceId;
|
||||
uint ** hPtr = new uint*[numGpus];
|
||||
uint ** dPtr = new uint*[numGpus];
|
||||
hipStream_t * streams = new hipStream_t[numGpus];
|
||||
int *numCUs = new int[numGpus];
|
||||
unsigned int *maxIter = new unsigned int[numGpus];
|
||||
unsigned long long *expectedIters = new unsigned long long[numGpus];
|
||||
uint** hPtr = new uint*[numGpus];
|
||||
uint** dPtr = new uint*[numGpus];
|
||||
hipStream_t* streams = new hipStream_t[numGpus];
|
||||
int* numCUs = new int[numGpus];
|
||||
unsigned int* maxIter = new unsigned int[numGpus];
|
||||
unsigned long long* expectedIters = new unsigned long long[numGpus];
|
||||
|
||||
int threads, threads_per_block, blocks;
|
||||
float xStep, yStep, xPos, yPos;
|
||||
@@ -124,25 +119,21 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, i));
|
||||
if (testCase != 0) {
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID
|
||||
<< " " << props.name << " with " << props.multiProcessorCount
|
||||
<< " CUs" << " and device ID: " << i << std::endl;
|
||||
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
|
||||
props.name, props.multiProcessorCount, i);
|
||||
}
|
||||
|
||||
numCUs[i] = props.multiProcessorCount;
|
||||
int clkFrequency = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
|
||||
hipDeviceAttributeClockRate, i));
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
|
||||
if (clkFrequency == 0) {
|
||||
std::cout << "clkFrequency = 0, set it to 1000000\n";
|
||||
CONSOLE_PRINT("clkFrequency = 0, set it to 1000000");
|
||||
clkFrequency = 1000000;
|
||||
}
|
||||
clkFrequency =(unsigned int)clkFrequency/1000;
|
||||
clkFrequency = (unsigned int)clkFrequency / 1000;
|
||||
|
||||
// Maximum iteration count
|
||||
// maxIter = 8388608 * (engine_clock / 1000).serial execution
|
||||
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000))
|
||||
* numCUs[i]) / 128);
|
||||
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
|
||||
maxIter[i] = (maxIter[i] + 15) & ~15;
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot
|
||||
@@ -153,15 +144,14 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
HIP_CHECK(hipStreamCreate(&streams[i]));
|
||||
|
||||
// Allocate memory on the host and device
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
|
||||
bufSize, hipHostMallocDefault));
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
threads = (bufSize/sizeof(uint));
|
||||
threads_per_block = 64;
|
||||
blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
threads = (bufSize / sizeof(uint));
|
||||
threads_per_block = 64;
|
||||
blocks = (threads / threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
|
||||
@@ -180,10 +170,9 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
deviceId = i;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipSetDevice(deviceId));
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[i], dPtr[i], width_, xPos, yPos, xStep,
|
||||
yStep, maxIter[i]);
|
||||
HIP_CHECK(hipSetDevice(deviceId));
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], dPtr[i],
|
||||
width_, xPos, yPos, xStep, yStep, maxIter[i]);
|
||||
}
|
||||
for (int i = 0; i < numGpus; i++) {
|
||||
HIP_CHECK(hipStreamSynchronize(0));
|
||||
@@ -192,8 +181,8 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
if(testCase != 0) {
|
||||
for (int i = 0; i < numGpus; i++) {
|
||||
if (testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(deviceId));
|
||||
@@ -201,11 +190,11 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
// Copy data back from device to the host
|
||||
HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
checkData(hPtr[i]);
|
||||
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
|
||||
expectedIters[i] = width_ * width_ * (unsigned long long)maxIter[i];
|
||||
if (testCase != 0) {
|
||||
checkData(hPtr[i]);
|
||||
if (totalIters != expectedIters[i]) {
|
||||
std::cout << "Incorrect iteration count detected" << std::endl;
|
||||
CONSOLE_PRINT("Incorrect iteration count detected");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -216,31 +205,30 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
}
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout << '\n' << "Measured time for kernel computation on " << numGpus
|
||||
<< " device (s): " << all_kernel_time.count() << " (s) "
|
||||
<< '\n' << std::endl;
|
||||
CONSOLE_PRINT("\nMeasured time for kernel computation on %d device(s): %.6f (s)\n", numGpus,
|
||||
all_kernel_time.count());
|
||||
}
|
||||
|
||||
if (testCase == 0) {
|
||||
deviceId++;
|
||||
}
|
||||
delete [] hPtr;
|
||||
delete [] dPtr;
|
||||
delete [] streams;
|
||||
delete [] numCUs;
|
||||
delete [] maxIter;
|
||||
delete [] expectedIters;
|
||||
delete[] hPtr;
|
||||
delete[] dPtr;
|
||||
delete[] streams;
|
||||
delete[] numCUs;
|
||||
delete[] maxIter;
|
||||
delete[] expectedIters;
|
||||
return true;
|
||||
}
|
||||
|
||||
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
for (unsigned int i = 0; i < width_ * width_ ; i++) {
|
||||
ptr2[i] = value;
|
||||
void hipPerfDeviceConcurrency::setData(void* ptr, unsigned int value) {
|
||||
unsigned int* ptr2 = (unsigned int*)ptr;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
|
||||
void hipPerfDeviceConcurrency::checkData(uint* ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
@@ -248,16 +236,16 @@ void hipPerfDeviceConcurrency::checkData(uint *ptr) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the different levels of device concurrency.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/stream/hipPerfDeviceConcurrency.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the different levels of device concurrency.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/stream/hipPerfDeviceConcurrency.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Perf_hipPerfDeviceConcurrency") {
|
||||
hipPerfDeviceConcurrency deviceConcurrency;
|
||||
@@ -279,6 +267,6 @@ TEST_CASE("Perf_hipPerfDeviceConcurrency") {
|
||||
}
|
||||
|
||||
/**
|
||||
* End doxygen group perfStreamTest.
|
||||
* @}
|
||||
*/
|
||||
* End doxygen group perfStreamTest.
|
||||
* @}
|
||||
*/
|
||||
|
||||
@@ -18,12 +18,12 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
|
||||
* @{
|
||||
* @ingroup perfComputeTest
|
||||
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
|
||||
* Create an asynchronous stream.
|
||||
*/
|
||||
* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
|
||||
* @{
|
||||
* @ingroup perfComputeTest
|
||||
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
|
||||
* Create an asynchronous stream.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_vector_types.h>
|
||||
@@ -55,23 +55,23 @@ static coordRec coords[] = {
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
__global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter) {
|
||||
__global__ static void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter) {
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % (width/4);
|
||||
int j = tid / (width/4);
|
||||
int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
|
||||
int i = tid % (width / 4);
|
||||
int j = tid / (width / 4);
|
||||
int4 veci = make_int4(4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3);
|
||||
int4 vecj = make_int4(j, j, j, j);
|
||||
float4 x0;
|
||||
x0.x = static_cast<float>(xPos + xStep*veci.x);
|
||||
x0.y = static_cast<float>(xPos + xStep*veci.y);
|
||||
x0.z = static_cast<float>(xPos + xStep*veci.z);
|
||||
x0.w = static_cast<float>(xPos + xStep*veci.w);
|
||||
x0.x = static_cast<float>(xPos + xStep * veci.x);
|
||||
x0.y = static_cast<float>(xPos + xStep * veci.y);
|
||||
x0.z = static_cast<float>(xPos + xStep * veci.z);
|
||||
x0.w = static_cast<float>(xPos + xStep * veci.w);
|
||||
float4 y0;
|
||||
y0.x = static_cast<float>(yPos + yStep*vecj.x);
|
||||
y0.y = static_cast<float>(yPos + yStep*vecj.y);
|
||||
y0.z = static_cast<float>(yPos + yStep*vecj.z);
|
||||
y0.w = static_cast<float>(yPos + yStep*vecj.w);
|
||||
y0.x = static_cast<float>(yPos + yStep * vecj.x);
|
||||
y0.y = static_cast<float>(yPos + yStep * vecj.y);
|
||||
y0.z = static_cast<float>(yPos + yStep * vecj.z);
|
||||
y0.w = static_cast<float>(yPos + yStep * vecj.w);
|
||||
float4 x = x0;
|
||||
float4 y = y0;
|
||||
uint iter = 0;
|
||||
@@ -80,53 +80,52 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
|
||||
int4 ccount = make_int4(0, 0, 0, 0);
|
||||
float4 savx = x;
|
||||
float4 savy = y;
|
||||
stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
|
||||
stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
|
||||
stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
|
||||
stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
|
||||
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
|
||||
iter+=16) {
|
||||
stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
|
||||
stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
|
||||
stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
|
||||
stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
|
||||
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); iter += 16) {
|
||||
x = savx;
|
||||
y = savy;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
tmp = x * x + x0 - y * y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
x = tmp * tmp + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
tmp = x * x + x0 - y * y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
x = tmp * tmp + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
tmp = x * x + x0 - y * y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
x = tmp * tmp + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
tmp = x * x + x0 - y * y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
x = tmp * tmp + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
tmp = x * x + x0 - y * y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
x = tmp * tmp + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
tmp = x * x + x0 - y * y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
x = tmp * tmp + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
tmp = x * x + x0 - y * y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
x = tmp * tmp + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
|
||||
stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
|
||||
stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
|
||||
stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
|
||||
stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
|
||||
stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
|
||||
stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
|
||||
stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
|
||||
savx.x = static_cast<bool>(stay.x ? x.x : savx.x);
|
||||
savx.y = static_cast<bool>(stay.y ? x.y : savx.y);
|
||||
savx.z = static_cast<bool>(stay.z ? x.z : savx.z);
|
||||
@@ -135,10 +134,10 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
|
||||
savy.y = static_cast<bool>(stay.y ? y.y : savy.y);
|
||||
savy.z = static_cast<bool>(stay.z ? y.z : savy.z);
|
||||
savy.w = static_cast<bool>(stay.w ? y.w : savy.w);
|
||||
ccount.x -= stay.x*16;
|
||||
ccount.y -= stay.y*16;
|
||||
ccount.z -= stay.z*16;
|
||||
ccount.w -= stay.w*16;
|
||||
ccount.x -= stay.x * 16;
|
||||
ccount.y -= stay.y * 16;
|
||||
ccount.z -= stay.z * 16;
|
||||
ccount.w -= stay.w * 16;
|
||||
}
|
||||
// Handle remainder
|
||||
if (!(stay.x & stay.y & stay.z & stay.w)) {
|
||||
@@ -146,13 +145,13 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter);
|
||||
stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter);
|
||||
stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter);
|
||||
stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter);
|
||||
stay.x = ((x.x * x.x + y.x * y.x) <= 4.0f) && (ccount.x < maxIter);
|
||||
stay.y = ((x.y * x.y + y.y * y.y) <= 4.0f) && (ccount.y < maxIter);
|
||||
stay.z = ((x.z * x.z + y.z * y.z) <= 4.0f) && (ccount.z < maxIter);
|
||||
stay.w = ((x.w * x.w + y.w * y.w) <= 4.0f) && (ccount.w < maxIter);
|
||||
tmp = x;
|
||||
x = x*x + x0 - y*y;
|
||||
y = 2.0f*tmp*y + y0;
|
||||
x = x * x + x0 - y * y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
ccount.x += stay.x;
|
||||
ccount.y += stay.y;
|
||||
ccount.z += stay.z;
|
||||
@@ -168,7 +167,7 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
|
||||
savy.w = (stay.w ? y.w : savy.w);
|
||||
} while ((stay.x | stay.y | stay.z | stay.w) && iter);
|
||||
}
|
||||
uint4 *vecOut = reinterpret_cast<uint4 *>(out);
|
||||
uint4* vecOut = reinterpret_cast<uint4*>(out);
|
||||
vecOut[tid].x = (uint)(ccount.x);
|
||||
vecOut[tid].y = (uint)(ccount.y);
|
||||
vecOut[tid].z = (uint)(ccount.z);
|
||||
@@ -180,27 +179,19 @@ class hipPerfStreamConcurrency {
|
||||
hipPerfStreamConcurrency();
|
||||
~hipPerfStreamConcurrency();
|
||||
|
||||
void setNumKernels(unsigned int num) {
|
||||
numKernels = num;
|
||||
}
|
||||
void setNumStreams(unsigned int num) {
|
||||
numStreams = num;
|
||||
}
|
||||
unsigned int getNumStreams() {
|
||||
return numStreams;
|
||||
}
|
||||
void setNumKernels(unsigned int num) { numKernels = num; }
|
||||
void setNumStreams(unsigned int num) { numStreams = num; }
|
||||
unsigned int getNumStreams() { return numStreams; }
|
||||
|
||||
unsigned int getNumKernels() {
|
||||
return numKernels;
|
||||
}
|
||||
unsigned int getNumKernels() { return numKernels; }
|
||||
|
||||
bool open(int deviceID);
|
||||
bool run(unsigned int testCase, unsigned int deviceId);
|
||||
void close(void);
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
void setData(void* ptr, unsigned int value);
|
||||
void checkData(uint* ptr);
|
||||
|
||||
unsigned int numKernels;
|
||||
unsigned int numStreams;
|
||||
@@ -227,38 +218,34 @@ bool hipPerfStreamConcurrency::open(int deviceId) {
|
||||
HIP_CHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID
|
||||
<< " " << props.name << " with " << props.multiProcessorCount << " CUs"
|
||||
<< " and device id: " << deviceId << std::endl;
|
||||
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
|
||||
props.name, props.multiProcessorCount, deviceId);
|
||||
|
||||
numCUs = props.multiProcessorCount;
|
||||
return true;
|
||||
}
|
||||
|
||||
void hipPerfStreamConcurrency::close() {
|
||||
}
|
||||
void hipPerfStreamConcurrency::close() {}
|
||||
|
||||
bool hipPerfStreamConcurrency::run(unsigned int testCase,
|
||||
unsigned int deviceId) {
|
||||
bool hipPerfStreamConcurrency::run(unsigned int testCase, unsigned int deviceId) {
|
||||
int clkFrequency = 0;
|
||||
unsigned int numStreams = getNumStreams();
|
||||
unsigned int numKernels = getNumKernels();
|
||||
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
|
||||
hipDeviceAttributeClockRate, deviceId));
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
|
||||
if (clkFrequency == 0) {
|
||||
std::cout << "clkFrequency = 0, set it to 1000000\n";
|
||||
CONSOLE_PRINT("clkFrequency = 0, set it to 1000000\n");
|
||||
clkFrequency = 1000000;
|
||||
}
|
||||
clkFrequency =(unsigned int)clkFrequency/1000;
|
||||
clkFrequency = (unsigned int)clkFrequency / 1000;
|
||||
|
||||
// Maximum iteration count
|
||||
// maxIter = 8388608 * (engine_clock / 1000).serial execution
|
||||
maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000))
|
||||
* numCUs) / 128);
|
||||
maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000)) * numCUs) / 128);
|
||||
maxIter = (maxIter + 15) & ~15;
|
||||
hipStream_t *streams = new hipStream_t[numStreams];
|
||||
uint ** hPtr = new uint*[numKernels];
|
||||
uint ** dPtr = new uint*[numKernels];
|
||||
hipStream_t* streams = new hipStream_t[numStreams];
|
||||
uint** hPtr = new uint*[numKernels];
|
||||
uint** dPtr = new uint*[numKernels];
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel
|
||||
// processes 4 pixels at once.
|
||||
@@ -271,16 +258,15 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
|
||||
|
||||
// Allocate memory on the host and device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
|
||||
bufSize, hipHostMallocDefault));
|
||||
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void **>(&dPtr[i]), bufSize))
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dPtr[i]), bufSize))
|
||||
}
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
int threads = (bufSize/sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
int threads = (bufSize / sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads / threads_per_block) + (threads % threads_per_block);
|
||||
coordIdx = testCase % numCoords;
|
||||
float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
|
||||
float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
|
||||
@@ -289,8 +275,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]),
|
||||
hPtr[i], bufSize, streams[i % numStreams]));
|
||||
HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize,
|
||||
streams[i % numStreams]));
|
||||
}
|
||||
|
||||
// Synchronize to make sure all the copies are completed
|
||||
@@ -305,9 +291,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block),
|
||||
0, streams[i%numStreams], dPtr[i], width_, xPos, yPos, xStep,
|
||||
yStep, maxIter);
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[i % numStreams], dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
}
|
||||
|
||||
// Synchronize all the concurrent streans to have completed execution
|
||||
@@ -320,17 +305,16 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
|
||||
|
||||
// Copy data back from device to the host
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i],
|
||||
reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
|
||||
streams[i % numStreams]));
|
||||
HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
|
||||
streams[i % numStreams]));
|
||||
}
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout <<"Measured time for " << numKernels <<" kernels (s) on "
|
||||
<< numStreams <<" stream (s): " << all_kernel_time.count() << std::endl;
|
||||
CONSOLE_PRINT("Measured time for %d kernels (s) on %d stream(s): %e\n", numKernels, numStreams,
|
||||
all_kernel_time.count());
|
||||
}
|
||||
|
||||
for (uint i = 0 ; i < numStreams; i++) {
|
||||
for (uint i = 0; i < numStreams; i++) {
|
||||
HIP_CHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
|
||||
@@ -340,20 +324,20 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
|
||||
HIP_CHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
delete [] streams;
|
||||
delete [] hPtr;
|
||||
delete [] dPtr;
|
||||
delete[] streams;
|
||||
delete[] hPtr;
|
||||
delete[] dPtr;
|
||||
return true;
|
||||
}
|
||||
|
||||
void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
for (unsigned int i = 0; i < width_ ; i++) {
|
||||
ptr2[i] = value;
|
||||
void hipPerfStreamConcurrency::setData(void* ptr, unsigned int value) {
|
||||
unsigned int* ptr2 = (unsigned int*)ptr;
|
||||
for (unsigned int i = 0; i < width_; i++) {
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void hipPerfStreamConcurrency::checkData(uint *ptr) {
|
||||
void hipPerfStreamConcurrency::checkData(uint* ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
@@ -361,16 +345,16 @@ void hipPerfStreamConcurrency::checkData(uint *ptr) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the different levels of stream concurrency.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/stream/hipPerfStreamConcurrency.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the different levels of stream concurrency.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/stream/hipPerfStreamConcurrency.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Perf_hipPerfStreamConcurrency") {
|
||||
hipPerfStreamConcurrency streamConcurrency;
|
||||
@@ -386,10 +370,10 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
|
||||
break;
|
||||
|
||||
case 1:
|
||||
// default stream executes serially
|
||||
streamConcurrency.setNumStreams(1);
|
||||
streamConcurrency.setNumKernels(1);
|
||||
break;
|
||||
// default stream executes serially
|
||||
streamConcurrency.setNumStreams(1);
|
||||
streamConcurrency.setNumKernels(1);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
// 2-way concurrency
|
||||
@@ -419,6 +403,6 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
|
||||
}
|
||||
|
||||
/**
|
||||
* End doxygen group perfComputeTest.
|
||||
* @}
|
||||
*/
|
||||
* End doxygen group perfComputeTest.
|
||||
* @}
|
||||
*/
|
||||
|
||||
+46
-46
@@ -18,19 +18,17 @@
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
|
||||
* @{
|
||||
* @ingroup perfStreamTest
|
||||
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
|
||||
* Create an asynchronous stream.
|
||||
*/
|
||||
* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
|
||||
* @{
|
||||
* @ingroup perfStreamTest
|
||||
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
|
||||
* Create an asynchronous stream.
|
||||
*/
|
||||
|
||||
#include <hip_test_kernels.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define BufSize 0x1000
|
||||
#define Iterations 0x100
|
||||
#define TotalStreams 4
|
||||
@@ -39,17 +37,20 @@ using namespace std;
|
||||
|
||||
class hipPerfStreamCreateCopyDestroy {
|
||||
private:
|
||||
unsigned int numBuffers_;
|
||||
unsigned int numStreams_;
|
||||
const size_t totalStreams_[TotalStreams];
|
||||
const size_t totalBuffers_[TotalBufs];
|
||||
unsigned int numBuffers_;
|
||||
unsigned int numStreams_;
|
||||
const size_t totalStreams_[TotalStreams];
|
||||
const size_t totalBuffers_[TotalBufs];
|
||||
|
||||
public:
|
||||
hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
|
||||
totalStreams_{1, 2, 4, 8},
|
||||
totalBuffers_{1, 100, 1000, 5000} {};
|
||||
~hipPerfStreamCreateCopyDestroy() {};
|
||||
bool open(int deviceID);
|
||||
bool run(unsigned int testNumber);
|
||||
hipPerfStreamCreateCopyDestroy()
|
||||
: numBuffers_(0),
|
||||
numStreams_(0),
|
||||
totalStreams_{1, 2, 4, 8},
|
||||
totalBuffers_{1, 100, 1000, 5000} {};
|
||||
~hipPerfStreamCreateCopyDestroy(){};
|
||||
bool open(int deviceID);
|
||||
bool run(unsigned int testNumber);
|
||||
};
|
||||
|
||||
bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
|
||||
@@ -61,20 +62,20 @@ bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
|
||||
HIP_CHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID
|
||||
<< " " << props.name << " with " << props.multiProcessorCount << " CUs"
|
||||
<< " and device id: " << deviceId << std::endl;
|
||||
|
||||
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
|
||||
props.name, props.multiProcessorCount, deviceId);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
|
||||
numStreams_ = totalStreams_[testNumber % TotalStreams];
|
||||
size_t iter = Iterations / (numStreams_ * (static_cast<size_t>(1)
|
||||
<< (testNumber / TotalBufs + 1)));
|
||||
hipStream_t *streams = new hipStream_t[numStreams_];
|
||||
size_t iter =
|
||||
Iterations / (numStreams_ * (static_cast<size_t>(1) << (testNumber / TotalBufs + 1)));
|
||||
hipStream_t* streams = new hipStream_t[numStreams_];
|
||||
|
||||
numBuffers_ = totalBuffers_[testNumber / TotalBufs];
|
||||
float ** dSrc = new float*[numBuffers_];
|
||||
float** dSrc = new float*[numBuffers_];
|
||||
size_t nBytes = BufSize * sizeof(float);
|
||||
|
||||
for (size_t b = 0; b < numBuffers_; ++b) {
|
||||
@@ -97,8 +98,7 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
|
||||
|
||||
for (size_t s = 0; s < numStreams_; ++s) {
|
||||
for (size_t b = 0; b < numBuffers_; ++b) {
|
||||
HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes,
|
||||
hipMemcpyHostToDevice, streams[s]));
|
||||
HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,31 +112,31 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
|
||||
|
||||
auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));
|
||||
|
||||
cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
|
||||
<< setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
|
||||
<< iter << " iterations " << time << " (ms) " << endl;
|
||||
CONSOLE_PRINT(
|
||||
"Create+Copy+Destroy time for %u streams and %u buffers and %zu iterations %.6f (ms)\n",
|
||||
numStreams_, numBuffers_, iter, time);
|
||||
|
||||
delete [] hSrc;
|
||||
delete[] hSrc;
|
||||
for (size_t b = 0; b < numBuffers_; ++b) {
|
||||
HIP_CHECK(hipFree(dSrc[b]));
|
||||
}
|
||||
|
||||
delete [] streams;
|
||||
delete [] dSrc;
|
||||
delete[] streams;
|
||||
delete[] dSrc;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the Create+Copy+Destroy time for different stream.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/stream/hipPerfDeviceConcurrency.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Verify the Create+Copy+Destroy time for different stream.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - perftests/stream/hipPerfDeviceConcurrency.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
|
||||
hipPerfStreamCreateCopyDestroy streamCCD;
|
||||
@@ -149,6 +149,6 @@ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
|
||||
}
|
||||
|
||||
/**
|
||||
* End doxygen group perfStreamTest.
|
||||
* @}
|
||||
*/
|
||||
* End doxygen group perfStreamTest.
|
||||
* @}
|
||||
*/
|
||||
|
||||
@@ -1,382 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
#include <vector>
|
||||
|
||||
#define DOT_DIM 256
|
||||
|
||||
using namespace std;
|
||||
|
||||
template <unsigned int BLOCKSIZE>
|
||||
__launch_bounds__(BLOCKSIZE)
|
||||
__global__ void vectors_not_equal(int n,
|
||||
const double* __restrict__ x,
|
||||
const double* __restrict__ y,
|
||||
double* __restrict__ workspace) {
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
double sum = 0.0;
|
||||
for(int idx = gid; idx < n; idx += hipGridDim_x * hipBlockDim_x) {
|
||||
sum = fma(y[idx], x[idx], sum);
|
||||
}
|
||||
|
||||
__shared__ double sdata[BLOCKSIZE];
|
||||
sdata[threadIdx.x] = sum;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 128) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 64){
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 32){
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 32];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 16) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 16];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 8) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 4) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 2) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 1) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 1];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
workspace[blockIdx.x] = sdata[0];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <unsigned int BLOCKSIZE>
|
||||
__launch_bounds__(BLOCKSIZE)
|
||||
__global__ void vectors_equal(int n, const double* __restrict__ x,
|
||||
double* __restrict__ workspace) {
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
double sum = 0.0;
|
||||
for(int idx = gid; idx < n; idx += hipGridDim_x * blockDim.x) {
|
||||
sum = fma(x[idx], x[idx], sum);
|
||||
}
|
||||
|
||||
__shared__ double sdata[BLOCKSIZE];
|
||||
sdata[threadIdx.x] = sum;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 128) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 64) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 32) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 32];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 16) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 16];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 8) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 4) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 2) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 1) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 1];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
workspace[blockIdx.x] = sdata[0];
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int BLOCKSIZE>
|
||||
__launch_bounds__(BLOCKSIZE)
|
||||
__global__ void dot_reduction(double* __restrict__ workspace) {
|
||||
|
||||
__shared__ double sdata[BLOCKSIZE];
|
||||
|
||||
sdata[threadIdx.x] = workspace[threadIdx.x];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 128) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 64) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 32) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 32];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 16) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 16];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 8) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 4) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
|
||||
} __syncthreads();
|
||||
|
||||
if(threadIdx.x < 2) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if(threadIdx.x < 1) {
|
||||
sdata[threadIdx.x] += sdata[threadIdx.x + 1];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0) {
|
||||
workspace[0] = sdata[0];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void computeDotProduct(int n, const double* x, const double* y, double& result,
|
||||
double* workspace)
|
||||
{
|
||||
dim3 blocks(DOT_DIM);
|
||||
dim3 threadsPerBlock(DOT_DIM);
|
||||
|
||||
if(x != y) {
|
||||
hipLaunchKernelGGL(vectors_not_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, y,
|
||||
workspace);
|
||||
}
|
||||
else {
|
||||
hipLaunchKernelGGL(vectors_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, workspace);
|
||||
}
|
||||
|
||||
// Part 2 of dot product computation
|
||||
hipLaunchKernelGGL(dot_reduction<DOT_DIM>, dim3(1), threadsPerBlock, 0, 0, workspace);
|
||||
|
||||
// Copy the final dot product result back from the device
|
||||
HIPCHECK(hipMemcpy(&result, workspace, sizeof(double), hipMemcpyDeviceToHost));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
|
||||
if (nGpu < 1) {
|
||||
failed("No GPU!");
|
||||
}
|
||||
hipDeviceProp_t props = {0};
|
||||
props = {0};
|
||||
HIPCHECK(hipSetDevice(p_gpuDevice));
|
||||
HIPCHECK(hipGetDeviceProperties(&props, p_gpuDevice));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << p_gpuDevice
|
||||
<< std::endl;
|
||||
|
||||
int nx, ny, nz;
|
||||
|
||||
for (unsigned int testCase = 0; testCase < 3; testCase++) {
|
||||
|
||||
vector<int> vectorSize = {200, 300, 50};
|
||||
switch(testCase) {
|
||||
|
||||
case 0:
|
||||
nx = vectorSize[0];
|
||||
ny = vectorSize[0];
|
||||
nz = vectorSize[0];
|
||||
break;
|
||||
|
||||
case 1:
|
||||
nx = vectorSize[1];
|
||||
ny = vectorSize[1];
|
||||
nz = vectorSize[1];
|
||||
break;
|
||||
|
||||
case 2:
|
||||
nx = vectorSize[0];
|
||||
ny = vectorSize[1];
|
||||
nz = vectorSize[2];
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
int trials = 200;
|
||||
|
||||
int size = nx * ny * nz;
|
||||
|
||||
vector<double> hx(size);
|
||||
vector<double> hy(size);
|
||||
double hresult_xy = 0.0;
|
||||
double hresult_xx = 0.0;
|
||||
|
||||
srand(time(NULL));
|
||||
|
||||
for(int i = 0; i < size; ++i) {
|
||||
hx[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
|
||||
hy[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
|
||||
|
||||
hresult_xy += hx[i] * hy[i];
|
||||
hresult_xx += hx[i] * hx[i];
|
||||
}
|
||||
|
||||
double* dx;
|
||||
double* dy;
|
||||
double* workspace;
|
||||
double dresult;
|
||||
|
||||
HIPCHECK(hipMalloc((void**)&dx, sizeof(double) * size));
|
||||
HIPCHECK(hipMalloc((void**)&dy, sizeof(double) * size));
|
||||
HIPCHECK(hipMalloc((void**)&workspace, sizeof(double) * DOT_DIM));
|
||||
|
||||
HIPCHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
|
||||
|
||||
// Warm up
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
|
||||
// Timed run for <x,y>
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for(int i = 0; i < trials; ++i) {
|
||||
computeDotProduct(size, dx, dy, dresult, workspace);
|
||||
}
|
||||
|
||||
float time = 0;
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
time = all_kernel_time.count();
|
||||
|
||||
time /= trials;
|
||||
|
||||
double bw = sizeof(double) * size * 2.0 / 1e9;
|
||||
double gf = 2.0 * size / 1e9;
|
||||
|
||||
cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
|
||||
<< gf/(time / 1e3) << " GFlop/s" << endl;
|
||||
|
||||
// Verify the device kernel results comparing it with the host results
|
||||
if(std::abs(dresult - hresult_xy) > std::max(dresult * 1e-10, 1e-8)) {
|
||||
cerr << " Device results inconsistent with host results. "
|
||||
<< " Host result: " << hresult_xy
|
||||
<< " Device result: " << dresult;
|
||||
}
|
||||
|
||||
// Warm up
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
|
||||
// Timed run for <x,x>
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for(int i = 0; i < trials; ++i) {
|
||||
computeDotProduct(size, dx, dx, dresult, workspace);
|
||||
}
|
||||
|
||||
all_end = std::chrono::steady_clock::now();
|
||||
all_kernel_time = all_end - all_start;
|
||||
time = all_kernel_time.count();
|
||||
|
||||
time /= trials;
|
||||
bw = sizeof(double) * size / 1e9;
|
||||
|
||||
cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
|
||||
<< gf/(time / 1e3) << " GFlop/s" << endl;
|
||||
|
||||
// Verify the device kernel results comparing it with the host results
|
||||
if(abs(dresult - hresult_xx) > max(dresult * 1e-10, 1e-8)) {
|
||||
cerr << " Device results inconsistent with host results"
|
||||
<< " Host result: " << hresult_xy
|
||||
<< " Device result: " << dresult;
|
||||
}
|
||||
|
||||
HIPCHECK(hipFree(dx));
|
||||
HIPCHECK(hipFree(dy));
|
||||
HIPCHECK(hipFree(workspace));
|
||||
|
||||
}
|
||||
passed();
|
||||
return 0;
|
||||
}
|
||||
@@ -1,743 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
#include <hip/hip_vector_types.h>
|
||||
#include <hip/math_functions.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
typedef struct {
|
||||
double x;
|
||||
double y;
|
||||
double width;
|
||||
} coordRec;
|
||||
|
||||
coordRec coords[] = {
|
||||
{0.0, 0.0, 4.0}, // Whole set
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
{-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail
|
||||
};
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*i);
|
||||
float y0 = (float)(yPos + yStep*j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*(float)i);
|
||||
float y0 = (float)(yPos + yStep*(float)j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
float savx = x;
|
||||
float savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16) {
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16) {
|
||||
#endif
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
} while (stay && iter);
|
||||
}
|
||||
|
||||
|
||||
out[tid] = (uint)ccount;
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
|
||||
uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double)(xPos + xStep*i);
|
||||
double y0 = (double)(yPos + yStep*j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
|
||||
T yPos, T xStep, T yStep, uint maxIter) {
|
||||
|
||||
#pragma FP_CONTRACT ON
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
double x0 = (double)(xPos + xStep*(double)i);
|
||||
double y0 = (double)(yPos + yStep*(double)j);
|
||||
|
||||
double x = x0;
|
||||
double y = y0;
|
||||
|
||||
#define FAST
|
||||
uint iter = 0;
|
||||
double tmp;
|
||||
int stay;
|
||||
int ccount = 0;
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
double savx = x;
|
||||
double savy = y;
|
||||
#ifdef FAST
|
||||
for (iter = 0; (iter < maxIter); iter+=16)
|
||||
#else
|
||||
for (iter = 0; stay && (iter < maxIter); iter+=16)
|
||||
#endif
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
// Two iterations
|
||||
tmp = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*x,y,y0);
|
||||
x = fma(-y,y, fma(tmp,tmp,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
|
||||
stay = (x*x+y*y) <= 4.0;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
ccount += stay*16;
|
||||
#ifdef FAST
|
||||
if (!stay)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
// Handle remainder
|
||||
if (!stay) {
|
||||
iter = 16;
|
||||
do {
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
|
||||
tmp = x;
|
||||
x = fma(-y,y, fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
ccount += stay;
|
||||
iter--;
|
||||
savx = (stay ? x : savx);
|
||||
savy = (stay ? y : savy);
|
||||
}
|
||||
while (stay && iter);
|
||||
|
||||
}
|
||||
out[tid] = (uint)ccount;
|
||||
};
|
||||
|
||||
static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
|
||||
|
||||
// Expected results for each kernel run at each coord
|
||||
unsigned long long expectedIters[] = {
|
||||
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull,
|
||||
120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull,
|
||||
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull,
|
||||
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull,
|
||||
120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull,
|
||||
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
|
||||
|
||||
class hipPerfMandelBrot {
|
||||
public:
|
||||
hipPerfMandelBrot();
|
||||
~hipPerfMandelBrot();
|
||||
|
||||
void setNumKernels(unsigned int num) {
|
||||
numKernels = num;
|
||||
}
|
||||
|
||||
unsigned int getNumKernels() {
|
||||
return numKernels;
|
||||
}
|
||||
|
||||
void setNumStreams(unsigned int num) {
|
||||
numStreams = num;
|
||||
}
|
||||
unsigned int getNumStreams() {
|
||||
return numStreams;
|
||||
}
|
||||
|
||||
void open(int deviceID);
|
||||
void run(unsigned int testCase, unsigned int deviceId);
|
||||
void printResults(void);
|
||||
|
||||
// array of funtion pointers
|
||||
typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
// Wrappers
|
||||
void float_mad(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
|
||||
void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt);
|
||||
|
||||
void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams, int blocks,
|
||||
int threads_per_block, int kernelCnt);
|
||||
|
||||
hipStream_t streams[2];
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
|
||||
unsigned int numKernels;
|
||||
unsigned int numStreams;
|
||||
|
||||
std::map<std::string, std::vector<double>> results;
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int maxIter;
|
||||
unsigned int coordIdx;
|
||||
volatile unsigned long long totalIters = 0;
|
||||
int numCUs;
|
||||
static const unsigned int numLoops = 10;
|
||||
};
|
||||
|
||||
|
||||
hipPerfMandelBrot::hipPerfMandelBrot() {}
|
||||
|
||||
hipPerfMandelBrot::~hipPerfMandelBrot() {}
|
||||
|
||||
void hipPerfMandelBrot::open(int deviceId) {
|
||||
|
||||
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
if (nGpu < 1) {
|
||||
failed("No GPU!");
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
|
||||
<< std::endl;
|
||||
|
||||
numCUs = props.multiProcessorCount;
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::printResults() {
|
||||
|
||||
int numkernels = getNumKernels();
|
||||
int numStreams = getNumStreams();
|
||||
|
||||
std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
|
||||
<< numStreams << " streams (s)" << std::endl;
|
||||
|
||||
std::map<std::string, std::vector<double>>:: iterator itr;
|
||||
for (itr = results.begin(); itr != results.end(); itr++) {
|
||||
std::cout << "\n" << std::setw(20) << itr->first << " ";
|
||||
for(auto i : results[itr->first]) {
|
||||
std::cout << std::setw(10) << i << " ";
|
||||
}
|
||||
}
|
||||
results.clear();
|
||||
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
|
||||
// Wrappers for the kernel launches
|
||||
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter, hipStream_t* streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
|
||||
maxIter);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter, hipStream_t * streams,
|
||||
int blocks, int threads_per_block, int kernelCnt) {
|
||||
|
||||
int streamCnt = getNumStreams();
|
||||
hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
|
||||
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
|
||||
|
||||
unsigned int numStreams = getNumStreams();
|
||||
coordIdx = testCase % numCoords;
|
||||
|
||||
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
|
||||
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
|
||||
|
||||
// Maximum iteration count
|
||||
maxIter = 32768;
|
||||
|
||||
uint * hPtr[numKernels];
|
||||
uint * dPtr[numKernels];
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
for (uint i = 0; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Allocate memory on the host and device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
|
||||
}
|
||||
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
int threads = (bufSize/sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
// Synchronize to make sure all the copies are completed
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
int kernelIdx;
|
||||
if(testCase == 0 || testCase == 5 || testCase == 10) {
|
||||
kernelIdx = 0;
|
||||
}
|
||||
|
||||
else if(testCase == 1 || testCase == 6 || testCase == 11) {
|
||||
kernelIdx = 1;
|
||||
}
|
||||
else if(testCase == 2 || testCase == 7 || testCase == 12) {
|
||||
kernelIdx = 2;
|
||||
}
|
||||
else if(testCase == 3 || testCase == 8 || testCase == 13){
|
||||
kernelIdx = 3;
|
||||
}
|
||||
|
||||
|
||||
double totalTime = 0.0;
|
||||
|
||||
for (unsigned int k = 0; k < numLoops; k++) {
|
||||
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
|
||||
testCase == 5 || testCase == 6 || testCase == 7 ||
|
||||
testCase == 10 || testCase == 11 || testCase == 12)) {
|
||||
float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
|
||||
}
|
||||
|
||||
|
||||
else {
|
||||
double xStep = coords[coordIdx].width / (double)width_;
|
||||
double yStep = -coords[coordIdx].width / (double)width_;
|
||||
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
|
||||
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
|
||||
threads_per_block, i);
|
||||
}
|
||||
|
||||
|
||||
// Synchronize all the concurrent streams to have completed execution
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
totalTime += all_kernel_time.count();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// Copy data back from device to the host
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
checkData(hPtr[i]);
|
||||
|
||||
int j =0;
|
||||
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
|
||||
j++;
|
||||
}
|
||||
|
||||
if(j==30) {
|
||||
std::cout << "Incorrect iteration count detected. ";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Compute GFLOPS. There are 7 FLOPs per iteration
|
||||
double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
|
||||
(totalTime / (double)numLoops);
|
||||
|
||||
|
||||
std::vector<std::string> kernelName = {"float", "float_unroll",
|
||||
"double", "double_unroll"};
|
||||
|
||||
// Print results except for Warm-up kernel
|
||||
if(testCase!=100) {
|
||||
results[kernelName[testCase % 4]].push_back(perf);
|
||||
}
|
||||
|
||||
|
||||
for(uint i = 0 ; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Free host and device memory
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipHostFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void hipPerfMandelBrot::checkData(uint *ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
hipPerfMandelBrot mandelbrotCompute;
|
||||
int deviceId = 0;
|
||||
|
||||
mandelbrotCompute.open(deviceId);
|
||||
|
||||
for (unsigned int testCase = 0; testCase < 3; testCase++) {
|
||||
|
||||
|
||||
switch (testCase) {
|
||||
|
||||
|
||||
case 0: {
|
||||
// Warmup-kernel - default stream executes serially
|
||||
mandelbrotCompute.setNumStreams(1);
|
||||
mandelbrotCompute.setNumKernels(1);
|
||||
mandelbrotCompute.run(100/*Random number*/, deviceId);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
case 1: {
|
||||
// run all - sync
|
||||
int i = 0;
|
||||
do {
|
||||
mandelbrotCompute.setNumStreams(1);
|
||||
mandelbrotCompute.setNumKernels(1);
|
||||
mandelbrotCompute.run(i, deviceId);
|
||||
i++;
|
||||
}while(i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
case 2: {
|
||||
// run all - async
|
||||
int i = 0;
|
||||
do {
|
||||
mandelbrotCompute.setNumStreams(2);
|
||||
mandelbrotCompute.setNumKernels(2);
|
||||
mandelbrotCompute.run(i, deviceId);
|
||||
i++;
|
||||
}while(i < 12);
|
||||
mandelbrotCompute.printResults();
|
||||
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
|
||||
default: {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -1,284 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
|
||||
typedef struct {
|
||||
double x;
|
||||
double y;
|
||||
double width;
|
||||
} coordRec;
|
||||
|
||||
static coordRec coords[] = {
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
};
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep,
|
||||
float yStep, uint maxIter) {
|
||||
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % width;
|
||||
int j = tid / width;
|
||||
float x0 = (float)(xPos + xStep*i);
|
||||
float y0 = (float)(yPos + yStep*j);
|
||||
|
||||
float x = x0;
|
||||
float y = y0;
|
||||
|
||||
uint iter = 0;
|
||||
float tmp;
|
||||
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
|
||||
tmp = x;
|
||||
x = fma(-y,y,fma(x,x,x0));
|
||||
y = fma(2.0f*tmp,y,y0);
|
||||
}
|
||||
|
||||
out[tid] = iter;
|
||||
};
|
||||
|
||||
class hipPerfDeviceConcurrency {
|
||||
public:
|
||||
hipPerfDeviceConcurrency();
|
||||
~hipPerfDeviceConcurrency();
|
||||
|
||||
void setNumGpus(unsigned int num) {
|
||||
numDevices = num;
|
||||
}
|
||||
unsigned int getNumGpus() {
|
||||
return numDevices;
|
||||
}
|
||||
|
||||
void open(void);
|
||||
void close(void);
|
||||
void run(unsigned int testCase, int numGpus);
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
|
||||
unsigned int numDevices;
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int coordIdx;
|
||||
unsigned long long totalIters = 0;
|
||||
};
|
||||
|
||||
|
||||
hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
|
||||
|
||||
hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
|
||||
|
||||
void hipPerfDeviceConcurrency::open(void) {
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
setNumGpus(nGpu);
|
||||
if (nGpu < 1) {
|
||||
failed("No GPU!");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void hipPerfDeviceConcurrency::close() {
|
||||
}
|
||||
|
||||
void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
|
||||
|
||||
|
||||
static int deviceId;
|
||||
uint * hPtr[numGpus];
|
||||
uint * dPtr[numGpus];
|
||||
hipStream_t streams[numGpus];
|
||||
int numCUs[numGpus];
|
||||
unsigned int maxIter[numGpus];
|
||||
unsigned long long expectedIters[numGpus];
|
||||
|
||||
int threads, threads_per_block, blocks;
|
||||
float xStep, yStep, xPos, yPos;
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, i));
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device ID: "
|
||||
<< i << std::endl;
|
||||
}
|
||||
|
||||
numCUs[i] = props.multiProcessorCount;
|
||||
int clkFrequency = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
|
||||
|
||||
clkFrequency =(unsigned int)clkFrequency/1000;
|
||||
|
||||
// Maximum iteration count
|
||||
// maxIter = 8388608 * (engine_clock / 1000).serial execution
|
||||
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
|
||||
maxIter[i] = (maxIter[i] + 15) & ~15;
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
|
||||
// Allocate memory on the host and device
|
||||
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
threads = (bufSize/sizeof(uint));
|
||||
threads_per_block = 64;
|
||||
blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory from host to device
|
||||
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
|
||||
|
||||
}
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
|
||||
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
|
||||
|
||||
}
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
HIPCHECK(hipStreamSynchronize(0));
|
||||
}
|
||||
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
|
||||
for(int i = 0; i < numGpus; i++) {
|
||||
|
||||
if(testCase != 0) {
|
||||
deviceId = i;
|
||||
}
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
|
||||
// Copy data back from device to the host
|
||||
HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
|
||||
|
||||
checkData(hPtr[i]);
|
||||
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
|
||||
|
||||
if (testCase != 0) {
|
||||
checkData(hPtr[i]);
|
||||
if(totalIters != expectedIters[i]) {
|
||||
std::cout << "Incorrect iteration count detected" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
|
||||
// Free host and device memory
|
||||
HIPCHECK(hipHostFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
|
||||
<< all_kernel_time.count() << " (s) " << '\n' << std::endl;
|
||||
}
|
||||
|
||||
if(testCase == 0) {
|
||||
deviceId++;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
for (unsigned int i = 0; i < width_ * width_ ; i++) {
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_ * width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
hipPerfDeviceConcurrency deviceConcurrency;
|
||||
|
||||
deviceConcurrency.open();
|
||||
|
||||
int nGpu = deviceConcurrency.getNumGpus();
|
||||
|
||||
// testCase = 0 refers to warmup kernel run
|
||||
int testCase = 0;
|
||||
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
// Warm-up kernel on all devices
|
||||
deviceConcurrency.run(testCase, 1);
|
||||
}
|
||||
|
||||
// Time for kernel on 1 device
|
||||
deviceConcurrency.run(++testCase, 1);
|
||||
|
||||
// Time for kernel on all available devices
|
||||
deviceConcurrency.run(++testCase, nGpu);
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -1,432 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
#include <hip/hip_vector_types.h>
|
||||
|
||||
#ifdef __HIP_PLATFORM_NVIDIA__
|
||||
inline __device__ float4 operator*(float s, float4 a)
|
||||
{
|
||||
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
|
||||
}
|
||||
inline __device__ float4 operator*(float4 a, float4 b)
|
||||
{
|
||||
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
|
||||
}
|
||||
inline __device__ float4 operator+(float4 a, float4 b)
|
||||
{
|
||||
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
|
||||
}
|
||||
inline __device__ float4 operator-(float4 a, float4 b)
|
||||
{
|
||||
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
double x;
|
||||
double y;
|
||||
double width;
|
||||
} coordRec;
|
||||
|
||||
static coordRec coords[] = {
|
||||
{0.0, 0.0, 0.00001}, // All black
|
||||
};
|
||||
|
||||
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
|
||||
|
||||
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos,
|
||||
float xStep, float yStep, uint maxIter) {
|
||||
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
int i = tid % (width/4);
|
||||
int j = tid / (width/4);
|
||||
int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
|
||||
int4 vecj = make_int4(j, j, j, j);
|
||||
float4 x0;
|
||||
x0.x = (float)(xPos + xStep*veci.x);
|
||||
x0.y = (float)(xPos + xStep*veci.y);
|
||||
x0.z = (float)(xPos + xStep*veci.z);
|
||||
x0.w = (float)(xPos + xStep*veci.w);
|
||||
float4 y0;
|
||||
y0.x = (float)(yPos + yStep*vecj.x);
|
||||
y0.y = (float)(yPos + yStep*vecj.y);
|
||||
y0.z = (float)(yPos + yStep*vecj.z);
|
||||
y0.w = (float)(yPos + yStep*vecj.w);
|
||||
float4 x = x0;
|
||||
float4 y = y0;
|
||||
uint iter = 0;
|
||||
float4 tmp;
|
||||
int4 stay;
|
||||
int4 ccount = make_int4(0, 0, 0, 0);
|
||||
float4 savx = x;
|
||||
float4 savy = y;
|
||||
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
|
||||
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
|
||||
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
|
||||
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
|
||||
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
|
||||
iter+=16) {
|
||||
x = savx;
|
||||
y = savy;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
// Two iterations
|
||||
tmp = x*x + x0 - y*y;
|
||||
y = 2.0f * x * y + y0;
|
||||
x = tmp*tmp + x0 - y*y;
|
||||
y = 2.0f * tmp * y + y0;
|
||||
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
|
||||
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
|
||||
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
|
||||
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
|
||||
savx.x = (bool)(stay.x ? x.x : savx.x);
|
||||
savx.y = (bool)(stay.y ? x.y : savx.y);
|
||||
savx.z = (bool)(stay.z ? x.z : savx.z);
|
||||
savx.w = (bool)(stay.w ? x.w : savx.w);
|
||||
savy.x = (bool)(stay.x ? y.x : savy.x);
|
||||
savy.y = (bool)(stay.y ? y.y : savy.y);
|
||||
savy.z = (bool)(stay.z ? y.z : savy.z);
|
||||
savy.w = (bool)(stay.w ? y.w : savy.w);
|
||||
ccount.x -= stay.x*16;
|
||||
ccount.y -= stay.y*16;
|
||||
ccount.z -= stay.z*16;
|
||||
ccount.w -= stay.w*16;
|
||||
}
|
||||
// Handle remainder
|
||||
if (!(stay.x & stay.y & stay.z & stay.w))
|
||||
{
|
||||
iter = 16;
|
||||
do
|
||||
{
|
||||
x = savx;
|
||||
y = savy;
|
||||
stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter);
|
||||
stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter);
|
||||
stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter);
|
||||
stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter);
|
||||
tmp = x;
|
||||
x = x*x + x0 - y*y;
|
||||
y = 2.0f*tmp*y + y0;
|
||||
ccount.x += stay.x;
|
||||
ccount.y += stay.y;
|
||||
ccount.z += stay.z;
|
||||
ccount.w += stay.w;
|
||||
iter--;
|
||||
savx.x = (stay.x ? x.x : savx.x);
|
||||
savx.y = (stay.y ? x.y : savx.y);
|
||||
savx.z = (stay.z ? x.z : savx.z);
|
||||
savx.w = (stay.w ? x.w : savx.w);
|
||||
savy.x = (stay.x ? y.x : savy.x);
|
||||
savy.y = (stay.y ? y.y : savy.y);
|
||||
savy.z = (stay.z ? y.z : savy.z);
|
||||
savy.w = (stay.w ? y.w : savy.w);
|
||||
} while ((stay.x | stay.y | stay.z | stay.w) && iter);
|
||||
}
|
||||
uint4 *vecOut = (uint4 *)out;
|
||||
vecOut[tid].x = (uint)(ccount.x);
|
||||
vecOut[tid].y = (uint)(ccount.y);
|
||||
vecOut[tid].z = (uint)(ccount.z);
|
||||
vecOut[tid].w = (uint)(ccount.w);
|
||||
}
|
||||
|
||||
class hipPerfStreamConcurrency {
|
||||
public:
|
||||
hipPerfStreamConcurrency();
|
||||
~hipPerfStreamConcurrency();
|
||||
|
||||
void setNumKernels(unsigned int num) {
|
||||
numKernels = num;
|
||||
}
|
||||
void setNumStreams(unsigned int num) {
|
||||
numStreams = num;
|
||||
}
|
||||
unsigned int getNumStreams() {
|
||||
return numStreams;
|
||||
}
|
||||
|
||||
unsigned int getNumKernels() {
|
||||
return numKernels;
|
||||
}
|
||||
|
||||
void open(int deviceID);
|
||||
void run(unsigned int testCase, unsigned int deviceId);
|
||||
void close(void);
|
||||
|
||||
private:
|
||||
void setData(void *ptr, unsigned int value);
|
||||
void checkData(uint *ptr);
|
||||
|
||||
unsigned int numKernels;
|
||||
unsigned int numStreams;
|
||||
|
||||
unsigned int width_;
|
||||
unsigned int bufSize;
|
||||
unsigned int maxIter;
|
||||
unsigned int coordIdx;
|
||||
unsigned long long totalIters;
|
||||
int numCUs;
|
||||
|
||||
};
|
||||
|
||||
|
||||
hipPerfStreamConcurrency::hipPerfStreamConcurrency() {}
|
||||
|
||||
hipPerfStreamConcurrency::~hipPerfStreamConcurrency() {}
|
||||
|
||||
void hipPerfStreamConcurrency::open(int deviceId) {
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
if (nGpu < 1) {
|
||||
failed("No GPU!");
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl;
|
||||
|
||||
numCUs = props.multiProcessorCount;
|
||||
}
|
||||
|
||||
|
||||
void hipPerfStreamConcurrency::close() {
|
||||
}
|
||||
|
||||
|
||||
void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) {
|
||||
|
||||
int clkFrequency = 0;
|
||||
unsigned int numStreams = getNumStreams();
|
||||
unsigned int numKernels = getNumKernels();
|
||||
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
|
||||
|
||||
clkFrequency =(unsigned int)clkFrequency/1000;
|
||||
|
||||
// Maximum iteration count
|
||||
// maxIter = 8388608 * (engine_clock / 1000).serial execution
|
||||
maxIter = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
|
||||
maxIter = (maxIter + 15) & ~15;
|
||||
|
||||
hipStream_t streams[numStreams];
|
||||
|
||||
uint * hPtr[numKernels];
|
||||
uint * dPtr[numKernels];
|
||||
|
||||
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
|
||||
width_ = 256;
|
||||
|
||||
bufSize = width_ * sizeof(uint);
|
||||
|
||||
// Create streams for concurrency
|
||||
for (uint i = 0; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Allocate memory on the host and device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
|
||||
setData(hPtr[i], 0xdeadbeef);
|
||||
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
|
||||
}
|
||||
|
||||
|
||||
// Prepare kernel launch parameters
|
||||
int threads = (bufSize/sizeof(uint));
|
||||
int threads_per_block = 64;
|
||||
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
|
||||
|
||||
coordIdx = testCase % numCoords;
|
||||
float xStep = (float)(coords[coordIdx].width / (double)width_);
|
||||
float yStep = (float)(-coords[coordIdx].width / (double)width_);
|
||||
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
|
||||
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
|
||||
|
||||
// Copy memory asynchronously and concurrently from host to device
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams]));
|
||||
}
|
||||
|
||||
|
||||
// Synchronize to make sure all the copies are completed
|
||||
for(uint i = 0; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamSynchronize(streams[i]));
|
||||
}
|
||||
|
||||
// Warm-up kernel with lower iteration
|
||||
if (testCase == 0) {
|
||||
maxIter = 256;
|
||||
}
|
||||
|
||||
// Time the kernel execution
|
||||
auto all_start = std::chrono::steady_clock::now();
|
||||
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i%numStreams],
|
||||
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
|
||||
}
|
||||
|
||||
|
||||
// Synchronize all the concurrent streans to have completed execution
|
||||
for(uint i = 0; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamSynchronize(streams[i]));
|
||||
}
|
||||
|
||||
|
||||
auto all_end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> all_kernel_time = all_end - all_start;
|
||||
|
||||
// Copy data back from device to the host
|
||||
for(uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize, streams[i % numStreams]));
|
||||
}
|
||||
|
||||
|
||||
if (testCase != 0) {
|
||||
std::cout <<"Measured time for " << numKernels <<" kernels (s) on " << numStreams <<" stream (s): "
|
||||
<< all_kernel_time.count() << std::endl;
|
||||
}
|
||||
|
||||
|
||||
unsigned long long expected =
|
||||
(unsigned long long)width_ * (unsigned long long)maxIter;
|
||||
|
||||
for(uint i = 0 ; i < numStreams; i++) {
|
||||
HIPCHECK(hipStreamDestroy(streams[i]));
|
||||
}
|
||||
|
||||
|
||||
// Free host and device memory
|
||||
for (uint i = 0; i < numKernels; i++) {
|
||||
HIPCHECK(hipHostFree(hPtr[i]));
|
||||
HIPCHECK(hipFree(dPtr[i]));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
|
||||
unsigned int *ptr2 = (unsigned int *)ptr;
|
||||
for (unsigned int i = 0; i < width_ ; i++) {
|
||||
ptr2[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void hipPerfStreamConcurrency::checkData(uint *ptr) {
|
||||
totalIters = 0;
|
||||
for (unsigned int i = 0; i < width_; i++) {
|
||||
totalIters += ptr[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
hipPerfStreamConcurrency streamConcurrency;
|
||||
int deviceId = 0;
|
||||
|
||||
streamConcurrency.open(deviceId);
|
||||
|
||||
for (unsigned int testCase = 0; testCase < 5; testCase++) {
|
||||
|
||||
|
||||
switch (testCase) {
|
||||
|
||||
|
||||
case 0:
|
||||
// Warm-up kernel
|
||||
streamConcurrency.setNumStreams(1);
|
||||
streamConcurrency.setNumKernels(1);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
// default stream executes serially
|
||||
streamConcurrency.setNumStreams(1);
|
||||
streamConcurrency.setNumKernels(1);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
// 2-way concurrency
|
||||
streamConcurrency.setNumStreams(2);
|
||||
streamConcurrency.setNumKernels(2);
|
||||
break;
|
||||
|
||||
case 3:
|
||||
// 4-way concurrency
|
||||
streamConcurrency.setNumStreams(4);
|
||||
streamConcurrency.setNumKernels(4);
|
||||
break;
|
||||
|
||||
case 4:
|
||||
streamConcurrency.setNumStreams(2);
|
||||
streamConcurrency.setNumKernels(4);
|
||||
break;
|
||||
|
||||
case 5:
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
streamConcurrency.run(testCase, deviceId);
|
||||
|
||||
}
|
||||
|
||||
|
||||
passed();
|
||||
}
|
||||
@@ -1,131 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../src/test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include "test_common.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define BufSize 0x1000
|
||||
#define Iterations 0x100
|
||||
#define TotalStreams 4
|
||||
#define TotalBufs 4
|
||||
|
||||
|
||||
class hipPerfStreamCreateCopyDestroy {
|
||||
private:
|
||||
unsigned int numBuffers_;
|
||||
unsigned int numStreams_;
|
||||
const size_t totalStreams_[TotalStreams];
|
||||
const size_t totalBuffers_[TotalBufs];
|
||||
public:
|
||||
hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
|
||||
totalStreams_{1, 2, 4, 8},
|
||||
totalBuffers_{1, 100, 1000, 5000} {};
|
||||
~hipPerfStreamCreateCopyDestroy() {};
|
||||
void open(int deviceID);
|
||||
void run(unsigned int testNumber);
|
||||
};
|
||||
|
||||
void hipPerfStreamCreateCopyDestroy::open(int deviceId) {
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
if (nGpu < 1) {
|
||||
failed("No GPU!");
|
||||
}
|
||||
|
||||
HIPCHECK(hipSetDevice(deviceId));
|
||||
hipDeviceProp_t props = {0};
|
||||
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
|
||||
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
|
||||
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl;
|
||||
}
|
||||
|
||||
void hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
|
||||
numStreams_ = totalStreams_[testNumber % TotalStreams];
|
||||
size_t iter = Iterations / (numStreams_ * ((size_t)1 << (testNumber / TotalBufs + 1)));
|
||||
hipStream_t streams[numStreams_];
|
||||
|
||||
numBuffers_ = totalBuffers_[testNumber / TotalBufs];
|
||||
float* dSrc[numBuffers_];
|
||||
size_t nBytes = BufSize * sizeof(float);
|
||||
|
||||
for (size_t b = 0; b < numBuffers_; ++b) {
|
||||
HIPCHECK(hipMalloc(&dSrc[b], nBytes));
|
||||
}
|
||||
|
||||
float* hSrc;
|
||||
hSrc = new float[nBytes];
|
||||
HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess);
|
||||
for (size_t i = 0; i < BufSize; i++) {
|
||||
hSrc[i] = 1.618f + i;
|
||||
}
|
||||
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
|
||||
for (size_t i = 0; i < iter; ++i) {
|
||||
for (size_t s = 0; s < numStreams_; ++s) {
|
||||
HIPCHECK(hipStreamCreate(&streams[s]));
|
||||
}
|
||||
|
||||
for (size_t s = 0; s < numStreams_; ++s) {
|
||||
for (size_t b = 0; b < numBuffers_; ++b) {
|
||||
HIPCHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t s = 0; s < numStreams_; ++s) {
|
||||
HIPCHECK(hipStreamDestroy(streams[s]));
|
||||
}
|
||||
}
|
||||
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
std::chrono::duration<double> diff = end - start;
|
||||
|
||||
auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));
|
||||
|
||||
cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
|
||||
<< setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
|
||||
<< iter << " iterations " << time << " (ms) " << endl;
|
||||
|
||||
delete [] hSrc;
|
||||
for (size_t b = 0; b < numBuffers_; ++b) {
|
||||
HIPCHECK(hipFree(dSrc[b]));
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
hipPerfStreamCreateCopyDestroy streamCCD;
|
||||
|
||||
int deviceId = 0;
|
||||
streamCCD.open(deviceId);
|
||||
|
||||
for (auto testCase = 0; testCase < TotalStreams * TotalBufs; testCase++) {
|
||||
streamCCD.run(testCase);
|
||||
}
|
||||
|
||||
passed();
|
||||
}
|
||||
Ссылка в новой задаче
Block a user