SWDEV-546217 Complete hip-test Port to Catch2 Framework [Stream and Compute Folder] (#559)

* SWDEV-546498 hipPerfDeviceConcurrency

* SWDEV-546500 hipPerfStreamConcurrency

* SWDEV-546502 hipPerfStreamCreateCopyDestroy.c

* SWDEV-546479 hipPerfDotProduct

* SWDEV-546482 hipPerfMandelbrot
Этот коммит содержится в:
Luo, Phoebe
2025-08-15 15:38:33 -04:00
коммит произвёл GitHub
родитель d227a8110c
Коммит 9fdc9a98b7
10 изменённых файлов: 599 добавлений и 2617 удалений
+111 -115
Просмотреть файл
@@ -18,10 +18,10 @@
*/
/**
* @addtogroup hipPerfDotProduct hipPerfDotProduct
* @{
* @ingroup perfComputeTest
*/
* @addtogroup hipPerfDotProduct hipPerfDotProduct
* @{
* @ingroup perfComputeTest
*/
#include <hip_test_common.hh>
#include <vector>
@@ -31,11 +31,9 @@
using namespace std;
template <unsigned int BLOCKSIZE>
__launch_bounds__(BLOCKSIZE)
__global__ void vectors_not_equal(int n,
const double* __restrict__ x,
const double* __restrict__ y,
double* __restrict__ workspace) {
__launch_bounds__(BLOCKSIZE) __global__
void vectors_not_equal(int n, const double* __restrict__ x, const double* __restrict__ y,
double* __restrict__ workspace) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
double sum = 0.0;
@@ -93,9 +91,8 @@ __global__ void vectors_not_equal(int n,
}
template <unsigned int BLOCKSIZE>
__launch_bounds__(BLOCKSIZE)
__global__ void vectors_equal(int n, const double* __restrict__ x,
double* __restrict__ workspace) {
__launch_bounds__(BLOCKSIZE) __global__
void vectors_equal(int n, const double* __restrict__ x, double* __restrict__ workspace) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
double sum = 0.0;
@@ -129,7 +126,7 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,
__syncthreads();
if (threadIdx.x < 8) {
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
}
__syncthreads();
@@ -149,12 +146,11 @@ __global__ void vectors_equal(int n, const double* __restrict__ x,
if (threadIdx.x == 0) {
workspace[blockIdx.x] = sdata[0];
}
}
}
template <unsigned int BLOCKSIZE>
__launch_bounds__(BLOCKSIZE)
__global__ void dot_reduction(double* __restrict__ workspace) {
__launch_bounds__(BLOCKSIZE) __global__ void dot_reduction(double* __restrict__ workspace) {
__shared__ double sdata[BLOCKSIZE];
sdata[threadIdx.x] = workspace[threadIdx.x];
@@ -187,7 +183,8 @@ __global__ void dot_reduction(double* __restrict__ workspace) {
if (threadIdx.x < 4) {
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
} __syncthreads();
}
__syncthreads();
if (threadIdx.x < 2) {
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
@@ -203,8 +200,7 @@ __global__ void dot_reduction(double* __restrict__ workspace) {
}
}
void computeDotProduct(int n, const double* x, const double* y, double& result,
double* workspace) {
void computeDotProduct(int n, const double* x, const double* y, double& result, double* workspace) {
dim3 blocks(DOT_DIM);
dim3 threadsPerBlock(DOT_DIM);
@@ -225,16 +221,16 @@ void computeDotProduct(int n, const double* x, const double* y, double& result,
}
/**
* Test Description
* ------------------------
* - Verify the device kernel results comparing it with the host results.
* Test source
* ------------------------
* - perftests/compute/hipPerfDotProduct.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
* Test Description
* ------------------------
* - Verify the device kernel results comparing it with the host results.
* Test source
* ------------------------
* - perftests/compute/hipPerfDotProduct.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Perf_hipPerfDotProduct") {
int nGpu = 0;
@@ -252,120 +248,120 @@ TEST_CASE("Perf_hipPerfDotProduct") {
for (unsigned int testCase = 0; testCase < 3; testCase++) {
vector<int> vectorSize = {200, 300, 50};
switch (testCase) {
case 0:
nx = vectorSize[0];
ny = vectorSize[0];
nz = vectorSize[0];
break;
case 0:
nx = vectorSize[0];
ny = vectorSize[0];
nz = vectorSize[0];
break;
case 1:
nx = vectorSize[1];
ny = vectorSize[1];
nz = vectorSize[1];
break;
case 1:
nx = vectorSize[1];
ny = vectorSize[1];
nz = vectorSize[1];
break;
case 2:
nx = vectorSize[0];
ny = vectorSize[1];
nz = vectorSize[2];
break;
case 2:
nx = vectorSize[0];
ny = vectorSize[1];
nz = vectorSize[2];
break;
default:
break;
}
default:
break;
}
int trials = 200;
int size = nx * ny * nz;
int trials = 200;
int size = nx * ny * nz;
vector<double> hx(size);
vector<double> hy(size);
double hresult_xy = 0.0;
double hresult_xx = 0.0;
vector<double> hx(size);
vector<double> hy(size);
double hresult_xy = 0.0;
double hresult_xx = 0.0;
srand(time(NULL));
srand(time(NULL));
for (int i = 0; i < size; ++i) {
hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
for (int i = 0; i < size; ++i) {
hx[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
hy[i] = 2.0 * static_cast<double>(rand()) / static_cast<double>(RAND_MAX) - 1.0;
hresult_xy += hx[i] * hy[i];
hresult_xx += hx[i] * hx[i];
}
hresult_xy += hx[i] * hy[i];
hresult_xx += hx[i] * hx[i];
}
double* dx;
double* dy;
double* workspace;
double dresult;
double* dx;
double* dy;
double* workspace;
double dresult;
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dx), sizeof(double) * size));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dy), sizeof(double) * size));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&workspace), sizeof(double) * DOT_DIM));
HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
// Warm up
computeDotProduct(size, dx, dy, dresult, workspace);
computeDotProduct(size, dx, dy, dresult, workspace);
computeDotProduct(size, dx, dy, dresult, workspace);
// Timed run for <x,y>
HIP_CHECK(hipDeviceSynchronize());
auto all_start = std::chrono::steady_clock::now();
for (int i = 0; i < trials; ++i) {
// Warm up
computeDotProduct(size, dx, dy, dresult, workspace);
computeDotProduct(size, dx, dy, dresult, workspace);
computeDotProduct(size, dx, dy, dresult, workspace);
}
float time = 0;
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
time = all_kernel_time.count();
// Timed run for <x,y>
HIP_CHECK(hipDeviceSynchronize());
auto all_start = std::chrono::steady_clock::now();
time /= trials;
for (int i = 0; i < trials; ++i) {
computeDotProduct(size, dx, dy, dresult, workspace);
}
double bw = sizeof(double) * size * 2.0 / 1e9;
double gf = 2.0 * size / 1e9;
float time = 0;
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
time = all_kernel_time.count();
cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
<< gf/(time / 1e3) << " GFlop/s" << endl;
time /= trials;
// Verify the device kernel results comparing it with the host results
REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8));
double bw = sizeof(double) * size * 2.0 / 1e9;
double gf = 2.0 * size / 1e9;
// Warm up
computeDotProduct(size, dx, dx, dresult, workspace);
computeDotProduct(size, dx, dx, dresult, workspace);
computeDotProduct(size, dx, dx, dresult, workspace);
CONSOLE_PRINT("\nVector Size: %d\n[ddot] <x,y> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", size,
time, bw / (time / 1e3), gf / (time / 1e3));
// Timed run for <x,x>
HIP_CHECK(hipDeviceSynchronize());
all_start = std::chrono::steady_clock::now();
// Verify the device kernel results comparing it with the host results
REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8));
for (int i = 0; i < trials; ++i) {
// Warm up
computeDotProduct(size, dx, dx, dresult, workspace);
computeDotProduct(size, dx, dx, dresult, workspace);
computeDotProduct(size, dx, dx, dresult, workspace);
}
all_end = std::chrono::steady_clock::now();
all_kernel_time = all_end - all_start;
time = all_kernel_time.count();
// Timed run for <x,x>
HIP_CHECK(hipDeviceSynchronize());
all_start = std::chrono::steady_clock::now();
time /= trials;
bw = sizeof(double) * size / 1e9;
for (int i = 0; i < trials; ++i) {
computeDotProduct(size, dx, dx, dresult, workspace);
}
cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
<< gf/(time / 1e3) << " GFlop/s" << endl;
all_end = std::chrono::steady_clock::now();
all_kernel_time = all_end - all_start;
time = all_kernel_time.count();
// Verify the device kernel results comparing it with the host results
REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
time /= trials;
bw = sizeof(double) * size / 1e9;
HIP_CHECK(hipFree(dx));
HIP_CHECK(hipFree(dy));
HIP_CHECK(hipFree(workspace));
CONSOLE_PRINT("[ddot] <x,y> %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", time, bw / (time / 1e3),
gf / (time / 1e3));
// Verify the device kernel results comparing it with the host results
REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8));
HIP_CHECK(hipFree(dx));
HIP_CHECK(hipFree(dy));
HIP_CHECK(hipFree(workspace));
}
}
/**
* End doxygen group perfComputeTest.
* @}
*/
* End doxygen group perfComputeTest.
* @}
*/
+264 -278
Просмотреть файл
@@ -18,10 +18,10 @@
*/
/**
* @addtogroup hipPerfMandelbrot hipPerfMandelbrot
* @{
* @ingroup perfComputeTest
*/
* @addtogroup hipPerfMandelbrot hipPerfMandelbrot
* @{
* @ingroup perfComputeTest
*/
#include <hip_test_common.hh>
#include <hip/hip_vector_types.h>
@@ -45,36 +45,35 @@ coordRec coords[] = {
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
template <typename T>
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos,
T xStep, T yStep, uint maxIter) {
__global__ void float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = static_cast<float>(xPos + xStep*i);
float y0 = static_cast<float>(yPos + yStep*j);
float x0 = static_cast<float>(xPos + xStep * i);
float y0 = static_cast<float>(yPos + yStep * j);
float x = x0;
float y = y0;
uint iter = 0;
float tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*tmp, y, y0);
y = fma(2.0f * tmp, y, y0);
}
out[tid] = iter;
}
template <typename T>
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
__global__ void float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = static_cast<float>(xPos + xStep*static_cast<float>(i));
float y0 = static_cast<float>(yPos + yStep*static_cast<float>(j));
float x0 = static_cast<float>(xPos + xStep * static_cast<float>(i));
float y0 = static_cast<float>(yPos + yStep * static_cast<float>(j));
float x = x0;
float y = y0;
@@ -84,72 +83,71 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
float tmp;
int stay;
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
stay = (x * x + y * y) <= 4.0;
float savx = x;
float savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16) {
for (iter = 0; (iter < maxIter); iter += 16) {
#else
for (iter = 0; stay && (iter < maxIter); iter+=16) {
for (iter = 0; stay && (iter < maxIter); iter += 16) {
#endif
x = savx;
y = savy;
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
stay = (x*x+y*y) <= 4.0;
stay = (x * x + y * y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
ccount += stay * 16;
#ifdef FAST
if (!stay)
break;
if (!stay) break;
#endif
}
// Handle remainder
@@ -158,10 +156,10 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*tmp, y, y0);
x = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * tmp, y, y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
@@ -172,36 +170,36 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
}
template <typename T>
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
__global__ void double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
double x0 = static_cast<double>(xPos + xStep*i);
double y0 = static_cast<double>(yPos + yStep*j);
double x0 = static_cast<double>(xPos + xStep * i);
double y0 = static_cast<double>(yPos + yStep * j);
double x = x0;
double y = y0;
uint iter = 0;
double tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y, y,fma(x, x, x0));
y = fma(2.0f*tmp, y, y0);
x = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * tmp, y, y0);
}
out[tid] = iter;
};
template <typename T>
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
__global__ void double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
double x0 = static_cast<double>(xPos + xStep*static_cast<double>(i));
double y0 = static_cast<double>(yPos + yStep*static_cast<double>(j));
double x0 = static_cast<double>(xPos + xStep * static_cast<double>(i));
double y0 = static_cast<double>(yPos + yStep * static_cast<double>(j));
double x = x0;
double y = y0;
@@ -211,13 +209,13 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
double tmp;
int stay;
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
stay = (x * x + y * y) <= 4.0;
double savx = x;
double savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16)
for (iter = 0; (iter < maxIter); iter += 16)
#else
for (iter = 0; stay && (iter < maxIter); iter+=16)
for (iter = 0; stay && (iter < maxIter); iter += 16)
#endif
{
x = savx;
@@ -225,141 +223,131 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x,y,y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
// Two iterations
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f*tmp, y, y0);
tmp = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * x, y, y0);
x = fma(-y, y, fma(tmp, tmp, x0));
y = fma(2.0f * tmp, y, y0);
stay = (x*x+y*y) <= 4.0;
stay = (x * x + y * y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
ccount += stay * 16;
#ifdef FAST
if (!stay)
break;
if (!stay) break;
#endif
}
}
// Handle remainder
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y,y, fma(x, x, x0));
y = fma(2.0f*tmp,y,y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
}
while (stay && iter);
}
out[tid] = (uint)ccount;
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y, y, fma(x, x, x0));
y = fma(2.0f * tmp, y, y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
} while (stay && iter);
}
out[tid] = (uint)ccount;
};
// Expected results for each kernel run at each coord
unsigned long long expectedIters[] = {
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull,
120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull,
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull,
120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull,
203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull,
203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull,
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull,
203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
class hipPerfMandelBrot {
public:
hipPerfMandelBrot();
~hipPerfMandelBrot();
void setNumKernels(unsigned int num) {
numKernels = num;
}
void setNumKernels(unsigned int num) { numKernels = num; }
unsigned int getNumKernels() {
return numKernels;
}
unsigned int getNumKernels() { return numKernels; }
void setNumStreams(unsigned int num) {
numStreams = num;
}
unsigned int getNumStreams() {
return numStreams;
}
void setNumStreams(unsigned int num) { numStreams = num; }
unsigned int getNumStreams() { return numStreams; }
void open(int deviceID);
bool run(unsigned int testCase);
void printResults(void);
// array of funtion pointers
typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
typedef void (hipPerfMandelBrot::*funPtr)(uint* out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter,
hipStream_t* streams, int blocks, int threads_per_block,
int kernelCnt);
// Wrappers
void float_mad(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt);
void float_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
int kernelCnt);
void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt);
void float_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
int kernelCnt);
void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
void double_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
int kernelCnt);
void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
void double_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
uint maxIter, hipStream_t* streams, int blocks, int threads_per_block,
int kernelCnt);
hipStream_t streams[2];
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
void setData(void* ptr, unsigned int value);
void checkData(uint* ptr);
unsigned int numKernels;
unsigned int numStreams;
@@ -387,9 +375,9 @@ void hipPerfMandelBrot::open(int deviceId) {
HIP_CHECK(hipSetDevice(deviceId));
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
<< std::endl;
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
props.name, props.multiProcessorCount, deviceId);
numCUs = props.multiProcessorCount;
}
@@ -397,52 +385,52 @@ void hipPerfMandelBrot::open(int deviceId) {
void hipPerfMandelBrot::printResults() {
int numStreams = getNumStreams();
std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
<< numStreams << " streams (s)" << std::endl;
CONSOLE_PRINT("Measured perf for kernels in GFLOPS on %d streams (s)", numStreams);
std::map<std::string, std::vector<double>>:: iterator itr;
std::map<std::string, std::vector<double>>::iterator itr;
for (itr = results.begin(); itr != results.end(); itr++) {
std::cout << "\n" << std::setw(20) << itr->first << " ";
for (auto i : results[itr->first]) {
std::cout << std::setw(10) << i << " ";
}
}
CONSOLE_PRINT("\n%s ", itr->first.c_str());
for (auto i : results[itr->first]) {
CONSOLE_PRINT("%10f ", i);
}
}
results.clear();
std::cout << std::endl;
CONSOLE_PRINT("\n");
}
// Wrappers for the kernel launches
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt) {
void hipPerfMandelBrot::float_mad(uint* out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep,
maxIter);
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
void hipPerfMandelBrot::float_mandel_unroll(uint* out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter,
hipStream_t* streams, int blocks, int threads_per_block,
int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
void hipPerfMandelBrot::double_mad(uint* out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
void hipPerfMandelBrot::double_mandel_unroll(uint* out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter,
hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter);
}
bool hipPerfMandelBrot::run(unsigned int testCase) {
@@ -450,18 +438,18 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
coordIdx = testCase % numCoords;
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
// Maximum iteration count
maxIter = 32768;
uint ** hPtr = new uint *[numKernels];
uint ** dPtr = new uint *[numKernels];
uint** hPtr = new uint*[numKernels];
uint** dPtr = new uint*[numKernels];
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * width_ * sizeof(uint);
bufSize = width_ * width_ * sizeof(uint);
// Create streams for concurrency
for (uint i = 0; i < numStreams; i++) {
@@ -470,15 +458,15 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
// Allocate memory on the host and device
for (uint i = 0; i < numKernels; i++) {
HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]), bufSize, hipHostMallocDefault));
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))
}
// Prepare kernel launch parameters
int threads = (bufSize/sizeof(uint));
int threads_per_block = 64;
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
int threads = (bufSize / sizeof(uint));
int threads_per_block = 64;
int blocks = (threads / threads_per_block) + (threads % threads_per_block);
// Copy memory asynchronously and concurrently from host to device
for (uint i = 0; i < numKernels; i++) {
@@ -489,90 +477,88 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
HIP_CHECK(hipStreamSynchronize(0));
int kernelIdx;
if(testCase == 0 || testCase == 5 || testCase == 10) {
if (testCase == 0 || testCase == 5 || testCase == 10) {
kernelIdx = 0;
} else if(testCase == 1 || testCase == 6 || testCase == 11) {
} else if (testCase == 1 || testCase == 6 || testCase == 11) {
kernelIdx = 1;
} else if(testCase == 2 || testCase == 7 || testCase == 12) {
} else if (testCase == 2 || testCase == 7 || testCase == 12) {
kernelIdx = 2;
} else if(testCase == 3 || testCase == 8 || testCase == 13){
} else if (testCase == 3 || testCase == 8 || testCase == 13) {
kernelIdx = 3;
}
double totalTime = 0.0;
for (unsigned int k = 0; k < numLoops; k++) {
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
testCase == 5 || testCase == 6 || testCase == 7 ||
testCase == 10 || testCase == 11 || testCase == 12)) {
float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
if ((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 ||
testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) {
float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
float xPos = static_cast<float>(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = static_cast<float>(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIP_CHECK(hipStreamSynchronize(0));
// Synchronize all the concurrent streams to have completed execution
HIP_CHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
} else {
double xStep = coords[coordIdx].width / static_cast<double>(width_);
double yStep = -coords[coordIdx].width / static_cast<double>(width_);
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
} else {
double xStep = coords[coordIdx].width / static_cast<double>(width_);
double yStep = -coords[coordIdx].width / static_cast<double>(width_);
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIP_CHECK(hipStreamSynchronize(0));
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIP_CHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
}
// Copy data back from device to the host
for(uint i = 0; i < numKernels; i++) {
HIP_CHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
}
for(uint i = 0; i < numKernels; i++) {
checkData(hPtr[i]);
int j =0;
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
j++;
for (uint i = 0; i < numKernels; i++) {
HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
}
for (uint i = 0; i < numKernels; i++) {
checkData(hPtr[i]);
int j = 0;
while ((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
j++;
}
if(j==30) {
std::cout << "Incorrect iteration count detected. ";
}
if (j == 30) {
CONSOLE_PRINT("Incorrect iteration count detected. ");
}
}
// Compute GFLOPS. There are 7 FLOPs per iteration
double perf = (static_cast<double>(totalIters*numKernels) * 7 * static_cast<double>(1e-09)) /
(totalTime / (double)numLoops);
double perf = (static_cast<double>(totalIters * numKernels) * 7 * static_cast<double>(1e-09)) /
(totalTime / (double)numLoops);
std::vector<std::string> kernelName = {"float", "float_unroll",
"double", "double_unroll"};
std::vector<std::string> kernelName = {"float", "float_unroll", "double", "double_unroll"};
// Print results except for Warm-up kernel
if (testCase != 100) {
results[kernelName[testCase % 4]].push_back(perf);
}
results[kernelName[testCase % 4]].push_back(perf);
}
for(uint i = 0 ; i < numStreams; i++) {
for (uint i = 0; i < numStreams; i++) {
HIP_CHECK(hipStreamDestroy(streams[i]));
}
@@ -581,19 +567,19 @@ bool hipPerfMandelBrot::run(unsigned int testCase) {
HIP_CHECK(hipHostFree(hPtr[i]));
HIP_CHECK(hipFree(dPtr[i]));
}
delete [] hPtr;
delete [] dPtr;
delete[] hPtr;
delete[] dPtr;
return true;
}
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
void hipPerfMandelBrot::setData(void* ptr, unsigned int value) {
unsigned int* ptr2 = (unsigned int*)ptr;
for (unsigned int i = 0; i < width_ * width_; i++) {
ptr2[i] = value;
ptr2[i] = value;
}
}
void hipPerfMandelBrot::checkData(uint *ptr) {
void hipPerfMandelBrot::checkData(uint* ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_ * width_; i++) {
totalIters += ptr[i];
@@ -601,30 +587,30 @@ void hipPerfMandelBrot::checkData(uint *ptr) {
}
/**
* Test Description
* ------------------------
* - Verify the warm-up kernel default stream executes serially.
* - verify by running all kernels - sync.
* - verify by running all kernels - async.
* Test source
* ------------------------
* - perftests/compute/hipPerfMandelbrot.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
* Test Description
* ------------------------
* - Verify the warm-up kernel default stream executes serially.
* - verify by running all kernels - sync.
* - verify by running all kernels - async.
* Test source
* ------------------------
* - perftests/compute/hipPerfMandelbrot.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Perf_hipPerfMandelbrot") {
hipPerfMandelBrot mandelbrotCompute;
int deviceId = 0;
mandelbrotCompute.open(deviceId);
#if HT_AMD
#if HT_AMD
SECTION("warm-up kernel default stream executes serially") {
mandelbrotCompute.setNumStreams(1);
mandelbrotCompute.setNumKernels(1);
REQUIRE(true == mandelbrotCompute.run(100/*Random number*/));
REQUIRE(true == mandelbrotCompute.run(100 /*Random number*/));
}
#endif
#endif
SECTION("run all - sync") {
int i = 0;
do {
@@ -632,7 +618,7 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
mandelbrotCompute.setNumKernels(1);
REQUIRE(true == mandelbrotCompute.run(i));
i++;
}while(i < 12);
} while (i < 12);
mandelbrotCompute.printResults();
}
@@ -643,12 +629,12 @@ TEST_CASE("Perf_hipPerfMandelbrot") {
mandelbrotCompute.setNumKernels(2);
REQUIRE(true == mandelbrotCompute.run(i));
i++;
}while(i < 12);
} while (i < 12);
mandelbrotCompute.printResults();
}
}
/**
* End doxygen group perfComputeTest.
* @}
*/
* End doxygen group perfComputeTest.
* @}
*/
+68 -80
Просмотреть файл
@@ -18,12 +18,12 @@
*/
/**
* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
* @{
* @ingroup perfStreamTest
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
* Create an asynchronous stream.
*/
* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency
* @{
* @ingroup perfStreamTest
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
* Create an asynchronous stream.
*/
#include <hip_test_common.hh>
@@ -34,28 +34,28 @@ typedef struct {
} coordRec;
static coordRec coords[] = {
{0.0, 0.0, 0.00001}, // All black
{0.0, 0.0, 0.00001}, // All black
};
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
__global__ void mandelbrot(uint *out, uint width, float xPos,
float yPos, float xStep, float yStep, uint maxIter) {
__global__ void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, float yStep,
uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = static_cast<float>(xPos + xStep*i);
float y0 = static_cast<float>(yPos + yStep*j);
float x0 = static_cast<float>(xPos + xStep * i);
float y0 = static_cast<float>(yPos + yStep * j);
float x = x0;
float y = y0;
uint iter = 0;
float tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y, y, fma(x, x, x0));
y = fma(2.0f*tmp, y, y0);
y = fma(2.0f * tmp, y, y0);
}
out[tid] = iter;
};
@@ -65,20 +65,16 @@ class hipPerfDeviceConcurrency {
hipPerfDeviceConcurrency();
~hipPerfDeviceConcurrency();
void setNumGpus(unsigned int num) {
numDevices = num;
}
unsigned int getNumGpus() {
return numDevices;
}
void setNumGpus(unsigned int num) { numDevices = num; }
unsigned int getNumGpus() { return numDevices; }
void open(void);
void close(void);
bool run(unsigned int testCase, int numGpus);
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
void setData(void* ptr, unsigned int value);
void checkData(uint* ptr);
unsigned int numDevices;
unsigned int width_;
@@ -100,17 +96,16 @@ void hipPerfDeviceConcurrency::open(void) {
}
}
void hipPerfDeviceConcurrency::close() {
}
void hipPerfDeviceConcurrency::close() {}
bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
static int deviceId;
uint ** hPtr = new uint*[numGpus];
uint ** dPtr = new uint*[numGpus];
hipStream_t * streams = new hipStream_t[numGpus];
int *numCUs = new int[numGpus];
unsigned int *maxIter = new unsigned int[numGpus];
unsigned long long *expectedIters = new unsigned long long[numGpus];
uint** hPtr = new uint*[numGpus];
uint** dPtr = new uint*[numGpus];
hipStream_t* streams = new hipStream_t[numGpus];
int* numCUs = new int[numGpus];
unsigned int* maxIter = new unsigned int[numGpus];
unsigned long long* expectedIters = new unsigned long long[numGpus];
int threads, threads_per_block, blocks;
float xStep, yStep, xPos, yPos;
@@ -124,25 +119,21 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, i));
if (testCase != 0) {
std::cout << "info: running on bus " << "0x" << props.pciBusID
<< " " << props.name << " with " << props.multiProcessorCount
<< " CUs" << " and device ID: " << i << std::endl;
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
props.name, props.multiProcessorCount, i);
}
numCUs[i] = props.multiProcessorCount;
int clkFrequency = 0;
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
hipDeviceAttributeClockRate, i));
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
if (clkFrequency == 0) {
std::cout << "clkFrequency = 0, set it to 1000000\n";
CONSOLE_PRINT("clkFrequency = 0, set it to 1000000");
clkFrequency = 1000000;
}
clkFrequency =(unsigned int)clkFrequency/1000;
clkFrequency = (unsigned int)clkFrequency / 1000;
// Maximum iteration count
// maxIter = 8388608 * (engine_clock / 1000).serial execution
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000))
* numCUs[i]) / 128);
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
maxIter[i] = (maxIter[i] + 15) & ~15;
// Width is divisible by 4 because the mandelbrot
@@ -153,15 +144,14 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
HIP_CHECK(hipStreamCreate(&streams[i]));
// Allocate memory on the host and device
HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
bufSize, hipHostMallocDefault));
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIP_CHECK(hipMalloc(reinterpret_cast<uint **>(&dPtr[i]), bufSize))
HIP_CHECK(hipMalloc(reinterpret_cast<uint**>(&dPtr[i]), bufSize))
// Prepare kernel launch parameters
threads = (bufSize/sizeof(uint));
threads_per_block = 64;
blocks = (threads/threads_per_block) + (threads % threads_per_block);
threads = (bufSize / sizeof(uint));
threads_per_block = 64;
blocks = (threads / threads_per_block) + (threads % threads_per_block);
coordIdx = testCase % numCoords;
xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
@@ -180,10 +170,9 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
deviceId = i;
}
HIP_CHECK(hipSetDevice(deviceId));
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
streams[i], dPtr[i], width_, xPos, yPos, xStep,
yStep, maxIter[i]);
HIP_CHECK(hipSetDevice(deviceId));
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], dPtr[i],
width_, xPos, yPos, xStep, yStep, maxIter[i]);
}
for (int i = 0; i < numGpus; i++) {
HIP_CHECK(hipStreamSynchronize(0));
@@ -192,8 +181,8 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
for (int i = 0; i < numGpus; i++) {
if (testCase != 0) {
deviceId = i;
}
HIP_CHECK(hipSetDevice(deviceId));
@@ -201,11 +190,11 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
// Copy data back from device to the host
HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
checkData(hPtr[i]);
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
expectedIters[i] = width_ * width_ * (unsigned long long)maxIter[i];
if (testCase != 0) {
checkData(hPtr[i]);
if (totalIters != expectedIters[i]) {
std::cout << "Incorrect iteration count detected" << std::endl;
CONSOLE_PRINT("Incorrect iteration count detected");
}
}
@@ -216,31 +205,30 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
}
if (testCase != 0) {
std::cout << '\n' << "Measured time for kernel computation on " << numGpus
<< " device (s): " << all_kernel_time.count() << " (s) "
<< '\n' << std::endl;
CONSOLE_PRINT("\nMeasured time for kernel computation on %d device(s): %.6f (s)\n", numGpus,
all_kernel_time.count());
}
if (testCase == 0) {
deviceId++;
}
delete [] hPtr;
delete [] dPtr;
delete [] streams;
delete [] numCUs;
delete [] maxIter;
delete [] expectedIters;
delete[] hPtr;
delete[] dPtr;
delete[] streams;
delete[] numCUs;
delete[] maxIter;
delete[] expectedIters;
return true;
}
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
for (unsigned int i = 0; i < width_ * width_ ; i++) {
ptr2[i] = value;
void hipPerfDeviceConcurrency::setData(void* ptr, unsigned int value) {
unsigned int* ptr2 = (unsigned int*)ptr;
for (unsigned int i = 0; i < width_ * width_; i++) {
ptr2[i] = value;
}
}
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
void hipPerfDeviceConcurrency::checkData(uint* ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_ * width_; i++) {
totalIters += ptr[i];
@@ -248,16 +236,16 @@ void hipPerfDeviceConcurrency::checkData(uint *ptr) {
}
/**
* Test Description
* ------------------------
* - Verify the different levels of device concurrency.
* Test source
* ------------------------
* - perftests/stream/hipPerfDeviceConcurrency.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
* Test Description
* ------------------------
* - Verify the different levels of device concurrency.
* Test source
* ------------------------
* - perftests/stream/hipPerfDeviceConcurrency.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Perf_hipPerfDeviceConcurrency") {
hipPerfDeviceConcurrency deviceConcurrency;
@@ -279,6 +267,6 @@ TEST_CASE("Perf_hipPerfDeviceConcurrency") {
}
/**
* End doxygen group perfStreamTest.
* @}
*/
* End doxygen group perfStreamTest.
* @}
*/
+110 -126
Просмотреть файл
@@ -18,12 +18,12 @@
*/
/**
* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
* @{
* @ingroup perfComputeTest
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
* Create an asynchronous stream.
*/
* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency
* @{
* @ingroup perfComputeTest
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
* Create an asynchronous stream.
*/
#include <hip_test_common.hh>
#include <hip/hip_vector_types.h>
@@ -55,23 +55,23 @@ static coordRec coords[] = {
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
__global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter) {
__global__ static void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % (width/4);
int j = tid / (width/4);
int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
int i = tid % (width / 4);
int j = tid / (width / 4);
int4 veci = make_int4(4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3);
int4 vecj = make_int4(j, j, j, j);
float4 x0;
x0.x = static_cast<float>(xPos + xStep*veci.x);
x0.y = static_cast<float>(xPos + xStep*veci.y);
x0.z = static_cast<float>(xPos + xStep*veci.z);
x0.w = static_cast<float>(xPos + xStep*veci.w);
x0.x = static_cast<float>(xPos + xStep * veci.x);
x0.y = static_cast<float>(xPos + xStep * veci.y);
x0.z = static_cast<float>(xPos + xStep * veci.z);
x0.w = static_cast<float>(xPos + xStep * veci.w);
float4 y0;
y0.x = static_cast<float>(yPos + yStep*vecj.x);
y0.y = static_cast<float>(yPos + yStep*vecj.y);
y0.z = static_cast<float>(yPos + yStep*vecj.z);
y0.w = static_cast<float>(yPos + yStep*vecj.w);
y0.x = static_cast<float>(yPos + yStep * vecj.x);
y0.y = static_cast<float>(yPos + yStep * vecj.y);
y0.z = static_cast<float>(yPos + yStep * vecj.z);
y0.w = static_cast<float>(yPos + yStep * vecj.w);
float4 x = x0;
float4 y = y0;
uint iter = 0;
@@ -80,53 +80,52 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
int4 ccount = make_int4(0, 0, 0, 0);
float4 savx = x;
float4 savy = y;
stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
iter+=16) {
stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); iter += 16) {
x = savx;
y = savy;
// Two iterations
tmp = x*x + x0 - y*y;
tmp = x * x + x0 - y * y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
x = tmp * tmp + x0 - y * y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
tmp = x * x + x0 - y * y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
x = tmp * tmp + x0 - y * y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
tmp = x * x + x0 - y * y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
x = tmp * tmp + x0 - y * y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
tmp = x * x + x0 - y * y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
x = tmp * tmp + x0 - y * y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
tmp = x * x + x0 - y * y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
x = tmp * tmp + x0 - y * y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
tmp = x * x + x0 - y * y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
x = tmp * tmp + x0 - y * y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
tmp = x * x + x0 - y * y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
x = tmp * tmp + x0 - y * y;
y = 2.0f * tmp * y + y0;
stay.x = (x.x*x.x+y.x*y.x) <= static_cast<float>(4.0f);
stay.y = (x.y*x.y+y.y*y.y) <= static_cast<float>(4.0f);
stay.z = (x.z*x.z+y.z*y.z) <= static_cast<float>(4.0f);
stay.w = (x.w*x.w+y.w*y.w) <= static_cast<float>(4.0f);
stay.x = (x.x * x.x + y.x * y.x) <= static_cast<float>(4.0f);
stay.y = (x.y * x.y + y.y * y.y) <= static_cast<float>(4.0f);
stay.z = (x.z * x.z + y.z * y.z) <= static_cast<float>(4.0f);
stay.w = (x.w * x.w + y.w * y.w) <= static_cast<float>(4.0f);
savx.x = static_cast<bool>(stay.x ? x.x : savx.x);
savx.y = static_cast<bool>(stay.y ? x.y : savx.y);
savx.z = static_cast<bool>(stay.z ? x.z : savx.z);
@@ -135,10 +134,10 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
savy.y = static_cast<bool>(stay.y ? y.y : savy.y);
savy.z = static_cast<bool>(stay.z ? y.z : savy.z);
savy.w = static_cast<bool>(stay.w ? y.w : savy.w);
ccount.x -= stay.x*16;
ccount.y -= stay.y*16;
ccount.z -= stay.z*16;
ccount.w -= stay.w*16;
ccount.x -= stay.x * 16;
ccount.y -= stay.y * 16;
ccount.z -= stay.z * 16;
ccount.w -= stay.w * 16;
}
// Handle remainder
if (!(stay.x & stay.y & stay.z & stay.w)) {
@@ -146,13 +145,13 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
do {
x = savx;
y = savy;
stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter);
stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter);
stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter);
stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter);
stay.x = ((x.x * x.x + y.x * y.x) <= 4.0f) && (ccount.x < maxIter);
stay.y = ((x.y * x.y + y.y * y.y) <= 4.0f) && (ccount.y < maxIter);
stay.z = ((x.z * x.z + y.z * y.z) <= 4.0f) && (ccount.z < maxIter);
stay.w = ((x.w * x.w + y.w * y.w) <= 4.0f) && (ccount.w < maxIter);
tmp = x;
x = x*x + x0 - y*y;
y = 2.0f*tmp*y + y0;
x = x * x + x0 - y * y;
y = 2.0f * tmp * y + y0;
ccount.x += stay.x;
ccount.y += stay.y;
ccount.z += stay.z;
@@ -168,7 +167,7 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos,
savy.w = (stay.w ? y.w : savy.w);
} while ((stay.x | stay.y | stay.z | stay.w) && iter);
}
uint4 *vecOut = reinterpret_cast<uint4 *>(out);
uint4* vecOut = reinterpret_cast<uint4*>(out);
vecOut[tid].x = (uint)(ccount.x);
vecOut[tid].y = (uint)(ccount.y);
vecOut[tid].z = (uint)(ccount.z);
@@ -180,27 +179,19 @@ class hipPerfStreamConcurrency {
hipPerfStreamConcurrency();
~hipPerfStreamConcurrency();
void setNumKernels(unsigned int num) {
numKernels = num;
}
void setNumStreams(unsigned int num) {
numStreams = num;
}
unsigned int getNumStreams() {
return numStreams;
}
void setNumKernels(unsigned int num) { numKernels = num; }
void setNumStreams(unsigned int num) { numStreams = num; }
unsigned int getNumStreams() { return numStreams; }
unsigned int getNumKernels() {
return numKernels;
}
unsigned int getNumKernels() { return numKernels; }
bool open(int deviceID);
bool run(unsigned int testCase, unsigned int deviceId);
void close(void);
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
void setData(void* ptr, unsigned int value);
void checkData(uint* ptr);
unsigned int numKernels;
unsigned int numStreams;
@@ -227,38 +218,34 @@ bool hipPerfStreamConcurrency::open(int deviceId) {
HIP_CHECK(hipSetDevice(deviceId));
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
std::cout << "info: running on bus " << "0x" << props.pciBusID
<< " " << props.name << " with " << props.multiProcessorCount << " CUs"
<< " and device id: " << deviceId << std::endl;
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID,
props.name, props.multiProcessorCount, deviceId);
numCUs = props.multiProcessorCount;
return true;
}
void hipPerfStreamConcurrency::close() {
}
void hipPerfStreamConcurrency::close() {}
bool hipPerfStreamConcurrency::run(unsigned int testCase,
unsigned int deviceId) {
bool hipPerfStreamConcurrency::run(unsigned int testCase, unsigned int deviceId) {
int clkFrequency = 0;
unsigned int numStreams = getNumStreams();
unsigned int numKernels = getNumKernels();
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency,
hipDeviceAttributeClockRate, deviceId));
HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
if (clkFrequency == 0) {
std::cout << "clkFrequency = 0, set it to 1000000\n";
CONSOLE_PRINT("clkFrequency = 0, set it to 1000000\n");
clkFrequency = 1000000;
}
clkFrequency =(unsigned int)clkFrequency/1000;
clkFrequency = (unsigned int)clkFrequency / 1000;
// Maximum iteration count
// maxIter = 8388608 * (engine_clock / 1000).serial execution
maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000))
* numCUs) / 128);
maxIter = (unsigned int)(((8388608 * (static_cast<float>(clkFrequency) / 1000)) * numCUs) / 128);
maxIter = (maxIter + 15) & ~15;
hipStream_t *streams = new hipStream_t[numStreams];
uint ** hPtr = new uint*[numKernels];
uint ** dPtr = new uint*[numKernels];
hipStream_t* streams = new hipStream_t[numStreams];
uint** hPtr = new uint*[numKernels];
uint** dPtr = new uint*[numKernels];
// Width is divisible by 4 because the mandelbrot kernel
// processes 4 pixels at once.
@@ -271,16 +258,15 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
// Allocate memory on the host and device
for (uint i = 0; i < numKernels; i++) {
HIP_CHECK(hipHostMalloc(reinterpret_cast<void **>(&hPtr[i]),
bufSize, hipHostMallocDefault));
HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&hPtr[i]), bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIP_CHECK(hipMalloc(reinterpret_cast<void **>(&dPtr[i]), bufSize))
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dPtr[i]), bufSize))
}
// Prepare kernel launch parameters
int threads = (bufSize/sizeof(uint));
int threads_per_block = 64;
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
int threads = (bufSize / sizeof(uint));
int threads_per_block = 64;
int blocks = (threads / threads_per_block) + (threads % threads_per_block);
coordIdx = testCase % numCoords;
float xStep = static_cast<float>(coords[coordIdx].width / static_cast<double>(width_));
float yStep = static_cast<float>(-coords[coordIdx].width / static_cast<double>(width_));
@@ -289,8 +275,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
// Copy memory asynchronously and concurrently from host to device
for (uint i = 0; i < numKernels; i++) {
HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]),
hPtr[i], bufSize, streams[i % numStreams]));
HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize,
streams[i % numStreams]));
}
// Synchronize to make sure all the copies are completed
@@ -305,9 +291,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block),
0, streams[i%numStreams], dPtr[i], width_, xPos, yPos, xStep,
yStep, maxIter);
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0,
streams[i % numStreams], dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
}
// Synchronize all the concurrent streans to have completed execution
@@ -320,17 +305,16 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
// Copy data back from device to the host
for (uint i = 0; i < numKernels; i++) {
HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i],
reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
streams[i % numStreams]));
HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize,
streams[i % numStreams]));
}
if (testCase != 0) {
std::cout <<"Measured time for " << numKernels <<" kernels (s) on "
<< numStreams <<" stream (s): " << all_kernel_time.count() << std::endl;
CONSOLE_PRINT("Measured time for %d kernels (s) on %d stream(s): %e\n", numKernels, numStreams,
all_kernel_time.count());
}
for (uint i = 0 ; i < numStreams; i++) {
for (uint i = 0; i < numStreams; i++) {
HIP_CHECK(hipStreamDestroy(streams[i]));
}
@@ -340,20 +324,20 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase,
HIP_CHECK(hipFree(dPtr[i]));
}
delete [] streams;
delete [] hPtr;
delete [] dPtr;
delete[] streams;
delete[] hPtr;
delete[] dPtr;
return true;
}
void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
for (unsigned int i = 0; i < width_ ; i++) {
ptr2[i] = value;
void hipPerfStreamConcurrency::setData(void* ptr, unsigned int value) {
unsigned int* ptr2 = (unsigned int*)ptr;
for (unsigned int i = 0; i < width_; i++) {
ptr2[i] = value;
}
}
void hipPerfStreamConcurrency::checkData(uint *ptr) {
void hipPerfStreamConcurrency::checkData(uint* ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_; i++) {
totalIters += ptr[i];
@@ -361,16 +345,16 @@ void hipPerfStreamConcurrency::checkData(uint *ptr) {
}
/**
* Test Description
* ------------------------
* - Verify the different levels of stream concurrency.
* Test source
* ------------------------
* - perftests/stream/hipPerfStreamConcurrency.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
* Test Description
* ------------------------
* - Verify the different levels of stream concurrency.
* Test source
* ------------------------
* - perftests/stream/hipPerfStreamConcurrency.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Perf_hipPerfStreamConcurrency") {
hipPerfStreamConcurrency streamConcurrency;
@@ -386,10 +370,10 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
break;
case 1:
// default stream executes serially
streamConcurrency.setNumStreams(1);
streamConcurrency.setNumKernels(1);
break;
// default stream executes serially
streamConcurrency.setNumStreams(1);
streamConcurrency.setNumKernels(1);
break;
case 2:
// 2-way concurrency
@@ -419,6 +403,6 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") {
}
/**
* End doxygen group perfComputeTest.
* @}
*/
* End doxygen group perfComputeTest.
* @}
*/
+46 -46
Просмотреть файл
@@ -18,19 +18,17 @@
*/
/**
* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
* @{
* @ingroup perfStreamTest
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
* Create an asynchronous stream.
*/
* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy
* @{
* @ingroup perfStreamTest
* `hipError_t hipStreamCreate(hipStream_t* stream)` -
* Create an asynchronous stream.
*/
#include <hip_test_kernels.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
using namespace std;
#define BufSize 0x1000
#define Iterations 0x100
#define TotalStreams 4
@@ -39,17 +37,20 @@ using namespace std;
class hipPerfStreamCreateCopyDestroy {
private:
unsigned int numBuffers_;
unsigned int numStreams_;
const size_t totalStreams_[TotalStreams];
const size_t totalBuffers_[TotalBufs];
unsigned int numBuffers_;
unsigned int numStreams_;
const size_t totalStreams_[TotalStreams];
const size_t totalBuffers_[TotalBufs];
public:
hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
totalStreams_{1, 2, 4, 8},
totalBuffers_{1, 100, 1000, 5000} {};
~hipPerfStreamCreateCopyDestroy() {};
bool open(int deviceID);
bool run(unsigned int testNumber);
hipPerfStreamCreateCopyDestroy()
: numBuffers_(0),
numStreams_(0),
totalStreams_{1, 2, 4, 8},
totalBuffers_{1, 100, 1000, 5000} {};
~hipPerfStreamCreateCopyDestroy(){};
bool open(int deviceID);
bool run(unsigned int testNumber);
};
bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
@@ -61,20 +62,20 @@ bool hipPerfStreamCreateCopyDestroy::open(int deviceId) {
HIP_CHECK(hipSetDevice(deviceId));
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, deviceId));
std::cout << "info: running on bus " << "0x" << props.pciBusID
<< " " << props.name << " with " << props.multiProcessorCount << " CUs"
<< " and device id: " << deviceId << std::endl;
CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID,
props.name, props.multiProcessorCount, deviceId);
return true;
}
bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
numStreams_ = totalStreams_[testNumber % TotalStreams];
size_t iter = Iterations / (numStreams_ * (static_cast<size_t>(1)
<< (testNumber / TotalBufs + 1)));
hipStream_t *streams = new hipStream_t[numStreams_];
size_t iter =
Iterations / (numStreams_ * (static_cast<size_t>(1) << (testNumber / TotalBufs + 1)));
hipStream_t* streams = new hipStream_t[numStreams_];
numBuffers_ = totalBuffers_[testNumber / TotalBufs];
float ** dSrc = new float*[numBuffers_];
float** dSrc = new float*[numBuffers_];
size_t nBytes = BufSize * sizeof(float);
for (size_t b = 0; b < numBuffers_; ++b) {
@@ -97,8 +98,7 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
for (size_t s = 0; s < numStreams_; ++s) {
for (size_t b = 0; b < numBuffers_; ++b) {
HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes,
hipMemcpyHostToDevice, streams[s]));
HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
}
}
@@ -112,31 +112,31 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));
cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
<< setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
<< iter << " iterations " << time << " (ms) " << endl;
CONSOLE_PRINT(
"Create+Copy+Destroy time for %u streams and %u buffers and %zu iterations %.6f (ms)\n",
numStreams_, numBuffers_, iter, time);
delete [] hSrc;
delete[] hSrc;
for (size_t b = 0; b < numBuffers_; ++b) {
HIP_CHECK(hipFree(dSrc[b]));
}
delete [] streams;
delete [] dSrc;
delete[] streams;
delete[] dSrc;
return true;
}
/**
* Test Description
* ------------------------
* - Verify the Create+Copy+Destroy time for different stream.
* Test source
* ------------------------
* - perftests/stream/hipPerfDeviceConcurrency.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
* Test Description
* ------------------------
* - Verify the Create+Copy+Destroy time for different stream.
* Test source
* ------------------------
* - perftests/stream/hipPerfDeviceConcurrency.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
hipPerfStreamCreateCopyDestroy streamCCD;
@@ -149,6 +149,6 @@ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") {
}
/**
* End doxygen group perfStreamTest.
* @}
*/
* End doxygen group perfStreamTest.
* @}
*/
-382
Просмотреть файл
@@ -1,382 +0,0 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
#include <vector>
#define DOT_DIM 256
using namespace std;
template <unsigned int BLOCKSIZE>
__launch_bounds__(BLOCKSIZE)
__global__ void vectors_not_equal(int n,
const double* __restrict__ x,
const double* __restrict__ y,
double* __restrict__ workspace) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
double sum = 0.0;
for(int idx = gid; idx < n; idx += hipGridDim_x * hipBlockDim_x) {
sum = fma(y[idx], x[idx], sum);
}
__shared__ double sdata[BLOCKSIZE];
sdata[threadIdx.x] = sum;
__syncthreads();
if(threadIdx.x < 128) {
sdata[threadIdx.x] += sdata[threadIdx.x + 128];
}
__syncthreads();
if(threadIdx.x < 64){
sdata[threadIdx.x] += sdata[threadIdx.x + 64];
}
__syncthreads();
if(threadIdx.x < 32){
sdata[threadIdx.x] += sdata[threadIdx.x + 32];
}
__syncthreads();
if(threadIdx.x < 16) {
sdata[threadIdx.x] += sdata[threadIdx.x + 16];
}
__syncthreads();
if(threadIdx.x < 8) {
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
}
__syncthreads();
if(threadIdx.x < 4) {
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
}
__syncthreads();
if(threadIdx.x < 2) {
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
}
__syncthreads();
if(threadIdx.x < 1) {
sdata[threadIdx.x] += sdata[threadIdx.x + 1];
}
if(threadIdx.x == 0) {
workspace[blockIdx.x] = sdata[0];
}
}
template <unsigned int BLOCKSIZE>
__launch_bounds__(BLOCKSIZE)
__global__ void vectors_equal(int n, const double* __restrict__ x,
double* __restrict__ workspace) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
double sum = 0.0;
for(int idx = gid; idx < n; idx += hipGridDim_x * blockDim.x) {
sum = fma(x[idx], x[idx], sum);
}
__shared__ double sdata[BLOCKSIZE];
sdata[threadIdx.x] = sum;
__syncthreads();
if(threadIdx.x < 128) {
sdata[threadIdx.x] += sdata[threadIdx.x + 128];
}
__syncthreads();
if(threadIdx.x < 64) {
sdata[threadIdx.x] += sdata[threadIdx.x + 64];
}
__syncthreads();
if(threadIdx.x < 32) {
sdata[threadIdx.x] += sdata[threadIdx.x + 32];
}
__syncthreads();
if(threadIdx.x < 16) {
sdata[threadIdx.x] += sdata[threadIdx.x + 16];
}
__syncthreads();
if(threadIdx.x < 8) {
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
}
__syncthreads();
if(threadIdx.x < 4) {
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
}
__syncthreads();
if(threadIdx.x < 2) {
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
}
__syncthreads();
if(threadIdx.x < 1) {
sdata[threadIdx.x] += sdata[threadIdx.x + 1];
}
if(threadIdx.x == 0) {
workspace[blockIdx.x] = sdata[0];
}
}
template <unsigned int BLOCKSIZE>
__launch_bounds__(BLOCKSIZE)
__global__ void dot_reduction(double* __restrict__ workspace) {
__shared__ double sdata[BLOCKSIZE];
sdata[threadIdx.x] = workspace[threadIdx.x];
__syncthreads();
if(threadIdx.x < 128) {
sdata[threadIdx.x] += sdata[threadIdx.x + 128];
}
__syncthreads();
if(threadIdx.x < 64) {
sdata[threadIdx.x] += sdata[threadIdx.x + 64];
}
__syncthreads();
if(threadIdx.x < 32) {
sdata[threadIdx.x] += sdata[threadIdx.x + 32];
}
__syncthreads();
if(threadIdx.x < 16) {
sdata[threadIdx.x] += sdata[threadIdx.x + 16];
}
__syncthreads();
if(threadIdx.x < 8) {
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
}
__syncthreads();
if(threadIdx.x < 4) {
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
} __syncthreads();
if(threadIdx.x < 2) {
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
}
__syncthreads();
if(threadIdx.x < 1) {
sdata[threadIdx.x] += sdata[threadIdx.x + 1];
}
if(threadIdx.x == 0) {
workspace[0] = sdata[0];
}
}
void computeDotProduct(int n, const double* x, const double* y, double& result,
double* workspace)
{
dim3 blocks(DOT_DIM);
dim3 threadsPerBlock(DOT_DIM);
if(x != y) {
hipLaunchKernelGGL(vectors_not_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, y,
workspace);
}
else {
hipLaunchKernelGGL(vectors_equal<DOT_DIM>, blocks, threadsPerBlock, 0, 0, n, x, workspace);
}
// Part 2 of dot product computation
hipLaunchKernelGGL(dot_reduction<DOT_DIM>, dim3(1), threadsPerBlock, 0, 0, workspace);
// Copy the final dot product result back from the device
HIPCHECK(hipMemcpy(&result, workspace, sizeof(double), hipMemcpyDeviceToHost));
return;
}
int main(int argc, char* argv[]) {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
if (nGpu < 1) {
failed("No GPU!");
}
hipDeviceProp_t props = {0};
props = {0};
HIPCHECK(hipSetDevice(p_gpuDevice));
HIPCHECK(hipGetDeviceProperties(&props, p_gpuDevice));
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << p_gpuDevice
<< std::endl;
int nx, ny, nz;
for (unsigned int testCase = 0; testCase < 3; testCase++) {
vector<int> vectorSize = {200, 300, 50};
switch(testCase) {
case 0:
nx = vectorSize[0];
ny = vectorSize[0];
nz = vectorSize[0];
break;
case 1:
nx = vectorSize[1];
ny = vectorSize[1];
nz = vectorSize[1];
break;
case 2:
nx = vectorSize[0];
ny = vectorSize[1];
nz = vectorSize[2];
break;
default:
break;
}
int trials = 200;
int size = nx * ny * nz;
vector<double> hx(size);
vector<double> hy(size);
double hresult_xy = 0.0;
double hresult_xx = 0.0;
srand(time(NULL));
for(int i = 0; i < size; ++i) {
hx[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
hy[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0;
hresult_xy += hx[i] * hy[i];
hresult_xx += hx[i] * hx[i];
}
double* dx;
double* dy;
double* workspace;
double dresult;
HIPCHECK(hipMalloc((void**)&dx, sizeof(double) * size));
HIPCHECK(hipMalloc((void**)&dy, sizeof(double) * size));
HIPCHECK(hipMalloc((void**)&workspace, sizeof(double) * DOT_DIM));
HIPCHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice));
// Warm up
computeDotProduct(size, dx, dy, dresult, workspace);
computeDotProduct(size, dx, dy, dresult, workspace);
computeDotProduct(size, dx, dy, dresult, workspace);
// Timed run for <x,y>
HIPCHECK(hipDeviceSynchronize());
auto all_start = std::chrono::steady_clock::now();
for(int i = 0; i < trials; ++i) {
computeDotProduct(size, dx, dy, dresult, workspace);
}
float time = 0;
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
time = all_kernel_time.count();
time /= trials;
double bw = sizeof(double) * size * 2.0 / 1e9;
double gf = 2.0 * size / 1e9;
cout << "\nVector Size: " << size << "\n[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
<< gf/(time / 1e3) << " GFlop/s" << endl;
// Verify the device kernel results comparing it with the host results
if(std::abs(dresult - hresult_xy) > std::max(dresult * 1e-10, 1e-8)) {
cerr << " Device results inconsistent with host results. "
<< " Host result: " << hresult_xy
<< " Device result: " << dresult;
}
// Warm up
computeDotProduct(size, dx, dx, dresult, workspace);
computeDotProduct(size, dx, dx, dresult, workspace);
computeDotProduct(size, dx, dx, dresult, workspace);
// Timed run for <x,x>
HIPCHECK(hipDeviceSynchronize());
all_start = std::chrono::steady_clock::now();
for(int i = 0; i < trials; ++i) {
computeDotProduct(size, dx, dx, dresult, workspace);
}
all_end = std::chrono::steady_clock::now();
all_kernel_time = all_end - all_start;
time = all_kernel_time.count();
time /= trials;
bw = sizeof(double) * size / 1e9;
cout << "[ddot] <x,y> " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;"
<< gf/(time / 1e3) << " GFlop/s" << endl;
// Verify the device kernel results comparing it with the host results
if(abs(dresult - hresult_xx) > max(dresult * 1e-10, 1e-8)) {
cerr << " Device results inconsistent with host results"
<< " Host result: " << hresult_xy
<< " Device result: " << dresult;
}
HIPCHECK(hipFree(dx));
HIPCHECK(hipFree(dy));
HIPCHECK(hipFree(workspace));
}
passed();
return 0;
}
-743
Просмотреть файл
@@ -1,743 +0,0 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
#include <hip/hip_vector_types.h>
#include <hip/math_functions.h>
#include <vector>
#include <string>
#include <map>
typedef struct {
double x;
double y;
double width;
} coordRec;
coordRec coords[] = {
{0.0, 0.0, 4.0}, // Whole set
{0.0, 0.0, 0.00001}, // All black
{-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail
};
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
template <typename T>
__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = (float)(xPos + xStep*i);
float y0 = (float)(yPos + yStep*j);
float x = x0;
float y = y0;
uint iter = 0;
float tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y,y,fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
template <typename T>
__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = (float)(xPos + xStep*(float)i);
float y0 = (float)(yPos + yStep*(float)j);
float x = x0;
float y = y0;
#define FAST
uint iter = 0;
float tmp;
int stay;
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
float savx = x;
float savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16) {
#else
for (iter = 0; stay && (iter < maxIter); iter+=16) {
#endif
x = savx;
y = savy;
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
stay = (x*x+y*y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
#ifdef FAST
if (!stay)
break;
#endif
}
// Handle remainder
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
} while (stay && iter);
}
out[tid] = (uint)ccount;
};
template <typename T>
__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep,
uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
double x0 = (double)(xPos + xStep*i);
double y0 = (double)(yPos + yStep*j);
double x = x0;
double y = y0;
uint iter = 0;
double tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y,y,fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
template <typename T>
__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos,
T yPos, T xStep, T yStep, uint maxIter) {
#pragma FP_CONTRACT ON
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
double x0 = (double)(xPos + xStep*(double)i);
double y0 = (double)(yPos + yStep*(double)j);
double x = x0;
double y = y0;
#define FAST
uint iter = 0;
double tmp;
int stay;
int ccount = 0;
stay = (x*x+y*y) <= 4.0;
double savx = x;
double savy = y;
#ifdef FAST
for (iter = 0; (iter < maxIter); iter+=16)
#else
for (iter = 0; stay && (iter < maxIter); iter+=16)
#endif
{
x = savx;
y = savy;
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
// Two iterations
tmp = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*x,y,y0);
x = fma(-y,y, fma(tmp,tmp,x0));
y = fma(2.0f*tmp,y,y0);
stay = (x*x+y*y) <= 4.0;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
ccount += stay*16;
#ifdef FAST
if (!stay)
break;
#endif
}
// Handle remainder
if (!stay) {
iter = 16;
do {
x = savx;
y = savy;
stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter);
tmp = x;
x = fma(-y,y, fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
ccount += stay;
iter--;
savx = (stay ? x : savx);
savy = (stay ? y : savy);
}
while (stay && iter);
}
out[tid] = (uint)ccount;
};
static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15;
// Expected results for each kernel run at each coord
unsigned long long expectedIters[] = {
203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull,
120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull,
203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull,
120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull,
2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull};
class hipPerfMandelBrot {
public:
hipPerfMandelBrot();
~hipPerfMandelBrot();
void setNumKernels(unsigned int num) {
numKernels = num;
}
unsigned int getNumKernels() {
return numKernels;
}
void setNumStreams(unsigned int num) {
numStreams = num;
}
unsigned int getNumStreams() {
return numStreams;
}
void open(int deviceID);
void run(unsigned int testCase, unsigned int deviceId);
void printResults(void);
// array of funtion pointers
typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
// Wrappers
void float_mad(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt);
void float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt);
void double_mad(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams, int blocks,
int threads_per_block, int kernelCnt);
hipStream_t streams[2];
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
unsigned int numKernels;
unsigned int numStreams;
std::map<std::string, std::vector<double>> results;
unsigned int width_;
unsigned int bufSize;
unsigned int maxIter;
unsigned int coordIdx;
volatile unsigned long long totalIters = 0;
int numCUs;
static const unsigned int numLoops = 10;
};
hipPerfMandelBrot::hipPerfMandelBrot() {}
hipPerfMandelBrot::~hipPerfMandelBrot() {}
void hipPerfMandelBrot::open(int deviceId) {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
if (nGpu < 1) {
failed("No GPU!");
}
HIPCHECK(hipSetDevice(deviceId));
hipDeviceProp_t props = {0};
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId
<< std::endl;
numCUs = props.multiProcessorCount;
}
void hipPerfMandelBrot::printResults() {
int numkernels = getNumKernels();
int numStreams = getNumStreams();
std::cout << "\n" <<"Measured perf for kernels in GFLOPS on "
<< numStreams << " streams (s)" << std::endl;
std::map<std::string, std::vector<double>>:: iterator itr;
for (itr = results.begin(); itr != results.end(); itr++) {
std::cout << "\n" << std::setw(20) << itr->first << " ";
for(auto i : results[itr->first]) {
std::cout << std::setw(10) << i << " ";
}
}
results.clear();
std::cout << std::endl;
}
// Wrappers for the kernel launches
void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter, hipStream_t* streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mad_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep,
maxIter);
}
void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mandel_unroll_kernel<float>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(double_mad_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter, hipStream_t * streams,
int blocks, int threads_per_block, int kernelCnt) {
int streamCnt = getNumStreams();
hipLaunchKernelGGL(float_mandel_unroll_kernel<double>, dim3(blocks), dim3(threads_per_block), 0,
streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter);
}
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
unsigned int numStreams = getNumStreams();
coordIdx = testCase % numCoords;
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
// Maximum iteration count
maxIter = 32768;
uint * hPtr[numKernels];
uint * dPtr[numKernels];
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * width_ * sizeof(uint);
// Create streams for concurrency
for (uint i = 0; i < numStreams; i++) {
HIPCHECK(hipStreamCreate(&streams[i]));
}
// Allocate memory on the host and device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
}
// Prepare kernel launch parameters
int threads = (bufSize/sizeof(uint));
int threads_per_block = 64;
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Copy memory asynchronously and concurrently from host to device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
}
// Synchronize to make sure all the copies are completed
HIPCHECK(hipStreamSynchronize(0));
int kernelIdx;
if(testCase == 0 || testCase == 5 || testCase == 10) {
kernelIdx = 0;
}
else if(testCase == 1 || testCase == 6 || testCase == 11) {
kernelIdx = 1;
}
else if(testCase == 2 || testCase == 7 || testCase == 12) {
kernelIdx = 2;
}
else if(testCase == 3 || testCase == 8 || testCase == 13){
kernelIdx = 3;
}
double totalTime = 0.0;
for (unsigned int k = 0; k < numLoops; k++) {
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
testCase == 5 || testCase == 6 || testCase == 7 ||
testCase == 10 || testCase == 11 || testCase == 12)) {
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIPCHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
else {
double xStep = coords[coordIdx].width / (double)width_;
double yStep = -coords[coordIdx].width / (double)width_;
double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width;
double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width;
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
(this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks,
threads_per_block, i);
}
// Synchronize all the concurrent streams to have completed execution
HIPCHECK(hipStreamSynchronize(0));
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
totalTime += all_kernel_time.count();
}
}
// Copy data back from device to the host
for(uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost));
}
for(uint i = 0; i < numKernels; i++) {
checkData(hPtr[i]);
int j =0;
while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) {
j++;
}
if(j==30) {
std::cout << "Incorrect iteration count detected. ";
}
}
// Compute GFLOPS. There are 7 FLOPs per iteration
double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) /
(totalTime / (double)numLoops);
std::vector<std::string> kernelName = {"float", "float_unroll",
"double", "double_unroll"};
// Print results except for Warm-up kernel
if(testCase!=100) {
results[kernelName[testCase % 4]].push_back(perf);
}
for(uint i = 0 ; i < numStreams; i++) {
HIPCHECK(hipStreamDestroy(streams[i]));
}
// Free host and device memory
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipHostFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
}
void hipPerfMandelBrot::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
for (unsigned int i = 0; i < width_ * width_; i++) {
ptr2[i] = value;
}
}
void hipPerfMandelBrot::checkData(uint *ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_ * width_; i++) {
totalIters += ptr[i];
}
}
int main(int argc, char* argv[]) {
hipPerfMandelBrot mandelbrotCompute;
int deviceId = 0;
mandelbrotCompute.open(deviceId);
for (unsigned int testCase = 0; testCase < 3; testCase++) {
switch (testCase) {
case 0: {
// Warmup-kernel - default stream executes serially
mandelbrotCompute.setNumStreams(1);
mandelbrotCompute.setNumKernels(1);
mandelbrotCompute.run(100/*Random number*/, deviceId);
break;
}
case 1: {
// run all - sync
int i = 0;
do {
mandelbrotCompute.setNumStreams(1);
mandelbrotCompute.setNumKernels(1);
mandelbrotCompute.run(i, deviceId);
i++;
}while(i < 12);
mandelbrotCompute.printResults();
break;
}
case 2: {
// run all - async
int i = 0;
do {
mandelbrotCompute.setNumStreams(2);
mandelbrotCompute.setNumKernels(2);
mandelbrotCompute.run(i, deviceId);
i++;
}while(i < 12);
mandelbrotCompute.printResults();
break;
}
default: {
break;
}
}
}
passed();
}
-284
Просмотреть файл
@@ -1,284 +0,0 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
typedef struct {
double x;
double y;
double width;
} coordRec;
static coordRec coords[] = {
{0.0, 0.0, 0.00001}, // All black
};
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep,
float yStep, uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % width;
int j = tid / width;
float x0 = (float)(xPos + xStep*i);
float y0 = (float)(yPos + yStep*j);
float x = x0;
float y = y0;
uint iter = 0;
float tmp;
for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) {
tmp = x;
x = fma(-y,y,fma(x,x,x0));
y = fma(2.0f*tmp,y,y0);
}
out[tid] = iter;
};
class hipPerfDeviceConcurrency {
public:
hipPerfDeviceConcurrency();
~hipPerfDeviceConcurrency();
void setNumGpus(unsigned int num) {
numDevices = num;
}
unsigned int getNumGpus() {
return numDevices;
}
void open(void);
void close(void);
void run(unsigned int testCase, int numGpus);
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
unsigned int numDevices;
unsigned int width_;
unsigned int bufSize;
unsigned int coordIdx;
unsigned long long totalIters = 0;
};
hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {}
hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {}
void hipPerfDeviceConcurrency::open(void) {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
setNumGpus(nGpu);
if (nGpu < 1) {
failed("No GPU!");
}
}
void hipPerfDeviceConcurrency::close() {
}
void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
static int deviceId;
uint * hPtr[numGpus];
uint * dPtr[numGpus];
hipStream_t streams[numGpus];
int numCUs[numGpus];
unsigned int maxIter[numGpus];
unsigned long long expectedIters[numGpus];
int threads, threads_per_block, blocks;
float xStep, yStep, xPos, yPos;
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
hipDeviceProp_t props = {0};
HIPCHECK(hipGetDeviceProperties(&props, i));
if (testCase != 0) {
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device ID: "
<< i << std::endl;
}
numCUs[i] = props.multiProcessorCount;
int clkFrequency = 0;
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i));
clkFrequency =(unsigned int)clkFrequency/1000;
// Maximum iteration count
// maxIter = 8388608 * (engine_clock / 1000).serial execution
maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128);
maxIter[i] = (maxIter[i] + 15) & ~15;
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * width_ * sizeof(uint);
// Create streams for concurrency
HIPCHECK(hipStreamCreate(&streams[i]));
// Allocate memory on the host and device
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
// Prepare kernel launch parameters
threads = (bufSize/sizeof(uint));
threads_per_block = 64;
blocks = (threads/threads_per_block) + (threads % threads_per_block);
coordIdx = testCase % numCoords;
xStep = (float)(coords[coordIdx].width / (double)width_);
yStep = (float)(-coords[coordIdx].width / (double)width_);
xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Copy memory from host to device
HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice));
}
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i],
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]);
}
for(int i = 0; i < numGpus; i++) {
HIPCHECK(hipStreamSynchronize(0));
}
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
for(int i = 0; i < numGpus; i++) {
if(testCase != 0) {
deviceId = i;
}
HIPCHECK(hipSetDevice(deviceId));
// Copy data back from device to the host
HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost));
checkData(hPtr[i]);
expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i];
if (testCase != 0) {
checkData(hPtr[i]);
if(totalIters != expectedIters[i]) {
std::cout << "Incorrect iteration count detected" << std::endl;
}
}
HIPCHECK(hipStreamDestroy(streams[i]));
// Free host and device memory
HIPCHECK(hipHostFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
if (testCase != 0) {
std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): "
<< all_kernel_time.count() << " (s) " << '\n' << std::endl;
}
if(testCase == 0) {
deviceId++;
}
}
void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
for (unsigned int i = 0; i < width_ * width_ ; i++) {
ptr2[i] = value;
}
}
void hipPerfDeviceConcurrency::checkData(uint *ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_ * width_; i++) {
totalIters += ptr[i];
}
}
int main(int argc, char* argv[]) {
hipPerfDeviceConcurrency deviceConcurrency;
deviceConcurrency.open();
int nGpu = deviceConcurrency.getNumGpus();
// testCase = 0 refers to warmup kernel run
int testCase = 0;
for (int i = 0; i < nGpu; i++) {
// Warm-up kernel on all devices
deviceConcurrency.run(testCase, 1);
}
// Time for kernel on 1 device
deviceConcurrency.run(++testCase, 1);
// Time for kernel on all available devices
deviceConcurrency.run(++testCase, nGpu);
passed();
}
-432
Просмотреть файл
@@ -1,432 +0,0 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
#include <hip/hip_vector_types.h>
#ifdef __HIP_PLATFORM_NVIDIA__
inline __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __device__ float4 operator*(float4 a, float4 b)
{
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
inline __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
#endif
typedef struct {
double x;
double y;
double width;
} coordRec;
static coordRec coords[] = {
{0.0, 0.0, 0.00001}, // All black
};
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % (width/4);
int j = tid / (width/4);
int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
int4 vecj = make_int4(j, j, j, j);
float4 x0;
x0.x = (float)(xPos + xStep*veci.x);
x0.y = (float)(xPos + xStep*veci.y);
x0.z = (float)(xPos + xStep*veci.z);
x0.w = (float)(xPos + xStep*veci.w);
float4 y0;
y0.x = (float)(yPos + yStep*vecj.x);
y0.y = (float)(yPos + yStep*vecj.y);
y0.z = (float)(yPos + yStep*vecj.z);
y0.w = (float)(yPos + yStep*vecj.w);
float4 x = x0;
float4 y = y0;
uint iter = 0;
float4 tmp;
int4 stay;
int4 ccount = make_int4(0, 0, 0, 0);
float4 savx = x;
float4 savy = y;
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
iter+=16) {
x = savx;
y = savy;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
savx.x = (bool)(stay.x ? x.x : savx.x);
savx.y = (bool)(stay.y ? x.y : savx.y);
savx.z = (bool)(stay.z ? x.z : savx.z);
savx.w = (bool)(stay.w ? x.w : savx.w);
savy.x = (bool)(stay.x ? y.x : savy.x);
savy.y = (bool)(stay.y ? y.y : savy.y);
savy.z = (bool)(stay.z ? y.z : savy.z);
savy.w = (bool)(stay.w ? y.w : savy.w);
ccount.x -= stay.x*16;
ccount.y -= stay.y*16;
ccount.z -= stay.z*16;
ccount.w -= stay.w*16;
}
// Handle remainder
if (!(stay.x & stay.y & stay.z & stay.w))
{
iter = 16;
do
{
x = savx;
y = savy;
stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter);
stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter);
stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter);
stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter);
tmp = x;
x = x*x + x0 - y*y;
y = 2.0f*tmp*y + y0;
ccount.x += stay.x;
ccount.y += stay.y;
ccount.z += stay.z;
ccount.w += stay.w;
iter--;
savx.x = (stay.x ? x.x : savx.x);
savx.y = (stay.y ? x.y : savx.y);
savx.z = (stay.z ? x.z : savx.z);
savx.w = (stay.w ? x.w : savx.w);
savy.x = (stay.x ? y.x : savy.x);
savy.y = (stay.y ? y.y : savy.y);
savy.z = (stay.z ? y.z : savy.z);
savy.w = (stay.w ? y.w : savy.w);
} while ((stay.x | stay.y | stay.z | stay.w) && iter);
}
uint4 *vecOut = (uint4 *)out;
vecOut[tid].x = (uint)(ccount.x);
vecOut[tid].y = (uint)(ccount.y);
vecOut[tid].z = (uint)(ccount.z);
vecOut[tid].w = (uint)(ccount.w);
}
class hipPerfStreamConcurrency {
public:
hipPerfStreamConcurrency();
~hipPerfStreamConcurrency();
void setNumKernels(unsigned int num) {
numKernels = num;
}
void setNumStreams(unsigned int num) {
numStreams = num;
}
unsigned int getNumStreams() {
return numStreams;
}
unsigned int getNumKernels() {
return numKernels;
}
void open(int deviceID);
void run(unsigned int testCase, unsigned int deviceId);
void close(void);
private:
void setData(void *ptr, unsigned int value);
void checkData(uint *ptr);
unsigned int numKernels;
unsigned int numStreams;
unsigned int width_;
unsigned int bufSize;
unsigned int maxIter;
unsigned int coordIdx;
unsigned long long totalIters;
int numCUs;
};
hipPerfStreamConcurrency::hipPerfStreamConcurrency() {}
hipPerfStreamConcurrency::~hipPerfStreamConcurrency() {}
void hipPerfStreamConcurrency::open(int deviceId) {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
if (nGpu < 1) {
failed("No GPU!");
}
HIPCHECK(hipSetDevice(deviceId));
hipDeviceProp_t props = {0};
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl;
numCUs = props.multiProcessorCount;
}
void hipPerfStreamConcurrency::close() {
}
void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) {
int clkFrequency = 0;
unsigned int numStreams = getNumStreams();
unsigned int numKernels = getNumKernels();
HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId));
clkFrequency =(unsigned int)clkFrequency/1000;
// Maximum iteration count
// maxIter = 8388608 * (engine_clock / 1000).serial execution
maxIter = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128);
maxIter = (maxIter + 15) & ~15;
hipStream_t streams[numStreams];
uint * hPtr[numKernels];
uint * dPtr[numKernels];
// Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once.
width_ = 256;
bufSize = width_ * sizeof(uint);
// Create streams for concurrency
for (uint i = 0; i < numStreams; i++) {
HIPCHECK(hipStreamCreate(&streams[i]));
}
// Allocate memory on the host and device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault));
setData(hPtr[i], 0xdeadbeef);
HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize))
}
// Prepare kernel launch parameters
int threads = (bufSize/sizeof(uint));
int threads_per_block = 64;
int blocks = (threads/threads_per_block) + (threads % threads_per_block);
coordIdx = testCase % numCoords;
float xStep = (float)(coords[coordIdx].width / (double)width_);
float yStep = (float)(-coords[coordIdx].width / (double)width_);
float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width);
float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width);
// Copy memory asynchronously and concurrently from host to device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams]));
}
// Synchronize to make sure all the copies are completed
for(uint i = 0; i < numStreams; i++) {
HIPCHECK(hipStreamSynchronize(streams[i]));
}
// Warm-up kernel with lower iteration
if (testCase == 0) {
maxIter = 256;
}
// Time the kernel execution
auto all_start = std::chrono::steady_clock::now();
for (uint i = 0; i < numKernels; i++) {
hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i%numStreams],
dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter);
}
// Synchronize all the concurrent streans to have completed execution
for(uint i = 0; i < numStreams; i++) {
HIPCHECK(hipStreamSynchronize(streams[i]));
}
auto all_end = std::chrono::steady_clock::now();
std::chrono::duration<double> all_kernel_time = all_end - all_start;
// Copy data back from device to the host
for(uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize, streams[i % numStreams]));
}
if (testCase != 0) {
std::cout <<"Measured time for " << numKernels <<" kernels (s) on " << numStreams <<" stream (s): "
<< all_kernel_time.count() << std::endl;
}
unsigned long long expected =
(unsigned long long)width_ * (unsigned long long)maxIter;
for(uint i = 0 ; i < numStreams; i++) {
HIPCHECK(hipStreamDestroy(streams[i]));
}
// Free host and device memory
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipHostFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
}
void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) {
unsigned int *ptr2 = (unsigned int *)ptr;
for (unsigned int i = 0; i < width_ ; i++) {
ptr2[i] = value;
}
}
void hipPerfStreamConcurrency::checkData(uint *ptr) {
totalIters = 0;
for (unsigned int i = 0; i < width_; i++) {
totalIters += ptr[i];
}
}
int main(int argc, char* argv[]) {
hipPerfStreamConcurrency streamConcurrency;
int deviceId = 0;
streamConcurrency.open(deviceId);
for (unsigned int testCase = 0; testCase < 5; testCase++) {
switch (testCase) {
case 0:
// Warm-up kernel
streamConcurrency.setNumStreams(1);
streamConcurrency.setNumKernels(1);
break;
case 1:
// default stream executes serially
streamConcurrency.setNumStreams(1);
streamConcurrency.setNumKernels(1);
break;
case 2:
// 2-way concurrency
streamConcurrency.setNumStreams(2);
streamConcurrency.setNumKernels(2);
break;
case 3:
// 4-way concurrency
streamConcurrency.setNumStreams(4);
streamConcurrency.setNumKernels(4);
break;
case 4:
streamConcurrency.setNumStreams(2);
streamConcurrency.setNumKernels(4);
break;
case 5:
break;
default:
break;
}
streamConcurrency.run(testCase, deviceId);
}
passed();
}
-131
Просмотреть файл
@@ -1,131 +0,0 @@
/*
Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
#include <iostream>
#include <chrono>
#include "test_common.h"
using namespace std;
#define BufSize 0x1000
#define Iterations 0x100
#define TotalStreams 4
#define TotalBufs 4
class hipPerfStreamCreateCopyDestroy {
private:
unsigned int numBuffers_;
unsigned int numStreams_;
const size_t totalStreams_[TotalStreams];
const size_t totalBuffers_[TotalBufs];
public:
hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0),
totalStreams_{1, 2, 4, 8},
totalBuffers_{1, 100, 1000, 5000} {};
~hipPerfStreamCreateCopyDestroy() {};
void open(int deviceID);
void run(unsigned int testNumber);
};
void hipPerfStreamCreateCopyDestroy::open(int deviceId) {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
if (nGpu < 1) {
failed("No GPU!");
}
HIPCHECK(hipSetDevice(deviceId));
hipDeviceProp_t props = {0};
HIPCHECK(hipGetDeviceProperties(&props, deviceId));
std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name
<< " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl;
}
void hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) {
numStreams_ = totalStreams_[testNumber % TotalStreams];
size_t iter = Iterations / (numStreams_ * ((size_t)1 << (testNumber / TotalBufs + 1)));
hipStream_t streams[numStreams_];
numBuffers_ = totalBuffers_[testNumber / TotalBufs];
float* dSrc[numBuffers_];
size_t nBytes = BufSize * sizeof(float);
for (size_t b = 0; b < numBuffers_; ++b) {
HIPCHECK(hipMalloc(&dSrc[b], nBytes));
}
float* hSrc;
hSrc = new float[nBytes];
HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess);
for (size_t i = 0; i < BufSize; i++) {
hSrc[i] = 1.618f + i;
}
auto start = std::chrono::steady_clock::now();
for (size_t i = 0; i < iter; ++i) {
for (size_t s = 0; s < numStreams_; ++s) {
HIPCHECK(hipStreamCreate(&streams[s]));
}
for (size_t s = 0; s < numStreams_; ++s) {
for (size_t b = 0; b < numBuffers_; ++b) {
HIPCHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s]));
}
}
for (size_t s = 0; s < numStreams_; ++s) {
HIPCHECK(hipStreamDestroy(streams[s]));
}
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end - start;
auto time = static_cast<float>(diff.count() * 1000 / (iter * numStreams_));
cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and "
<< setw(4) << numBuffers_ << " buffers " << " and " << setw(4)
<< iter << " iterations " << time << " (ms) " << endl;
delete [] hSrc;
for (size_t b = 0; b < numBuffers_; ++b) {
HIPCHECK(hipFree(dSrc[b]));
}
}
int main(int argc, char* argv[]) {
hipPerfStreamCreateCopyDestroy streamCCD;
int deviceId = 0;
streamCCD.open(deviceId);
for (auto testCase = 0; testCase < TotalStreams * TotalBufs; testCase++) {
streamCCD.run(testCase);
}
passed();
}